total 1803784
-rw-r--r-- 1 hqin simctr 145450513 Nov 9 13:06 gin_intermediate20171101.csv
-rw------- 1 hqin simctr 136896635 Nov 14 15:03 gin_intermediate_uniquePairs20171113.csv
-rw------- 1 hqin simctr 136896635 Nov 15 09:43 gin_intermediate_uniquePairs20171115.csv
-rw-r--r-- 1 hqin simctr 415614742 Nov 9 13:06 gin_lenient20171101.csv
-rw------- 1 hqin simctr 398220211 Nov 14 15:03 gin_lenient_uniquePairs20171113.csv
-rw------- 1 hqin simctr 398220211 Nov 15 09:43 gin_lenient_uniquePairs20171115.csv
-rw-r--r-- 1 hqin simctr 65823951 Nov 9 13:06 gin_stringent20171101.csv
-rw------- 1 hqin simctr 61189366 Nov 14 15:03 gin_stringent_uniquePairs20171113.csv
-rw------- 1 hqin simctr 61189366 Nov 15 09:44 gin_stringent_uniquePairs20171115.csv
drwx------ 2 hqin simctr 8192 Nov 13 10:01 trash
code: cellmap_explore20171101ridgeside.Rmd
---
title: "Genetic interactions CellMap exploration"
author: "H Qin"
date: "11/1 - 11/9 /2017 "
output: html_document
---
See http://hongqinlab.blogspot.com/2017/06/yeast-genetic-map.html
CELLMAP provids genetics interaction in pairwise format.
The interactions are provided for strain_ids (alleles).
The mapping of strain_id and ORF is in "strain_ids_and_single_mutant_fitness.csv" .
```{r}
rm(list=ls());
set.seed(2017);
datapath = "~/github/cellmap_parent/CellMap/20170626/S1.pairwise/"; #ridgeside
#datapath = "~/data/Sce/CellMap/20170626/S1.pairwise/"; #applejack
debug = 0;
list.files(path=datapath);
```
Load naming lookup tables
```{r}
dic = read.csv(paste(datapath, "strain_ids_and_single_mutant_fitness.csv", sep=''))
```
Load essential and non-essential infor that H. Qin generated.
```{r}
list.files(path="data");
fit = read.csv("data/SummaryRegressionHetHomFactorized2015Oct13.csv")
```
Load pairwise interaction data
```{r}
#Essential X Essential
tb.ee = read.table(paste(datapath,"SGA_ExE.txt", sep=''), header=T, sep="\t");
summary(tb.ee);
```
```{r}
#EXN and NXE
tb.en = read.table(paste(datapath,"SGA_ExN_NxE.txt", sep=''), header=T, sep="\t");
summary(tb.en);
```
```{r}
#NXN I do not need consider NxN for my aging modeling project
tb.nn = read.table(paste(datapath,"SGA_NxN.txt", sep=''), header=T, sep="\t");
summary(tb.nn);
```
Columns names in the 3 tables are the same.
```{r}
rbind( names(tb.en), names(tb.ee), names(tb.nn))
```
Merge 3 tables into 1 table.
```{r}
tb.gin = rbind(tb.ee, tb.en, tb.nn);
```
Double-check the merged results
```{r}
length(tb.gin[,1]) == sum(length(tb.ee[,1]), length(tb.en[,1]), length(tb.nn[,1]))
```
Remove unused table to free up memory
```{r}
if (debug==0){
ls()
rm(tb.en, tb.ee, tb.nn)
ls()
}
```
Costanzo2016 suggested lenient, intermediate, and stringennt ways for gin quality check.
```{r}
tb.gin.lenient = tb.gin[ tb.gin$P.value<=0.05, ];
if (debug ==0) { rm(tb.gin); } #freeup memory
```
Map strain IDs to ORFs, add my essentialFalgs
```{r}
tb.gin.lenient$ORF1 = as.character( dic$Systematic.gene.name[match( tb.gin.lenient$Query.Strain.ID, dic$Strain.ID)] )
tb.gin.lenient$ORF2 = as.character( dic$Systematic.gene.name[match( tb.gin.lenient$Array.Strain.ID, dic$Strain.ID)] )
tb.gin.lenient$essenflag1 = fit$essenflag[ match(tb.gin.lenient$ORF1, fit$orf)]
tb.gin.lenient$essenflag2 = fit$essenflag[ match(tb.gin.lenient$ORF2, fit$orf)]
head(tb.gin.lenient)
```
I need to take only unqiue interactions. This however raise questions how to store score andn p-values.
I will store the unique key_pairs first.
```{r}
# This chunk of codes only runs for subset of tb.gin.lenient, but not
#pairs = tb.gin.lenient[ c(1:2, 100, 1001, 5000, 9000), c("ORF1", "ORF2")]; #debug
#pairs = tb.gin.lenient[ 1:length(tb.gin.lenient[,1]), c("ORF1", "ORF2")];#DOES NOT RUN correctly
#pairs = tb.gin.lenient[ , c("ORF1", "ORF2")]; #DOES NOT RUN correctly
#pairs = tb.gin.lenient[ 1000:1050, c("ORF1", "ORF2")];
#pairs = data.frame(pairs)
#ordered_pairs = t(apply(pairs, 1, sort));
#ordered_pairs = data.frame(ordered_pairs);
#names(ordered_pairs) = c('id1', 'id2');
#ordered_pairs$id1 = as.character( ordered_pairs$id1)
#ordered_pairs$id2 = as.character( ordered_pairs$id2)
#tb.gin.lenient$ordered_pairs = paste( ordered_pairs$id1, ordered_pairs$id2, sep="_")[5:10]
```
```{r}
# i=199990
#i=5347
#i=5348
#for (i in 5347:6999) {
total = length(tb.gin.lenient[,1]);
for (i in 1:total) {
if( (round(i/1000)*1000 - i)==0 ) { print(paste(i, "::", total - i) ); }
pairs = tb.gin.lenient[i , c("ORF1", "ORF2")];
if ( ! is.na(pairs[1]) ) {
ordered_pairs = sort(pairs)
tb.gin.lenient$ordered_pairs[i] = paste( ordered_pairs[1], ordered_pairs[2], sep="_");
} else {
tb.gin.lenient$ordered_pairs[i] = "NA_found";
}
}
```
How many unique interactions (gene pairs) ?
1394733 in unique_pairs, which is 88.86% of the tb.gin.lenient rows
```{r}
unique_pairs = unique( tb.gin.lenient$ordered_pairs )
length(unique_pairs) / length(tb.gin.lenient[,1])
```
Using author's definition to find intermediate and stringent subsets.
```{r}
tb.gin.intermediate = tb.gin.lenient[ abs(tb.gin.lenient$Genetic.interaction.score..ε.) >0.08, ];
tb.gin.stringent = tb.gin.lenient[ tb.gin.lenient$Genetic.interaction.score..ε.>0.16 | tb.gin.lenient$Genetic.interaction.score..ε.< -0.12, ]
```
```{r}
hist(tb.gin.lenient$Genetic.interaction.score..ε., breaks = 100)
```
```{r}
summary(tb.gin.lenient);
summary(tb.gin.intermediate);
summary(tb.gin.stringent)
```
output
```{r}
write.csv(tb.gin.lenient, "../cellmap_data/gin_lenient20171101.csv", row.names = F);
write.csv(tb.gin.intermediate, "../cellmap_data/gin_intermediate20171101.csv", row.names = F);
write.csv(tb.gin.stringent, "../cellmap_data/gin_stringent20171101.csv", row.names = F);
```
Generate "lenient" unique GINx using average values
```{r lenient GIN}
#Generate unique GINs using match(). This does not average the genetic interaction score
tb.gin.lenient.uniqueByMatch = tb.gin.lenient[match(unique_pairs, tb.gin.lenient$ordered_pairs), ]
tb.gin.lenient.uniqueByMatch = tb.gin.lenient.uniqueByMatch[ !is.na(tb.gin.lenient.uniqueByMatch$ordered_pairs), ]
tb.gin.lenient.uniqueByMatch = tb.gin.lenient.uniqueByMatch[ !is.na(tb.gin.lenient.uniqueByMatch$Query.Strain.ID), ]
tb.gin.lenient.uniqueByAverage = tb.gin.lenient.uniqueByMatch;
tb.gin.lenient.uniqueByAverage$buffer = "NA"
total = length(tb.gin.lenient.uniqueByAverage[,1]); #error 20171113
for (i in 1:total ) {
if( (round(i/100)*100 - i)==0 ) { print(paste(i, ":leniet:", total - i) ); }
tmp = tb.gin.lenient[ tb.gin.lenient$ordered_pairs == unique_pairs[i], ]
tmp = tmp[ !is.na(tmp$Query.Strain.ID), ]
if( length(tmp[,1]) > 1) {#more than 1 records, take average
tb.gin.lenient.uniqueByAverage$buffer[i] = paste( tmp$Query.Strain.ID, tmp$Array.Strain.ID, sep=" ", collapse="::");
tb.gin.lenient.uniqueByAverage$Genetic.interaction.score..ε.[i] = mean(tmp$Genetic.interaction.score..ε.)
tb.gin.lenient.uniqueByAverage$P.value[i] = mean(tmp$P.value)
}
#else do nothing
}
```
Generate "intermediate" unique GINx using average values
```{r intermediate GIN}
tb.gin.intermediate.uniqueByMatch = tb.gin.intermediate[match(unique_pairs, tb.gin.intermediate$ordered_pairs), ]
tb.gin.intermediate.uniqueByMatch = tb.gin.intermediate.uniqueByMatch[ !is.na(tb.gin.intermediate.uniqueByMatch$ordered_pairs), ]
tb.gin.intermediate.uniqueByMatch = tb.gin.intermediate.uniqueByMatch[ !is.na(tb.gin.intermediate.uniqueByMatch$Query.Strain.ID), ]
tb.gin.intermediate.uniqueByAverage = tb.gin.intermediate.uniqueByMatch;
tb.gin.intermediate.uniqueByAverage$buffer = "NA"
totalIntermediate = length(tb.gin.intermediate.uniqueByMatch[,1]); #20171113 correction
for (i in 1:totalIntermediate ) {
if( (round(i/100)*100 - i)==0 ) { print(paste(i, ":intermediate:", totalIntermediate - i) ); }
tmp = tb.gin.intermediate[ tb.gin.intermediate$ordered_pairs == unique_pairs[i], ]
tmp = tmp[ !is.na(tmp$Query.Strain.ID), ]
if( length(tmp[,1]) > 1) {#more than 1 records, take average
tb.gin.intermediate.uniqueByAverage$buffer[i] = paste( tmp$Query.Strain.ID, tmp$Array.Strain.ID, sep=" ", collapse="::");
tb.gin.intermediate.uniqueByAverage$Genetic.interaction.score..ε.[i] = mean(tmp$Genetic.interaction.score..ε.)
tb.gin.intermediate.uniqueByAverage$P.value[i] = mean(tmp$P.value)
}
#else do nothing
}
```
Generate "stringent" unique GINx using average values
```{r stringent GIN}
tb.gin.stringent.uniqueByMatch = tb.gin.stringent[match(unique_pairs, tb.gin.stringent$ordered_pairs), ]
tb.gin.stringent.uniqueByMatch = tb.gin.stringent.uniqueByMatch[ !is.na(tb.gin.stringent.uniqueByMatch$ordered_pairs), ]
tb.gin.stringent.uniqueByMatch = tb.gin.stringent.uniqueByMatch[ !is.na(tb.gin.stringent.uniqueByMatch$Query.Strain.ID), ]
tb.gin.stringent.uniqueByAverage = tb.gin.stringent.uniqueByMatch;
tb.gin.stringent.uniqueByAverage$buffer = "NA"
totalstringent = length(tb.gin.stringent.uniqueByMatch[,1]);
for (i in 1:totalstringent) {
if( (round(i/100)*100 - i)==0 ) { print(paste(i, ":stringent:", totalstringent - i) ); }
tmp = tb.gin.stringent[ tb.gin.stringent$ordered_pairs == unique_pairs[i], ]
tmp = tmp[ !is.na(tmp$Query.Strain.ID), ]
if( length(tmp[,1]) > 1) {#more than 1 records, take average
tb.gin.stringent.uniqueByAverage$buffer[i] = paste( tmp$Query.Strain.ID, tmp$Array.Strain.ID, sep=" ", collapse="::");
tb.gin.stringent.uniqueByAverage$Genetic.interaction.score..ε.[i] = mean(tmp$Genetic.interaction.score..ε.)
tb.gin.stringent.uniqueByAverage$P.value[i] = mean(tmp$P.value)
}
#else do nothing
}
```
output
```{r output}
write.csv(tb.gin.lenient.uniqueByAverage, "../cellmap_data/gin_lenient_uniquePairs20171115.csv", row.names = F);
write.csv(tb.gin.intermediate.uniqueByAverage, "../cellmap_data/gin_intermediate_uniquePairs20171115.csv", row.names = F);
write.csv(tb.gin.stringent.uniqueByAverage, "../cellmap_data/gin_stringent_uniquePairs20171115.csv", row.names = F);
```
No comments:
Post a Comment