Tuesday, October 24, 2017

CellMap analysis


Q: Why do I have 'abnormal', 'essential', 'nonessential' genes? 
CellMap provides genetic interactions in pairwise format. The interactions are reported for strain IDs (alleles). The mapping between strain ID and ORF is in “strain_ids_and_single_mutant_fitness.csv”.
# Start from an empty workspace and fix the RNG seed.
rm(list = ls())
set.seed(2017)

# Directory holding the CellMap pairwise files; debug == 0 lets later steps
# drop large intermediate tables.
datapath <- "~/data/Sce/CellMap/20170626/S1.pairwise/"
debug <- 0

# Show which input files are available.
list.files(path = datapath)
## [1] "SGA_DAmP.txt"                             
## [2] "SGA_ExE.txt"                              
## [3] "SGA_ExN_NxE.txt"                          
## [4] "SGA_NxN.txt"                              
## [5] "strain_ids_and_single_mutant_fitness.csv" 
## [6] "strain_ids_and_single_mutant_fitness.xlsx"
Load naming lookup tables
# Strain-ID lookup table shipped with the CellMap pairwise data.
dic <- read.csv(
  paste0(datapath, "strain_ids_and_single_mutant_fitness.csv")
)
Load the essential and non-essential gene information that H. Qin generated.
# List the files in the local "data" directory.
list.files("data")
## [1] "SummaryRegressionHetHom2015Oct12.csv"          
## [2] "SummaryRegressionHetHomFactorized2015Oct13.csv"
# Table that supplies the `orf` and `essenflag` columns used further down.
fit <- read.csv(file = "data/SummaryRegressionHetHomFactorized2015Oct13.csv")
Load pairwise interaction data
# Essential x Essential (ExE) interactions
tb.ee <- read.table(
  file = paste0(datapath, "SGA_ExE.txt"),
  header = TRUE,
  sep = "\t"
)
summary(tb.ee)
##         Query.Strain.ID       Query.allele.name        Array.Strain.ID  
##  YNL308C_tsq2680:   792   mob2-11-supp1:  1419   YCR002C_tsa78 :  1090  
##  YEL019C_tsq533 :   788   kri1-5001    :   792   YCR002C_tsa79 :  1090  
##  YNL287W_tsq38  :   788   mms21-1      :   788   YAL041W_tsa410:  1088  
##  YCL059C_tsq1104:   785   sec21-1      :   788   YAR007C_tsa273:  1088  
##  YCL059C_tsq326 :   785   cdc10-1      :   785   YAL038W_tsa34 :  1087  
##  YCR002C_tsq1072:   785   cdc10-2      :   785   YAL041W_tsa412:  1086  
##  (Other)        :813847   (Other)      :813213   (Other)       :812041  
##  Array.allele.name Arraytype.Temp Genetic.interaction.score..ε.
##  cdc10-1:  1090    TSA26:815635   Min.   :-0.790500            
##  cdc10-2:  1090    TSA30:  2935   1st Qu.:-0.034800            
##  cdc24-2:  1088                   Median : 0.005500            
##  rfa1-m2:  1088                   Mean   :-0.008011            
##  cdc19-1:  1087                   3rd Qu.: 0.035700            
##  cdc24-3:  1086                   Max.   : 0.440300            
##  (Other):812041                                                
##     P.value        Query.single.mutant.fitness..SMF.   Array.SMF     
##  Min.   :0.00000   Min.   :0.13                      Min.   :0.1137  
##  1st Qu.:0.03872   1st Qu.:0.72                      1st Qu.:0.7570  
##  Median :0.19940   Median :0.85                      Median :0.8581  
##  Mean   :0.20933   Mean   :0.82                      Mean   :0.8326  
##  3rd Qu.:0.35770   3rd Qu.:0.93                      3rd Qu.:0.9359  
##  Max.   :1.00000   Max.   :1.14                      Max.   :1.0550  
##                    NA's   :109963                                    
##  Double.mutant.fitness Double.mutant.fitness.standard.deviation
##  Min.   :-0.1435       Min.   :0.00000                         
##  1st Qu.: 0.5666       1st Qu.:0.02290                         
##  Median : 0.7123       Median :0.03850                         
##  Mean   : 0.6912       Mean   :0.04786                         
##  3rd Qu.: 0.8370       3rd Qu.:0.06120                         
##  Max.   : 1.3817       Max.   :1.24310                         
## 
# Essential x Nonessential and Nonessential x Essential (ExN / NxE)
tb.en <- read.table(
  file = paste0(datapath, "SGA_ExN_NxE.txt"),
  header = TRUE,
  sep = "\t"
)
summary(tb.en)
##         Query.Strain.ID     Query.allele.name          Array.Strain.ID   
##  YJL143W_tsq3031:   3657   prp21-ts  :   3657   YAR007C_tsa273 :   2212  
##  YJL203W_tsq401 :   3657   tim17-5001:   3657   YFR036W_tsa88  :   2212  
##  YIL021W_tsq1136:   3623   med6-ts   :   3646   YAL025C_tsa1066:   2211  
##  YJL072C_tsq2842:   3623   psf2-5001 :   3623   YAL038W_tsa34  :   2210  
##  YIL021W_tsq2727:   3615   rpb3-2    :   3623   YFL008W_tsa68  :   2210  
##  YJR064W_tsq2853:   3585   rpb3-5001 :   3615   YFL009W_tsa334 :   2210  
##  (Other)        :3687193   (Other)   :3687132   (Other)        :3695688  
##   Array.allele.name   Arraytype.Temp  Genetic.interaction.score..ε.
##  cdc26-1   :   2212   DMA26:1212145   Min.   :-1.09550             
##  rfa1-m2   :   2212   DMA30: 798532   1st Qu.:-0.02740             
##  mak16-5001:   2211   TSA26:1677715   Median :-0.00060             
##  cdc19-1   :   2210   TSA30:  20561   Mean   :-0.00528             
##  cdc4-3    :   2210                   3rd Qu.: 0.02390             
##  rsc8-ts16 :   2210                   Max.   : 1.33530             
##  (Other)   :3695688                                                
##     P.value       Query.single.mutant.fitness..SMF.   Array.SMF     
##  Min.   :0.0000   Min.   :0.1                       Min.   :0.1137  
##  1st Qu.:0.1155   1st Qu.:0.7                       1st Qu.:0.8393  
##  Median :0.2746   Median :0.9                       Median :0.9577  
##  Mean   :0.2554   Mean   :0.8                       Mean   :0.9072  
##  3rd Qu.:0.3950   3rd Qu.:1.0                       3rd Qu.:1.0065  
##  Max.   :1.0000   Max.   :1.1                       Max.   :1.1118  
##                   NA's   :403475                                    
##  Double.mutant.fitness Double.mutant.fitness.standard.deviation
##  Min.   :-0.1764       Min.   :0.00000                         
##  1st Qu.: 0.6532       1st Qu.:0.02160                         
##  Median : 0.8001       Median :0.03650                         
##  Mean   : 0.7680       Mean   :0.04677                         
##  3rd Qu.: 0.9126       3rd Qu.:0.05910                         
##  Max.   : 2.1866       Max.   :1.19310                         
## 
# Nonessential x Nonessential (NxN). I do not need to consider NxN for my
# aging modeling project, but it is loaded so the merged table is complete.
#
# quote = "" turns off quote processing. With read.table's default quote set,
# a quote character inside a field is taken as the start of a quoted string,
# which produces "EOF within quoted string" and silently drops every row
# swallowed by that string.
tb.nn <- read.table(
  file = paste0(datapath, "SGA_NxN.txt"),
  header = TRUE,
  sep = "\t",
  quote = ""
)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : number of items read is not a multiple of the number of columns
# Per-column overview of the NxN table.
print(summary(tb.nn))
##        Query.Strain.ID    Query.allele.name        Array.Strain.ID   
##  YJL219W_sn2362:   3708   hxt9   :   3708   YIL173W_dma2391:   1712  
##  YIR017C_sn2001:   3696   met28  :   3696   YIL165C_dma2396:   1710  
##  YJL154C_sn371 :   3674   vps35  :   3674   YIL170W_dma2392:   1706  
##  YJL141C_sn1525:   3673   yak1   :   3673   YIL141W_dma2367:   1700  
##  YJL056C_sn2004:   3667   zap1   :   3667   YIL145C_dma2366:   1698  
##  YJR117W_sn882 :   3666   ste24  :   3666   YIL140W_dma2368:   1694  
##  (Other)       :6261409   (Other):6261409   (Other)        :6273273  
##  Array.allele.name Arraytype.Temp  Genetic.interaction.score..ε.
##  vth1   :   1712        :      1   Min.   :-1.16160             
##  yil165c:   1710   DMA26: 203777   1st Qu.:-0.02200             
##  hxt12  :   1706   DMA30:6079715   Median :-0.00220             
##  yil141w:   1700                   Mean   :-0.00389             
##  pan6   :   1698                   3rd Qu.: 0.01780             
##  axl2   :   1694                   Max.   : 0.86760             
##  (Other):6273273                   NA's   :1                    
##     P.value       Query.single.mutant.fitness..SMF.   Array.SMF     
##  Min.   :0.0000   Min.   :0.2                       Min.   :0.2265  
##  1st Qu.:0.1633   1st Qu.:0.9                       1st Qu.:0.9667  
##  Median :0.3083   Median :1.0                       Median :1.0010  
##  Mean   :0.2809   Mean   :0.9                       Mean   :0.9712  
##  3rd Qu.:0.4113   3rd Qu.:1.0                       3rd Qu.:1.0195  
##  Max.   :1.0000   Max.   :1.1                       Max.   :1.1118  
##  NA's   :1        NA's   :479989                    NA's   :1       
##  Double.mutant.fitness Double.mutant.fitness.standard.deviation
##  Min.   :-0.2577       Min.   :0.00000                         
##  1st Qu.: 0.8521       1st Qu.:0.01900                         
##  Median : 0.9695       Median :0.03170                         
##  Mean   : 0.9069       Mean   :0.04062                         
##  3rd Qu.: 1.0199       3rd Qu.:0.05050                         
##  Max.   : 1.7606       Max.   :1.00090                         
##  NA's   :1             NA's   :1
Column names in the 3 tables are the same.
# Stack the column names of the three tables row by row for visual comparison.
do.call(rbind, lapply(list(tb.en, tb.ee, tb.nn), names))
##      [,1]              [,2]                [,3]             
## [1,] "Query.Strain.ID" "Query.allele.name" "Array.Strain.ID"
## [2,] "Query.Strain.ID" "Query.allele.name" "Array.Strain.ID"
## [3,] "Query.Strain.ID" "Query.allele.name" "Array.Strain.ID"
##      [,4]                [,5]             [,6]                           
## [1,] "Array.allele.name" "Arraytype.Temp" "Genetic.interaction.score..ε."
## [2,] "Array.allele.name" "Arraytype.Temp" "Genetic.interaction.score..ε."
## [3,] "Array.allele.name" "Arraytype.Temp" "Genetic.interaction.score..ε."
##      [,7]      [,8]                                [,9]       
## [1,] "P.value" "Query.single.mutant.fitness..SMF." "Array.SMF"
## [2,] "P.value" "Query.single.mutant.fitness..SMF." "Array.SMF"
## [3,] "P.value" "Query.single.mutant.fitness..SMF." "Array.SMF"
##      [,10]                   [,11]                                     
## [1,] "Double.mutant.fitness" "Double.mutant.fitness.standard.deviation"
## [2,] "Double.mutant.fitness" "Double.mutant.fitness.standard.deviation"
## [3,] "Double.mutant.fitness" "Double.mutant.fitness.standard.deviation"
Merge 3 tables into 1 table.
# Row-bind ExE, ExN/NxE and NxN into one genetic-interaction table.
tb.gin <- rbind(tb.ee, tb.en, tb.nn)
Double-check the merged results
# The merged table must have exactly as many rows as its three parts combined.
nrow(tb.gin) == nrow(tb.ee) + nrow(tb.en) + nrow(tb.nn)
## [1] TRUE
Remove unused table to free up memory
# Objects currently in the workspace.
print(ls())
## [1] "datapath" "debug"    "dic"      "fit"      "tb.ee"    "tb.en"   
## [7] "tb.gin"   "tb.nn"
# The three component tables are now redundant with tb.gin; drop them and
# show what is left.
rm(list = c("tb.en", "tb.ee", "tb.nn"))
print(ls())
## [1] "datapath" "debug"    "dic"      "fit"      "tb.gin"
Costanzo et al. (2016) suggested lenient, intermediate, and stringent cutoffs for genetic-interaction (gin) quality control.
# Lenient cutoff: keep interactions with P.value <= 0.05.
# which() converts the logical mask to row indices and drops NA comparisons.
# Indexing a data frame with a logical vector that contains NA would instead
# insert an all-NA row for every missing P.value.
tb.gin.lenient <- tb.gin[which(tb.gin$P.value <= 0.05), ]
if (debug == 0) {
  rm(tb.gin)  # free up memory
}
Map strain IDs to ORFs and add my essentiality flags.
# Translate query/array strain IDs into systematic ORF names via `dic`.
tb.gin.lenient$ORF1 <- dic$Systematic.gene.name[
  match(tb.gin.lenient$Query.Strain.ID, dic$Strain.ID)
]
tb.gin.lenient$ORF2 <- dic$Systematic.gene.name[
  match(tb.gin.lenient$Array.Strain.ID, dic$Strain.ID)
]

# Look up each ORF's essenflag in `fit`; ORFs absent from `fit` become NA.
tb.gin.lenient$essenflag1 <- fit$essenflag[
  match(tb.gin.lenient$ORF1, fit$orf)
]
tb.gin.lenient$essenflag2 <- fit$essenflag[
  match(tb.gin.lenient$ORF2, fit$orf)
]

head(tb.gin.lenient)
##    Query.Strain.ID Query.allele.name Array.Strain.ID Array.allele.name
## 1   YAL001C_tsq508        tfc3-g349e  YBL023C_tsa111            mcm2-1
## 2   YAL001C_tsq508        tfc3-g349e YBL026W_tsa1065         lsm2-5001
## 7   YAL001C_tsq508        tfc3-g349e  YBL034C_tsa950            stu1-7
## 13  YAL001C_tsq508        tfc3-g349e  YBL076C_tsa275            ils1-1
## 16  YAL001C_tsq508        tfc3-g349e  YBL097W_tsa510            brn1-9
## 25  YAL001C_tsq508        tfc3-g349e YBR029C_tsa1063         cds1-5001
##    Arraytype.Temp Genetic.interaction.score..ε.   P.value
## 1           TSA30                       -0.0348 5.042e-03
## 2           TSA30                       -0.3529 3.591e-06
## 7           TSA30                       -0.1294 1.931e-02
## 13          TSA30                       -0.0250 1.301e-04
## 16          TSA30                       -0.0808 5.582e-15
## 25          TSA30                       -0.1173 8.243e-05
##    Query.single.mutant.fitness..SMF. Array.SMF Double.mutant.fitness
## 1                             0.8285    0.9254                0.7319
## 2                             0.8285    0.9408                0.4266
## 7                             0.8285    0.6690                0.4249
## 13                            0.8285    0.8097                0.6458
## 16                            0.8285    0.5464                0.3719
## 25                            0.8285    0.9007                0.6289
##    Double.mutant.fitness.standard.deviation    ORF1    ORF2   essenflag1
## 1                                    0.0102 YAL001C YBL023C nonessential
## 2                                    0.0790 YAL001C YBL026W nonessential
## 7                                    0.0482 YAL001C YBL034C nonessential
## 13                                   0.0054 YAL001C YBL076C nonessential
## 16                                   0.0077 YAL001C YBL097W nonessential
## 25                                   0.0226 YAL001C YBR029C nonessential
##    essenflag2
## 1   essential
## 2   essential
## 7    abnormal
## 13  essential
## 16  essential
## 25  essential
# Intermediate cutoff: |epsilon| > 0.08, applied on top of the lenient filter.
# which() drops rows whose score is NA instead of turning each one into an
# all-NA row, as plain logical indexing would.
tb.gin.intermediate <- tb.gin.lenient[
  which(abs(tb.gin.lenient$Genetic.interaction.score..ε.) > 0.08),
]

# Stringent cutoff: epsilon > 0.16 (positive) or epsilon < -0.12 (negative),
# again on top of the lenient filter.
tb.gin.stringent <- tb.gin.lenient[
  which(
    tb.gin.lenient$Genetic.interaction.score..ε. > 0.16 |
      tb.gin.lenient$Genetic.interaction.score..ε. < -0.12
  ),
]

# Distribution of interaction scores after the lenient filter.
hist(tb.gin.lenient$Genetic.interaction.score..ε., breaks = 100)
summary(tb.gin.lenient)
##         Query.Strain.ID     Query.allele.name         Array.Strain.ID   
##  YJL029C_sn248  :   1364   vps53     :   1364   YIL048W_tsa188:   1368  
##  YIR033W_sn1943 :   1214   mga2      :   1214   YDL008W_tsa783:   1328  
##  YIL004C_tsq1171:   1208   bet1-1    :   1208   YDR172W_tsa28 :   1316  
##  YJR002W_tsq2069:   1124   mpp10-5001:   1124   YFL039C_tsa140:   1315  
##  YJR045C_tsq2790:   1045   ssc1-2    :   1045   YHR164C_tsa352:   1274  
##  (Other)        :1563516   (Other)   :1563516   (Other)       :1562870  
##  NA's           :      1   NA's      :      1   NA's          :      1  
##  Array.allele.name  Arraytype.Temp Genetic.interaction.score..ε.
##  neo1-2  :   1368   TSA26:523518   Min.   :-1.16160             
##  apc11-13:   1328   TSA30:  5813   1st Qu.:-0.07190             
##  sup35-td:   1316   DMA26:203445   Median :-0.01780             
##  act1-125:   1315   DMA30:836695   Mean   :-0.01934             
##  dna2-2  :   1274        :     0   3rd Qu.: 0.05520             
##  (Other) :1562870   NA's :     1   Max.   : 1.33530             
##  NA's    :      1                  NA's   :1                    
##     P.value          Query.single.mutant.fitness..SMF.   Array.SMF     
##  Min.   :0.0000000   Min.   :0.11                      Min.   :0.1137  
##  1st Qu.:0.0000455   1st Qu.:0.76                      1st Qu.:0.8243  
##  Median :0.0048550   Median :0.92                      Median :0.9525  
##  Mean   :0.0125985   Mean   :0.86                      Mean   :0.8981  
##  3rd Qu.:0.0229400   3rd Qu.:1.00                      3rd Qu.:1.0060  
##  Max.   :0.0500000   Max.   :1.14                      Max.   :1.1118  
##  NA's   :1           NA's   :165086                    NA's   :1       
##  Double.mutant.fitness Double.mutant.fitness.standard.deviation
##  Min.   :-0.2577       Min.   :0.0000                          
##  1st Qu.: 0.6044       1st Qu.:0.0090                          
##  Median : 0.8079       Median :0.0163                          
##  Mean   : 0.7654       Mean   :0.0260                          
##  3rd Qu.: 0.9619       3rd Qu.:0.0301                          
##  Max.   : 2.1866       Max.   :0.9147                          
##  NA's   :1             NA's   :1                               
##         ORF1                ORF2                essenflag1    
##  YFL039C  :   8053   YFL039C  :  14881   abnormal    : 27510  
##  YFL034C-B:   3562   YFL034C-B:   7918   essential   :493722  
##  YNL181W  :   3529   YHR036W  :   4162   nonessential:916926  
##  YBL105C  :   3090   YNL061W  :   3851   NA's        :131314  
##  YBR088C  :   2778   YJR076C  :   3789                        
##  (Other)  :1548459   (Other)  :1534870                        
##  NA's     :      1   NA's     :      1                        
##         essenflag2     
##  abnormal    :  25160  
##  essential   : 480824  
##  nonessential:1009399  
##  NA's        :  54089  
##                        
##                        
## 
# Per-column overview of the intermediate-cutoff table.
print(summary(tb.gin.intermediate))
##         Query.Strain.ID    Query.allele.name        Array.Strain.ID  
##  YNL243W_sn760  :   711   sla2      :   711   YDL008W_tsa783:  1017  
##  YDR293C_sn729  :   692   ssd1      :   692   YLR078C_tsa199:   940  
##  YKL154W_tsq1163:   678   srp102-510:   678   YDR172W_tsa28 :   935  
##  YJR045C_tsq2790:   664   ssc1-2    :   664   YFL039C_tsa140:   883  
##  YLR275W_tsq2653:   656   smd2-5005 :   656   YER157W_tsa41 :   881  
##  (Other)        :566233   (Other)   :566233   (Other)       :564978  
##  NA's           :     1   NA's      :     1   NA's          :     1  
##  Array.allele.name Arraytype.Temp Genetic.interaction.score..ε.
##  apc11-13:  1017   TSA26:256208   Min.   :-1.16160             
##  bos1-1  :   940   TSA30:  3153   1st Qu.:-0.15690             
##  sup35-td:   935   DMA26: 81038   Median :-0.09740             
##  act1-125:   883   DMA30:229235   Mean   :-0.05968             
##  cog3-1  :   881        :     0   3rd Qu.: 0.09720             
##  (Other) :564978   NA's :     1   Max.   : 1.33530             
##  NA's    :     1                  NA's   :1                    
##     P.value          Query.single.mutant.fitness..SMF.   Array.SMF     
##  Min.   :0.0000000   Min.   :0.18                      Min.   :0.1137  
##  1st Qu.:0.0000015   1st Qu.:0.76                      1st Qu.:0.7400  
##  Median :0.0016875   Median :0.89                      Median :0.8690  
##  Mean   :0.0097757   Mean   :0.85                      Mean   :0.8414  
##  3rd Qu.:0.0160700   3rd Qu.:0.99                      3rd Qu.:0.9745  
##  Max.   :0.0500000   Max.   :1.14                      Max.   :1.1118  
##  NA's   :1           NA's   :87094                     NA's   :1       
##  Double.mutant.fitness Double.mutant.fitness.standard.deviation
##  Min.   :-0.2577       Min.   :0.00000                         
##  1st Qu.: 0.4933       1st Qu.:0.02280                         
##  Median : 0.6851       Median :0.03680                         
##  Mean   : 0.6773       Mean   :0.04859                         
##  3rd Qu.: 0.8650       3rd Qu.:0.05780                         
##  Max.   : 2.1866       Max.   :0.91470                         
##  NA's   :1             NA's   :1                               
##         ORF1               ORF2               essenflag1    
##  YFL039C  :  4584   YFL039C  :  8373   abnormal    : 13170  
##  YNL181W  :  2435   YFL034C-B:  4706   essential   :221747  
##  YFL034C-B:  2028   YJR076C  :  2437   nonessential:290106  
##  YBL105C  :  1752   YDR182W  :  2224   NA's        : 44612  
##  YEL034W  :  1522   YNL061W  :  1928                        
##  (Other)  :557313   (Other)  :549966                        
##  NA's     :     1   NA's     :     1                        
##         essenflag2    
##  abnormal    : 12310  
##  essential   :234519  
##  nonessential:303111  
##  NA's        : 19695  
##                       
##                       
## 
# Per-column overview of the stringent-cutoff table.
print(summary(tb.gin.stringent))
##         Query.Strain.ID       Query.allele.name        Array.Strain.ID  
##  YNL243W_sn760  :   574   sla2         :   574   YDL008W_tsa783:   649  
##  YDR293C_sn729  :   460   ssd1         :   460   YLR268W_tsa121:   629  
##  YBR049C_tsq1348:   448   reb1-5001    :   448   YFL039C_tsa140:   518  
##  YLR275W_tsq2653:   425   smd2-5005    :   425   YER157W_tsa41 :   512  
##  YEL034W_tsq737 :   377   mob2-11-supp1:   380   YJR076C_tsa84 :   494  
##  (Other)        :250780   (Other)      :250777   (Other)       :250262  
##  NA's           :     1   NA's         :     1   NA's          :     1  
##  Array.allele.name Arraytype.Temp Genetic.interaction.score..ε.
##  apc11-13:   649   TSA26:124897   Min.   :-1.1616              
##  sec22-3 :   629   TSA30:  1638   1st Qu.:-0.2354              
##  act1-125:   518   DMA26: 35986   Median :-0.1670              
##  cog3-1  :   512   DMA30: 90543   Mean   :-0.1610              
##  cdc11-5 :   494        :     0   3rd Qu.:-0.1322              
##  (Other) :250262   NA's :     1   Max.   : 1.3353              
##  NA's    :     1                  NA's   :1                    
##     P.value          Query.single.mutant.fitness..SMF.   Array.SMF     
##  Min.   :0.0000000   Min.   :0.18                      Min.   :0.1137  
##  1st Qu.:0.0000000   1st Qu.:0.76                      1st Qu.:0.7235  
##  Median :0.0003424   Median :0.89                      Median :0.8465  
##  Mean   :0.0072841   Mean   :0.86                      Mean   :0.8220  
##  3rd Qu.:0.0092682   3rd Qu.:0.98                      3rd Qu.:0.9510  
##  Max.   :0.0500000   Max.   :1.14                      Max.   :1.1118  
##  NA's   :1           NA's   :43715                     NA's   :1       
##  Double.mutant.fitness Double.mutant.fitness.standard.deviation
##  Min.   :-0.2577       Min.   :0.0000                          
##  1st Qu.: 0.3829       1st Qu.:0.0310                          
##  Median : 0.5491       Median :0.0536                          
##  Mean   : 0.5619       Mean   :0.0686                          
##  3rd Qu.: 0.7233       3rd Qu.:0.0857                          
##  Max.   : 2.1866       Max.   :0.9147                          
##  NA's   :1             NA's   :1                               
##         ORF1               ORF2               essenflag1    
##  YFL039C  :  2200   YFL039C  :  4416   abnormal    :  6379  
##  YNL181W  :  1350   YFL034C-B:  2238   essential   :104477  
##  YFL034C-B:  1068   YJR076C  :  1381   nonessential:122575  
##  YEL034W  :   897   YDR182W  :  1215   NA's        : 19634  
##  YBL105C  :   818   YLR268W  :   950                        
##  (Other)  :246731   (Other)  :242864                        
##  NA's     :     1   NA's     :     1                        
##         essenflag2    
##  abnormal    :  6140  
##  essential   :113948  
##  nonessential:124214  
##  NA's        :  8763  
##                       
##                       
## 

No comments:

Post a Comment