Monday, November 27, 2017

BioGRID yeast PPI analysis



# Load the BioGRID yeast interaction table and keep the physical
# (protein-protein) interactions whose two systematic names are both present.
# The workspace is intentionally not wiped with rm(list = ls()) any more:
# that silently destroys unrelated objects of whoever sources this script.
# Start a fresh R session if a clean environment is needed.
datapath <- "~/data/biogrid/"
list.files(path = datapath, pattern = "csv")
## [1] "yeast-biogrid-smalltest.csv" "yeast-biogrid.csv"
debug <- 1  # the raw table `tb` is removed later only when debug == 0
# tb <- read.csv(paste0(datapath, "yeast-biogrid-smalltest.csv"))
tb <- read.csv(paste0(datapath, "yeast-biogrid.csv"))
tb$Systematic.Name.Interactor.A <- as.character(tb$Systematic.Name.Interactor.A)
tb$Systematic.Name.Interactor.B <- as.character(tb$Systematic.Name.Interactor.B)
# %in% never returns NA, so a missing Experimental.System.Type cannot
# inject an all-NA row the way `tb[tb$col == "physical", ]` does.
is_physical <- tb$Experimental.System.Type %in% "physical"
has_both_names <- !is.na(tb$Systematic.Name.Interactor.A) &
  !is.na(tb$Systematic.Name.Interactor.B)
tb.ppi <- tb[is_physical & has_both_names, ]
# Row counts noted for the original step-by-step filter on the full file:
# 142657 (physical) -> 141962 (A present) -> 141640 (B present).
# tb.ppi$Synonyms.Interactor.A <- as.character(tb.ppi$Synonyms.Interactor.A)
# tb.ppi$Synonyms.Interactor.B <- as.character(tb.ppi$Synonyms.Interactor.B)
str(tb.ppi)
## 'data.frame':    141640 obs. of  36 variables:
##  $ X.BioGRID.Interaction.ID    : int  68770 68771 68774 68775 68778 68779 68782 68783 68786 68787 ...
##  $ Entrez.Gene.Interactor.A    : int  851136 854020 851136 854290 851136 852582 851136 854933 851136 852607 ...
##  $ Entrez.Gene.Interactor.B    : int  854020 851136 854290 851136 852582 851136 854933 851136 852607 851136 ...
##  $ BioGRID.ID.Interactor.A     : int  31676 34272 31676 34518 31676 32973 31676 35097 31676 32996 ...
##  $ BioGRID.ID.Interactor.B     : int  34272 31676 34518 31676 32973 31676 35097 31676 32996 31676 ...
##  $ Systematic.Name.Interactor.A: chr  "YLR418C" "YOL145C" "YLR418C" "YOR123C" ...
##  $ Systematic.Name.Interactor.B: chr  "YOL145C" "YLR418C" "YOR123C" "YLR418C" ...
##  $ Official.Symbol.Interactor.A: Factor w/ 5925 levels "1-Oct","15S_RRNA",..: 631 860 631 2072 631 2818 631 3081 631 3783 ...
##  $ Official.Symbol.Interactor.B: Factor w/ 6376 levels "","1-Oct","15S_RRNA",..: 906 667 2196 667 2962 667 3236 667 3955 667 ...
##  $ Synonyms.Interactor.A       : Factor w/ 4432 levels "","(R","[acyl-carrier-protein] S-malonyltransferase",..: 1954 552 1954 1697 1954 1928 1954 1083 1954 700 ...
##  $ Synonyms.Interactor.B       : Factor w/ 4746 levels "","(R","[acyl-carrier-protein] S-malonyltransferase",..: 602 2089 1813 2089 2063 2089 1154 2089 752 2089 ...
##  $ Experimental.System         : Factor w/ 28 levels "","Affinity Capture-Luminescence",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Experimental.System.Type    : Factor w/ 3 levels "","genetic","physical": 3 3 3 3 3 3 3 3 3 3 ...
##  $ Author                      : Factor w/ 13744 levels "","Aalto MK (1993)",..: 6197 6197 6197 6197 6197 6197 6197 6197 6197 6197 ...
##  $ Pubmed.ID                   : int  14759368 14759368 14759368 14759368 14759368 14759368 14759368 14759368 14759368 14759368 ...
##  $ Organism.Interactor.A       : int  559292 559292 559292 559292 559292 559292 559292 559292 559292 559292 ...
##  $ Organism.Interactor.B       : int  559292 559292 559292 559292 559292 559292 559292 559292 559292 559292 ...
##  $ Throughput                  : Factor w/ 4 levels "","High Throughput",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Score                       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Modification                : Factor w/ 17 levels "","Acetylation",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ Phenotypes                  : Factor w/ 2172 levels "","acid pH resistance:partial rescue",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ Qualifications              : Factor w/ 334656 levels "","(F20A","(PAB1) as a dosage suppressor of the Slg phenotype of the rpl33a G76R",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ Tags                        : logi  NA NA NA NA NA NA ...
##  $ Source.Database             : Factor w/ 2 levels "","BIOGRID": 2 2 2 2 2 2 2 2 2 2 ...
##  $ X                           : Factor w/ 12 levels ""," P-value = 0.0004064]|alleles: arc35-6 - act1-136 [SGA score = -0.3705",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.1                         : Factor w/ 11 levels ""," P-value = 0.0001617]|alleles: pbr1-5012 - pkc1-1 [SGA score = -0.7905",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.2                         : Factor w/ 10 levels ""," P-value = 0.001011]|alleles: bet4-5007 - act1-124 [SGA score = -0.2097",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.3                         : Factor w/ 9 levels ""," P-value = 0.000581]|alleles: pkc1-4 - act1-136 [SGA score = -0.2153",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.4                         : Factor w/ 8 levels ""," P-value = 0.001378]|alleles: arc35-6 - act1-4 [SGA score = -0.1451",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.5                         : Factor w/ 8 levels ""," P-value = 0.0001648]|alleles: bet4-5007 - act1-155 [SGA score = -0.1568",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.6                         : Factor w/ 6 levels ""," P-value = 0.0009274]|alleles: bet4-5007 - act1-159 [SGA score = -0.1793",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.7                         : Factor w/ 5 levels ""," P-value = 0.0002055]|alleles: act1-4 - pfy1-13 [SGA score = -0.3401",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.8                         : Factor w/ 4 levels ""," P-value = 0.0002784]|alleles: act1-159 - pkc1-4 [SGA score = -0.1767",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.9                         : Factor w/ 3 levels ""," P-value = 0.01853]|alleles: act1-3 - pkc1-2 [SGA score = -0.2618",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.10                        : Factor w/ 3 levels ""," P-value = 1.166E-9]\tNA\tBIOGRID",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.11                        : Factor w/ 2 levels ""," P-value = 0.0001836]\tNA\tBIOGRID": 1 1 1 1 1 1 1 1 1 1 ...
# Optionally free the raw table, then give every interaction an
# order-independent key "<smaller ORF>_<larger ORF>" so that the A-B and
# B-A reports of the same pair end up with the same `ordered_pairs` value.
if (debug == 0) {
  rm(tb)
}
total <- nrow(tb.ppi)
orf_a <- as.character(tb.ppi$Systematic.Name.Interactor.A)
orf_b <- as.character(tb.ppi$Systematic.Name.Interactor.B)
# pmin()/pmax() compare the two name vectors element-wise, which replaces
# the per-row sort() inside a loop over all rows. The whole column is built
# in one pass, so the per-1000-row progress messages the loop used to print
# are no longer emitted.
ordered_pairs <- paste(pmin(orf_a, orf_b), pmax(orf_a, orf_b), sep = "_")
# A pair with either name missing cannot be keyed; mark it explicitly
# (the loop only checked interactor A).
ordered_pairs[is.na(orf_a) | is.na(orf_b)] <- "NA_found"
tb.ppi$ordered_pairs <- ordered_pairs
## [1] "1000 :: 140640"
## [1] "2000 :: 139640"
## [1] "3000 :: 138640"
## [1] "4000 :: 137640"
## [1] "5000 :: 136640"
## [1] "6000 :: 135640"
## [1] "7000 :: 134640"
## [1] "8000 :: 133640"
## [1] "9000 :: 132640"
## [1] "10000 :: 131640"
## [1] "11000 :: 130640"
## [1] "12000 :: 129640"
## [1] "13000 :: 128640"
## [1] "14000 :: 127640"
## [1] "15000 :: 126640"
## [1] "16000 :: 125640"
## [1] "17000 :: 124640"
## [1] "18000 :: 123640"
## [1] "19000 :: 122640"
## [1] "20000 :: 121640"
## [1] "21000 :: 120640"
## [1] "22000 :: 119640"
## [1] "23000 :: 118640"
## [1] "24000 :: 117640"
## [1] "25000 :: 116640"
## [1] "26000 :: 115640"
## [1] "27000 :: 114640"
## [1] "28000 :: 113640"
## [1] "29000 :: 112640"
## [1] "30000 :: 111640"
## [1] "31000 :: 110640"
## [1] "32000 :: 109640"
## [1] "33000 :: 108640"
## [1] "34000 :: 107640"
## [1] "35000 :: 106640"
## [1] "36000 :: 105640"
## [1] "37000 :: 104640"
## [1] "38000 :: 103640"
## [1] "39000 :: 102640"
## [1] "40000 :: 101640"
## [1] "41000 :: 100640"
## [1] "42000 :: 99640"
## [1] "43000 :: 98640"
## [1] "44000 :: 97640"
## [1] "45000 :: 96640"
## [1] "46000 :: 95640"
## [1] "47000 :: 94640"
## [1] "48000 :: 93640"
## [1] "49000 :: 92640"
## [1] "50000 :: 91640"
## [1] "51000 :: 90640"
## [1] "52000 :: 89640"
## [1] "53000 :: 88640"
## [1] "54000 :: 87640"
## [1] "55000 :: 86640"
## [1] "56000 :: 85640"
## [1] "57000 :: 84640"
## [1] "58000 :: 83640"
## [1] "59000 :: 82640"
## [1] "60000 :: 81640"
## [1] "61000 :: 80640"
## [1] "62000 :: 79640"
## [1] "63000 :: 78640"
## [1] "64000 :: 77640"
## [1] "65000 :: 76640"
## [1] "66000 :: 75640"
## [1] "67000 :: 74640"
## [1] "68000 :: 73640"
## [1] "69000 :: 72640"
## [1] "70000 :: 71640"
## [1] "71000 :: 70640"
## [1] "72000 :: 69640"
## [1] "73000 :: 68640"
## [1] "74000 :: 67640"
## [1] "75000 :: 66640"
## [1] "76000 :: 65640"
## [1] "77000 :: 64640"
## [1] "78000 :: 63640"
## [1] "79000 :: 62640"
## [1] "80000 :: 61640"
## [1] "81000 :: 60640"
## [1] "82000 :: 59640"
## [1] "83000 :: 58640"
## [1] "84000 :: 57640"
## [1] "85000 :: 56640"
## [1] "86000 :: 55640"
## [1] "87000 :: 54640"
## [1] "88000 :: 53640"
## [1] "89000 :: 52640"
## [1] "90000 :: 51640"
## [1] "91000 :: 50640"
## [1] "92000 :: 49640"
## [1] "93000 :: 48640"
## [1] "94000 :: 47640"
## [1] "95000 :: 46640"
## [1] "96000 :: 45640"
## [1] "97000 :: 44640"
## [1] "98000 :: 43640"
## [1] "99000 :: 42640"
## [1] "100000 :: 41640"
## [1] "101000 :: 40640"
## [1] "102000 :: 39640"
## [1] "103000 :: 38640"
## [1] "104000 :: 37640"
## [1] "105000 :: 36640"
## [1] "106000 :: 35640"
## [1] "107000 :: 34640"
## [1] "108000 :: 33640"
## [1] "109000 :: 32640"
## [1] "110000 :: 31640"
## [1] "111000 :: 30640"
## [1] "112000 :: 29640"
## [1] "113000 :: 28640"
## [1] "114000 :: 27640"
## [1] "115000 :: 26640"
## [1] "116000 :: 25640"
## [1] "117000 :: 24640"
## [1] "118000 :: 23640"
## [1] "119000 :: 22640"
## [1] "120000 :: 21640"
## [1] "121000 :: 20640"
## [1] "122000 :: 19640"
## [1] "123000 :: 18640"
## [1] "124000 :: 17640"
## [1] "125000 :: 16640"
## [1] "126000 :: 15640"
## [1] "127000 :: 14640"
## [1] "128000 :: 13640"
## [1] "129000 :: 12640"
## [1] "130000 :: 11640"
## [1] "131000 :: 10640"
## [1] "132000 :: 9640"
## [1] "133000 :: 8640"
## [1] "134000 :: 7640"
## [1] "135000 :: 6640"
## [1] "136000 :: 5640"
## [1] "137000 :: 4640"
## [1] "138000 :: 3640"
## [1] "139000 :: 2640"
## [1] "140000 :: 1640"
## [1] "141000 :: 640"
# Show the structure of the interaction table.
utils::str(object = tb.ppi)
## 'data.frame':    141640 obs. of  37 variables:
##  $ X.BioGRID.Interaction.ID    : int  68770 68771 68774 68775 68778 68779 68782 68783 68786 68787 ...
##  $ Entrez.Gene.Interactor.A    : int  851136 854020 851136 854290 851136 852582 851136 854933 851136 852607 ...
##  $ Entrez.Gene.Interactor.B    : int  854020 851136 854290 851136 852582 851136 854933 851136 852607 851136 ...
##  $ BioGRID.ID.Interactor.A     : int  31676 34272 31676 34518 31676 32973 31676 35097 31676 32996 ...
##  $ BioGRID.ID.Interactor.B     : int  34272 31676 34518 31676 32973 31676 35097 31676 32996 31676 ...
##  $ Systematic.Name.Interactor.A: chr  "YLR418C" "YOL145C" "YLR418C" "YOR123C" ...
##  $ Systematic.Name.Interactor.B: chr  "YOL145C" "YLR418C" "YOR123C" "YLR418C" ...
##  $ Official.Symbol.Interactor.A: Factor w/ 5925 levels "1-Oct","15S_RRNA",..: 631 860 631 2072 631 2818 631 3081 631 3783 ...
##  $ Official.Symbol.Interactor.B: Factor w/ 6376 levels "","1-Oct","15S_RRNA",..: 906 667 2196 667 2962 667 3236 667 3955 667 ...
##  $ Synonyms.Interactor.A       : Factor w/ 4432 levels "","(R","[acyl-carrier-protein] S-malonyltransferase",..: 1954 552 1954 1697 1954 1928 1954 1083 1954 700 ...
##  $ Synonyms.Interactor.B       : Factor w/ 4746 levels "","(R","[acyl-carrier-protein] S-malonyltransferase",..: 602 2089 1813 2089 2063 2089 1154 2089 752 2089 ...
##  $ Experimental.System         : Factor w/ 28 levels "","Affinity Capture-Luminescence",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Experimental.System.Type    : Factor w/ 3 levels "","genetic","physical": 3 3 3 3 3 3 3 3 3 3 ...
##  $ Author                      : Factor w/ 13744 levels "","Aalto MK (1993)",..: 6197 6197 6197 6197 6197 6197 6197 6197 6197 6197 ...
##  $ Pubmed.ID                   : int  14759368 14759368 14759368 14759368 14759368 14759368 14759368 14759368 14759368 14759368 ...
##  $ Organism.Interactor.A       : int  559292 559292 559292 559292 559292 559292 559292 559292 559292 559292 ...
##  $ Organism.Interactor.B       : int  559292 559292 559292 559292 559292 559292 559292 559292 559292 559292 ...
##  $ Throughput                  : Factor w/ 4 levels "","High Throughput",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Score                       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Modification                : Factor w/ 17 levels "","Acetylation",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ Phenotypes                  : Factor w/ 2172 levels "","acid pH resistance:partial rescue",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ Qualifications              : Factor w/ 334656 levels "","(F20A","(PAB1) as a dosage suppressor of the Slg phenotype of the rpl33a G76R",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ Tags                        : logi  NA NA NA NA NA NA ...
##  $ Source.Database             : Factor w/ 2 levels "","BIOGRID": 2 2 2 2 2 2 2 2 2 2 ...
##  $ X                           : Factor w/ 12 levels ""," P-value = 0.0004064]|alleles: arc35-6 - act1-136 [SGA score = -0.3705",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.1                         : Factor w/ 11 levels ""," P-value = 0.0001617]|alleles: pbr1-5012 - pkc1-1 [SGA score = -0.7905",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.2                         : Factor w/ 10 levels ""," P-value = 0.001011]|alleles: bet4-5007 - act1-124 [SGA score = -0.2097",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.3                         : Factor w/ 9 levels ""," P-value = 0.000581]|alleles: pkc1-4 - act1-136 [SGA score = -0.2153",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.4                         : Factor w/ 8 levels ""," P-value = 0.001378]|alleles: arc35-6 - act1-4 [SGA score = -0.1451",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.5                         : Factor w/ 8 levels ""," P-value = 0.0001648]|alleles: bet4-5007 - act1-155 [SGA score = -0.1568",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.6                         : Factor w/ 6 levels ""," P-value = 0.0009274]|alleles: bet4-5007 - act1-159 [SGA score = -0.1793",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.7                         : Factor w/ 5 levels ""," P-value = 0.0002055]|alleles: act1-4 - pfy1-13 [SGA score = -0.3401",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.8                         : Factor w/ 4 levels ""," P-value = 0.0002784]|alleles: act1-159 - pkc1-4 [SGA score = -0.1767",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.9                         : Factor w/ 3 levels ""," P-value = 0.01853]|alleles: act1-3 - pkc1-2 [SGA score = -0.2618",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.10                        : Factor w/ 3 levels ""," P-value = 1.166E-9]\tNA\tBIOGRID",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.11                        : Factor w/ 2 levels ""," P-value = 0.0001836]\tNA\tBIOGRID": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ordered_pairs               : chr  "YLR418C_YOL145C" "YLR418C_YOL145C" "YLR418C_YOR123C" "YLR418C_YOR123C" ...
How many unique BioGRID PPIs are there?
# Flag the first row reporting each ordered pair; later repeats are FALSE.
first_report <- !duplicated(tb.ppi$ordered_pairs)
unique.ordered_pairs <- tb.ppi$ordered_pairs[first_report]
# Fraction of rows that are distinct pairs.
sum(first_report) / nrow(tb.ppi)
## [1] 0.6522522
# Keep one row (the first one seen) per distinct pair.
tb.ppi.unique <- tb.ppi[first_report, ]
Identify essential and nonessential genes
# Read the fitness summary table with its nine columns kept as text.
tb.fit <- read.csv(
  file = "fitness.data/SummaryRegressionHetHom20171122.csv",
  colClasses = rep.int("character", 9L)
)

How many interactions per essential gene?

First find the number of interactions per gene, then pick out the essential genes.
# Count how often each ORF appears on either side of a unique interaction.
degree <- table(c(
  tb.ppi.unique$Systematic.Name.Interactor.A,
  tb.ppi.unique$Systematic.Name.Interactor.B
))
# One row per ORF: its name (character) and its count.
net <- data.frame(
  orf = names(degree),
  degree = as.vector(degree),
  stringsAsFactors = FALSE
)
Match essentiality to BioGRID PPI
# Look up each ORF's essentiality label in the fitness table; ORFs that are
# absent from tb.fit get NA.
net$essentiality <- tb.fit$essentiality[match(net$orf, tb.fit$orf)]
# `NA == "essential"` is NA, and an NA row index makes `[` return an all-NA
# row. which() keeps only the rows where the comparison is TRUE, so ORFs
# with unknown essentiality no longer leak into either subset.
net.nonessential <- net[which(net$essentiality == "nonessential"), ]
net.essential <- net[which(net$essentiality == "essential"), ]
# The summaries below are the recorded ones with the 838 all-NA rows removed:
# only Length changes (4910 -> 4072 and 1894 -> 1056) and the "NA's" line
# disappears; the degree statistics already ignored the NA rows.
summary(net.nonessential)
##      orf                degree        essentiality      
##  Length:4072        Min.   :   1.00   Length:4072       
##  Class :character   1st Qu.:   6.00   Class :character  
##  Mode  :character   Median :  14.00   Mode  :character  
##                     Mean   :  28.27                     
##                     3rd Qu.:  30.00                     
##                     Max.   :1771.00                     
summary(net.essential)
##      orf                degree        essentiality      
##  Length:1056        Min.   :   1.00   Length:1056       
##  Class :character   1st Qu.:  17.00   Class :character  
##  Mode  :character   Median :  35.00   Mode  :character  
##                     Mean   :  52.56                     
##                     3rd Qu.:  61.00                     
##                     Max.   :2558.00                     

No comments:

Post a Comment