BIOGRID yeast PPI analysis
H Qin
11/16-27/2017
# Location of the BioGRID CSV exports. The trailing slash is kept on purpose:
# the path is concatenated directly with a file name when the table is read.
# The workspace is deliberately NOT cleared here: `rm(list = ls())` silently
# destroys every object the caller already has in the session.
datapath <- "~/data/biogrid/"

# List the CSV files available in the data directory. `pattern` is a regular
# expression, so it is anchored to match the ".csv" extension only.
list.files(path = datapath, pattern = "\\.csv$")
## [1] "yeast-biogrid-smalltest.csv" "yeast-biogrid.csv"
# Debug flag; it is consulted later in the script.
debug <- 1

# Small test file, kept for quick runs:
# tb <- read.csv(paste0(datapath, "yeast-biogrid-smalltest.csv"))
tb <- read.csv(paste0(datapath, "yeast-biogrid.csv"))

# Store the systematic ORF names as plain character vectors.
tb[["Systematic.Name.Interactor.A"]] <-
  as.character(tb[["Systematic.Name.Interactor.A"]])
tb[["Systematic.Name.Interactor.B"]] <-
  as.character(tb[["Systematic.Name.Interactor.B"]])

# Physical interactions only: 142657 rows.
tb.ppi <- tb[tb[["Experimental.System.Type"]] == "physical", ]
# Drop rows without a systematic name for interactor A: 141962 rows.
tb.ppi <- tb.ppi[!is.na(tb.ppi[["Systematic.Name.Interactor.A"]]), ]
# Drop rows without a systematic name for interactor B: 141640 rows.
tb.ppi <- tb.ppi[!is.na(tb.ppi[["Systematic.Name.Interactor.B"]]), ]

# The synonym columns are left exactly as read:
# tb.ppi$Synonyms.Interactor.A <- as.character(tb.ppi$Synonyms.Interactor.A)
# tb.ppi$Synonyms.Interactor.B <- as.character(tb.ppi$Synonyms.Interactor.B)
str(tb.ppi)
## 'data.frame': 141640 obs. of 36 variables:
## $ X.BioGRID.Interaction.ID : int 68770 68771 68774 68775 68778 68779 68782 68783 68786 68787 ...
## $ Entrez.Gene.Interactor.A : int 851136 854020 851136 854290 851136 852582 851136 854933 851136 852607 ...
## $ Entrez.Gene.Interactor.B : int 854020 851136 854290 851136 852582 851136 854933 851136 852607 851136 ...
## $ BioGRID.ID.Interactor.A : int 31676 34272 31676 34518 31676 32973 31676 35097 31676 32996 ...
## $ BioGRID.ID.Interactor.B : int 34272 31676 34518 31676 32973 31676 35097 31676 32996 31676 ...
## $ Systematic.Name.Interactor.A: chr "YLR418C" "YOL145C" "YLR418C" "YOR123C" ...
## $ Systematic.Name.Interactor.B: chr "YOL145C" "YLR418C" "YOR123C" "YLR418C" ...
## $ Official.Symbol.Interactor.A: Factor w/ 5925 levels "1-Oct","15S_RRNA",..: 631 860 631 2072 631 2818 631 3081 631 3783 ...
## $ Official.Symbol.Interactor.B: Factor w/ 6376 levels "","1-Oct","15S_RRNA",..: 906 667 2196 667 2962 667 3236 667 3955 667 ...
## $ Synonyms.Interactor.A : Factor w/ 4432 levels "","(R","[acyl-carrier-protein] S-malonyltransferase",..: 1954 552 1954 1697 1954 1928 1954 1083 1954 700 ...
## $ Synonyms.Interactor.B : Factor w/ 4746 levels "","(R","[acyl-carrier-protein] S-malonyltransferase",..: 602 2089 1813 2089 2063 2089 1154 2089 752 2089 ...
## $ Experimental.System : Factor w/ 28 levels "","Affinity Capture-Luminescence",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Experimental.System.Type : Factor w/ 3 levels "","genetic","physical": 3 3 3 3 3 3 3 3 3 3 ...
## $ Author : Factor w/ 13744 levels "","Aalto MK (1993)",..: 6197 6197 6197 6197 6197 6197 6197 6197 6197 6197 ...
## $ Pubmed.ID : int 14759368 14759368 14759368 14759368 14759368 14759368 14759368 14759368 14759368 14759368 ...
## $ Organism.Interactor.A : int 559292 559292 559292 559292 559292 559292 559292 559292 559292 559292 ...
## $ Organism.Interactor.B : int 559292 559292 559292 559292 559292 559292 559292 559292 559292 559292 ...
## $ Throughput : Factor w/ 4 levels "","High Throughput",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Score : num NA NA NA NA NA NA NA NA NA NA ...
## $ Modification : Factor w/ 17 levels "","Acetylation",..: NA NA NA NA NA NA NA NA NA NA ...
## $ Phenotypes : Factor w/ 2172 levels "","acid pH resistance:partial rescue",..: NA NA NA NA NA NA NA NA NA NA ...
## $ Qualifications : Factor w/ 334656 levels "","(F20A","(PAB1) as a dosage suppressor of the Slg phenotype of the rpl33a G76R",..: NA NA NA NA NA NA NA NA NA NA ...
## $ Tags : logi NA NA NA NA NA NA ...
## $ Source.Database : Factor w/ 2 levels "","BIOGRID": 2 2 2 2 2 2 2 2 2 2 ...
## $ X : Factor w/ 12 levels ""," P-value = 0.0004064]|alleles: arc35-6 - act1-136 [SGA score = -0.3705",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.1 : Factor w/ 11 levels ""," P-value = 0.0001617]|alleles: pbr1-5012 - pkc1-1 [SGA score = -0.7905",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.2 : Factor w/ 10 levels ""," P-value = 0.001011]|alleles: bet4-5007 - act1-124 [SGA score = -0.2097",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.3 : Factor w/ 9 levels ""," P-value = 0.000581]|alleles: pkc1-4 - act1-136 [SGA score = -0.2153",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.4 : Factor w/ 8 levels ""," P-value = 0.001378]|alleles: arc35-6 - act1-4 [SGA score = -0.1451",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.5 : Factor w/ 8 levels ""," P-value = 0.0001648]|alleles: bet4-5007 - act1-155 [SGA score = -0.1568",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.6 : Factor w/ 6 levels ""," P-value = 0.0009274]|alleles: bet4-5007 - act1-159 [SGA score = -0.1793",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.7 : Factor w/ 5 levels ""," P-value = 0.0002055]|alleles: act1-4 - pfy1-13 [SGA score = -0.3401",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.8 : Factor w/ 4 levels ""," P-value = 0.0002784]|alleles: act1-159 - pkc1-4 [SGA score = -0.1767",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.9 : Factor w/ 3 levels ""," P-value = 0.01853]|alleles: act1-3 - pkc1-2 [SGA score = -0.2618",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.10 : Factor w/ 3 levels ""," P-value = 1.166E-9]\tNA\tBIOGRID",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.11 : Factor w/ 2 levels ""," P-value = 0.0001836]\tNA\tBIOGRID": 1 1 1 1 1 1 1 1 1 1 ...
# Free the raw table once the filtered copy exists (skipped while debugging).
if (debug == 0) {
  rm(tb)
}

# Build an order-independent key for every interaction record so that an
# A-B record and its B-A counterpart share the same key.
#
# pmin()/pmax() compare the two name columns element-wise, so the whole column
# is produced in one vectorised pass. This replaces a row-by-row loop that
# called sort() on a one-row data.frame and printed progress every 1000 rows;
# the vectorised form needs no progress messages and also handles a table
# with zero rows (where `1:total` would have iterated over c(1, 0)).
total <- nrow(tb.ppi)
orf.a <- as.character(tb.ppi$Systematic.Name.Interactor.A)
orf.b <- as.character(tb.ppi$Systematic.Name.Interactor.B)
pair.key <- paste(pmin(orf.a, orf.b), pmax(orf.a, orf.b), sep = "_")

# A pair with a missing name cannot be ordered; flag it explicitly.
pair.key[is.na(orf.a) | is.na(orf.b)] <- "NA_found"
tb.ppi$ordered_pairs <- pair.key
## [1] "1000 :: 140640"
## [1] "2000 :: 139640"
## [1] "3000 :: 138640"
## [1] "4000 :: 137640"
## [1] "5000 :: 136640"
## [1] "6000 :: 135640"
## [1] "7000 :: 134640"
## [1] "8000 :: 133640"
## [1] "9000 :: 132640"
## [1] "10000 :: 131640"
## [1] "11000 :: 130640"
## [1] "12000 :: 129640"
## [1] "13000 :: 128640"
## [1] "14000 :: 127640"
## [1] "15000 :: 126640"
## [1] "16000 :: 125640"
## [1] "17000 :: 124640"
## [1] "18000 :: 123640"
## [1] "19000 :: 122640"
## [1] "20000 :: 121640"
## [1] "21000 :: 120640"
## [1] "22000 :: 119640"
## [1] "23000 :: 118640"
## [1] "24000 :: 117640"
## [1] "25000 :: 116640"
## [1] "26000 :: 115640"
## [1] "27000 :: 114640"
## [1] "28000 :: 113640"
## [1] "29000 :: 112640"
## [1] "30000 :: 111640"
## [1] "31000 :: 110640"
## [1] "32000 :: 109640"
## [1] "33000 :: 108640"
## [1] "34000 :: 107640"
## [1] "35000 :: 106640"
## [1] "36000 :: 105640"
## [1] "37000 :: 104640"
## [1] "38000 :: 103640"
## [1] "39000 :: 102640"
## [1] "40000 :: 101640"
## [1] "41000 :: 100640"
## [1] "42000 :: 99640"
## [1] "43000 :: 98640"
## [1] "44000 :: 97640"
## [1] "45000 :: 96640"
## [1] "46000 :: 95640"
## [1] "47000 :: 94640"
## [1] "48000 :: 93640"
## [1] "49000 :: 92640"
## [1] "50000 :: 91640"
## [1] "51000 :: 90640"
## [1] "52000 :: 89640"
## [1] "53000 :: 88640"
## [1] "54000 :: 87640"
## [1] "55000 :: 86640"
## [1] "56000 :: 85640"
## [1] "57000 :: 84640"
## [1] "58000 :: 83640"
## [1] "59000 :: 82640"
## [1] "60000 :: 81640"
## [1] "61000 :: 80640"
## [1] "62000 :: 79640"
## [1] "63000 :: 78640"
## [1] "64000 :: 77640"
## [1] "65000 :: 76640"
## [1] "66000 :: 75640"
## [1] "67000 :: 74640"
## [1] "68000 :: 73640"
## [1] "69000 :: 72640"
## [1] "70000 :: 71640"
## [1] "71000 :: 70640"
## [1] "72000 :: 69640"
## [1] "73000 :: 68640"
## [1] "74000 :: 67640"
## [1] "75000 :: 66640"
## [1] "76000 :: 65640"
## [1] "77000 :: 64640"
## [1] "78000 :: 63640"
## [1] "79000 :: 62640"
## [1] "80000 :: 61640"
## [1] "81000 :: 60640"
## [1] "82000 :: 59640"
## [1] "83000 :: 58640"
## [1] "84000 :: 57640"
## [1] "85000 :: 56640"
## [1] "86000 :: 55640"
## [1] "87000 :: 54640"
## [1] "88000 :: 53640"
## [1] "89000 :: 52640"
## [1] "90000 :: 51640"
## [1] "91000 :: 50640"
## [1] "92000 :: 49640"
## [1] "93000 :: 48640"
## [1] "94000 :: 47640"
## [1] "95000 :: 46640"
## [1] "96000 :: 45640"
## [1] "97000 :: 44640"
## [1] "98000 :: 43640"
## [1] "99000 :: 42640"
## [1] "100000 :: 41640"
## [1] "101000 :: 40640"
## [1] "102000 :: 39640"
## [1] "103000 :: 38640"
## [1] "104000 :: 37640"
## [1] "105000 :: 36640"
## [1] "106000 :: 35640"
## [1] "107000 :: 34640"
## [1] "108000 :: 33640"
## [1] "109000 :: 32640"
## [1] "110000 :: 31640"
## [1] "111000 :: 30640"
## [1] "112000 :: 29640"
## [1] "113000 :: 28640"
## [1] "114000 :: 27640"
## [1] "115000 :: 26640"
## [1] "116000 :: 25640"
## [1] "117000 :: 24640"
## [1] "118000 :: 23640"
## [1] "119000 :: 22640"
## [1] "120000 :: 21640"
## [1] "121000 :: 20640"
## [1] "122000 :: 19640"
## [1] "123000 :: 18640"
## [1] "124000 :: 17640"
## [1] "125000 :: 16640"
## [1] "126000 :: 15640"
## [1] "127000 :: 14640"
## [1] "128000 :: 13640"
## [1] "129000 :: 12640"
## [1] "130000 :: 11640"
## [1] "131000 :: 10640"
## [1] "132000 :: 9640"
## [1] "133000 :: 8640"
## [1] "134000 :: 7640"
## [1] "135000 :: 6640"
## [1] "136000 :: 5640"
## [1] "137000 :: 4640"
## [1] "138000 :: 3640"
## [1] "139000 :: 2640"
## [1] "140000 :: 1640"
## [1] "141000 :: 640"
# Show the structure of the interaction table.
utils::str(tb.ppi)
## 'data.frame': 141640 obs. of 37 variables:
## $ X.BioGRID.Interaction.ID : int 68770 68771 68774 68775 68778 68779 68782 68783 68786 68787 ...
## $ Entrez.Gene.Interactor.A : int 851136 854020 851136 854290 851136 852582 851136 854933 851136 852607 ...
## $ Entrez.Gene.Interactor.B : int 854020 851136 854290 851136 852582 851136 854933 851136 852607 851136 ...
## $ BioGRID.ID.Interactor.A : int 31676 34272 31676 34518 31676 32973 31676 35097 31676 32996 ...
## $ BioGRID.ID.Interactor.B : int 34272 31676 34518 31676 32973 31676 35097 31676 32996 31676 ...
## $ Systematic.Name.Interactor.A: chr "YLR418C" "YOL145C" "YLR418C" "YOR123C" ...
## $ Systematic.Name.Interactor.B: chr "YOL145C" "YLR418C" "YOR123C" "YLR418C" ...
## $ Official.Symbol.Interactor.A: Factor w/ 5925 levels "1-Oct","15S_RRNA",..: 631 860 631 2072 631 2818 631 3081 631 3783 ...
## $ Official.Symbol.Interactor.B: Factor w/ 6376 levels "","1-Oct","15S_RRNA",..: 906 667 2196 667 2962 667 3236 667 3955 667 ...
## $ Synonyms.Interactor.A : Factor w/ 4432 levels "","(R","[acyl-carrier-protein] S-malonyltransferase",..: 1954 552 1954 1697 1954 1928 1954 1083 1954 700 ...
## $ Synonyms.Interactor.B : Factor w/ 4746 levels "","(R","[acyl-carrier-protein] S-malonyltransferase",..: 602 2089 1813 2089 2063 2089 1154 2089 752 2089 ...
## $ Experimental.System : Factor w/ 28 levels "","Affinity Capture-Luminescence",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Experimental.System.Type : Factor w/ 3 levels "","genetic","physical": 3 3 3 3 3 3 3 3 3 3 ...
## $ Author : Factor w/ 13744 levels "","Aalto MK (1993)",..: 6197 6197 6197 6197 6197 6197 6197 6197 6197 6197 ...
## $ Pubmed.ID : int 14759368 14759368 14759368 14759368 14759368 14759368 14759368 14759368 14759368 14759368 ...
## $ Organism.Interactor.A : int 559292 559292 559292 559292 559292 559292 559292 559292 559292 559292 ...
## $ Organism.Interactor.B : int 559292 559292 559292 559292 559292 559292 559292 559292 559292 559292 ...
## $ Throughput : Factor w/ 4 levels "","High Throughput",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Score : num NA NA NA NA NA NA NA NA NA NA ...
## $ Modification : Factor w/ 17 levels "","Acetylation",..: NA NA NA NA NA NA NA NA NA NA ...
## $ Phenotypes : Factor w/ 2172 levels "","acid pH resistance:partial rescue",..: NA NA NA NA NA NA NA NA NA NA ...
## $ Qualifications : Factor w/ 334656 levels "","(F20A","(PAB1) as a dosage suppressor of the Slg phenotype of the rpl33a G76R",..: NA NA NA NA NA NA NA NA NA NA ...
## $ Tags : logi NA NA NA NA NA NA ...
## $ Source.Database : Factor w/ 2 levels "","BIOGRID": 2 2 2 2 2 2 2 2 2 2 ...
## $ X : Factor w/ 12 levels ""," P-value = 0.0004064]|alleles: arc35-6 - act1-136 [SGA score = -0.3705",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.1 : Factor w/ 11 levels ""," P-value = 0.0001617]|alleles: pbr1-5012 - pkc1-1 [SGA score = -0.7905",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.2 : Factor w/ 10 levels ""," P-value = 0.001011]|alleles: bet4-5007 - act1-124 [SGA score = -0.2097",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.3 : Factor w/ 9 levels ""," P-value = 0.000581]|alleles: pkc1-4 - act1-136 [SGA score = -0.2153",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.4 : Factor w/ 8 levels ""," P-value = 0.001378]|alleles: arc35-6 - act1-4 [SGA score = -0.1451",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.5 : Factor w/ 8 levels ""," P-value = 0.0001648]|alleles: bet4-5007 - act1-155 [SGA score = -0.1568",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.6 : Factor w/ 6 levels ""," P-value = 0.0009274]|alleles: bet4-5007 - act1-159 [SGA score = -0.1793",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.7 : Factor w/ 5 levels ""," P-value = 0.0002055]|alleles: act1-4 - pfy1-13 [SGA score = -0.3401",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.8 : Factor w/ 4 levels ""," P-value = 0.0002784]|alleles: act1-159 - pkc1-4 [SGA score = -0.1767",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.9 : Factor w/ 3 levels ""," P-value = 0.01853]|alleles: act1-3 - pkc1-2 [SGA score = -0.2618",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.10 : Factor w/ 3 levels ""," P-value = 1.166E-9]\tNA\tBIOGRID",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ X.11 : Factor w/ 2 levels ""," P-value = 0.0001836]\tNA\tBIOGRID": 1 1 1 1 1 1 1 1 1 1 ...
## $ ordered_pairs : chr "YLR418C_YOL145C" "YLR418C_YOL145C" "YLR418C_YOR123C" "YLR418C_YOR123C" ...
How many unique biogrid PPI?
# Distinct interaction keys, in order of first appearance.
unique.ordered_pairs <- unique(tb.ppi$ordered_pairs)
# Fraction of interaction records that are distinct pairs.
length(unique.ordered_pairs) / nrow(tb.ppi)
## [1] 0.6522522
# Keep the first record of every distinct pair.
tb.ppi.unique <- tb.ppi[!duplicated(tb.ppi$ordered_pairs), ]
Identify essential and nonessential genes
# Fitness summary table; every column is read as character.
tb.fit <- read.csv(
  file = "fitness.data/SummaryRegressionHetHom20171122.csv",
  colClasses = "character"
)
How many interactions per essential gene?
First find the number of interactions per gene, then pick the essential genes
# Count how often each ORF occurs, as either interactor, among the unique pairs.
degree <- table(with(
  tb.ppi.unique,
  c(Systematic.Name.Interactor.A, Systematic.Name.Interactor.B)
))
# One row per ORF: its name (as character) and its count.
net <- as.data.frame(degree, stringsAsFactors = FALSE)
colnames(net) <- c("orf", "degree")
Match essentiality to biogrid PPI
# Attach the essentiality label of each ORF; ORFs absent from tb.fit get NA.
net$essentiality <- tb.fit$essentiality[match(net$orf, tb.fit$orf)]

# Split by essentiality. `%in%` never returns NA, so ORFs with a missing label
# are left out of both subsets. Subsetting with `==` instead turns every NA
# label into an all-NA row, which adds the same phantom rows to both tables.
# Summaries recorded from the `==` version show this as "NA's :838" and as
# correspondingly inflated Length values.
net.nonessential <- net[net$essentiality %in% "nonessential", ]
net.essential <- net[net$essentiality %in% "essential", ]
summary(net.nonessential)
## orf degree essentiality
## Length:4910 Min. : 1.00 Length:4910
## Class :character 1st Qu.: 6.00 Class :character
## Mode :character Median : 14.00 Mode :character
## Mean : 28.27
## 3rd Qu.: 30.00
## Max. :1771.00
## NA's :838
# Summarise the essential-gene subset.
print(summary(net.essential))
## orf degree essentiality
## Length:1894 Min. : 1.00 Length:1894
## Class :character 1st Qu.: 17.00 Class :character
## Mode :character Median : 35.00 Mode :character
## Mean : 52.56
## 3rd Qu.: 61.00
## Max. :2558.00
## NA's :838
No comments:
Post a Comment