Showing posts with label biogrid. Show all posts
Showing posts with label biogrid. Show all posts

Monday, April 18, 2022

parse yeast PPI from biogrid 4.4.208

code at  https://github.com/QinLab/Biogrid-Qin2022 


download BIOGRID-ALL-4.4.208.tab3.zip 

Write Python code to parse out the yeast entries

# Load the full BioGRID tab3 table (tab-separated; the first row holds the column names).
myfile = 'data-large-unsynced/BIOGRID-ALL-4.4.208.tab3.txt'
df = pd.read_csv(myfile, sep='\t', header=0)

# Keep only the rows whose organism name contains 'Saccharomyces cerevisiae'
# for interactor A and then for interactor B.
for organism_col in ('Organism Name Interactor A', 'Organism Name Interactor B'):
    is_yeast = df[organism_col].str.contains('Saccharomyces cerevisiae')
    df = df[is_yeast]

Remove duplicated  interactions

def alphabetic_ordered_tag(in_tag1, in_tag2):
    """Build an order-independent key for a pair of interactor names.

    Both names are converted to strings, sorted alphabetically and joined
    with "_", so (A, B) and (B, A) produce the same tag. The previous
    version sorted a temporary list but then returned the names in their
    original order, so reversed pairs were never recognised as duplicates.

    Args:
        in_tag1: Name of the first interactor (any value; converted with str()).
        in_tag2: Name of the second interactor (any value; converted with str()).

    Returns:
        str: "<smaller>_<larger>" in alphabetical (string) order.
    """
    first, second = sorted((str(in_tag1), str(in_tag2)))
    return first + "_" + second
# Tag every interaction row with its pair key, then keep one row per key
# (drop_duplicates keeps the first occurrence by default).
name_a = df['Systematic Name Interactor A']
name_b = df['Systematic Name Interactor B']
df['alphabetic_ordered_tag'] = [alphabetic_ordered_tag(a, b) for a, b in zip(name_a, name_b)]
df2 = df.drop_duplicates(subset=['alphabetic_ordered_tag'])

Output a lean version for a smaller file size

# Columns kept in the lean export: systematic names, official symbols and the pair key.
lean_columns = [
    'Systematic Name Interactor A',
    'Systematic Name Interactor B',
    'Official Symbol Interactor A',
    'Official Symbol Interactor B',
    'alphabetic_ordered_tag',
]
df3 = df2[lean_columns]
df3.to_csv("biogrid_s288c_4.4.208.lean.csv")

Output a dictionary from systematic names to symbols. 

# Collect the (systematic name, symbol) pairs from each interactor side
# under the common headers 'Name' and 'Symbol'.
dicA = df3[['Systematic Name Interactor A', 'Official Symbol Interactor A']].rename(
    columns={'Systematic Name Interactor A': 'Name', 'Official Symbol Interactor A': 'Symbol'}
)
dicB = df3[['Systematic Name Interactor B', 'Official Symbol Interactor B']].rename(
    columns={'Systematic Name Interactor B': 'Name', 'Official Symbol Interactor B': 'Symbol'}
)

# Stack both sides, drop repeated (Name, Symbol) rows and write the lookup table.
dic = pd.concat([dicA, dicB])
dic2 = dic.drop_duplicates(subset=['Name', 'Symbol'])
dic2.to_csv("Sce_Name2Symbol.csv")

A total of 627732 interactions and 6155 unique names/symbols were found for the S288C BioGRID data set. 

Note: Self-interactions were included. 









Tuesday, April 17, 2018

biogrid PPI


read my old codes:

applejack:qin.biogrid.code hqin$ pwd
/Users/hqin/data/biogrid/qin.biogrid.code


My code took 'physical' interactions from biogrid, and found only unique interactions.

Thursday, November 30, 2017

RLS pairwise difference, P-value < 1/10K using ms02 permutation

# Start from an empty workspace and run from the analysis directory.
rm(list = ls())
#setwd("~/github/0.network.aging.ms02/1.Fraser02")
setwd("/home/hqin/github/network.aging.configuration/1.Fraser02")
# Shared helpers; presumably this is where ms02_singlerun() comes from -- TODO confirm.
source("../network.r")
set.seed(2017)
debug <- 1
start_time <- Sys.time()

# Show which input files are available.
list.files("../data/")
## [1] "ken-RLS-byORF.csv"                             
## [2] "SummaryRegressionHetHomFactorized2015Oct13.csv"
## [3] "unique_biogrid_ScePPI.csv"

# Input tables: lifespan by ORF, the BioGRID interaction list, and the fitness summary.
rls <- read.csv("../data/ken-RLS-byORF.csv")
biogrid <- read.csv("../data/unique_biogrid_ScePPI.csv")
fit <- read.csv("../data/SummaryRegressionHetHomFactorized2015Oct13.csv")

# Reduce the interaction table to its two systematic-name columns, renamed ORF1/ORF2.
ppi <- setNames(
  biogrid[c("Systematic.Name.Interactor.A", "Systematic.Name.Interactor.B")],
  c("ORF1", "ORF2")
)
#' Mean absolute difference in RLS (avgLS) between the two ORFs of each pair.
#'
#' For every row of `inpairs`, the `avgLS` value of ORF1 and of ORF2 is looked
#' up in the RLS table; ORFs flagged "essential" in the fitness table get 0
#' instead. The result is the mean of |rls1 - rls2| with NA differences removed.
#'
#' Note: an ORF that is absent from the fitness table has an NA flag, so
#' ifelse() yields NA for it and that pair is dropped by `na.rm = TRUE`.
#'
#' @param inpairs Data frame with columns `ORF1` and `ORF2`.
#' @param rls_table Table with columns `ORF` and `avgLS`; defaults to the
#'   global `rls`, which keeps existing one-argument calls working.
#' @param fit_table Table with columns `orf` and `essenflag`; defaults to the
#'   global `fit`.
#' @return A single numeric value (returned visibly).
 diff.RLS <- function(inpairs, rls_table = rls, fit_table = fit) {
   # Lifespan per ORF, with essential ORFs forced to 0.
   lifespan_of <- function(orfs) {
     avg_ls <- rls_table$avgLS[match(orfs, rls_table$ORF)]
     essen <- fit_table$essenflag[match(orfs, fit_table$orf)]
     ifelse(essen == "essential", 0, avg_ls)
   }

   # TRUE (not T): T is an ordinary variable that can be reassigned.
   mean(abs(lifespan_of(inpairs$ORF1) - lifespan_of(inpairs$ORF2)), na.rm = TRUE)
 }
 # Observed statistic: mean |RLS1 - RLS2| over the real interaction pairs.
 diff.RLS.obs <- diff.RLS(ppi)
 paste( "Observed deltaRLS = ", diff.RLS.obs)
## [1] "Observed deltaRLS =  12.759632586095"

 # Null distribution: the same RLS statistic on randomized networks.
 Nsims <- 10000 # number of permutations

library(foreach)
library(doMC)
## Loading required package: iterators
## Loading required package: parallel
registerDoMC(cores = 8) # Intel i7 has 6 cores, Xeon E5-2603 @ridgeside has 8 cores

# Each iteration builds one randomized network and scores it with diff.RLS().
# foreach() without .combine returns a list with one element per iteration,
# so no preallocated result vector is needed.
# NOTE(review): the set.seed() call earlier in the script may not make the
# %dopar% workers reproducible -- TODO confirm the RNG setup for doMC.
permutated.diff.RLS <- foreach(i = seq_len(Nsims)) %dopar% {
  # Generate a new MS02 random network and keep its first two columns.
  random_pairs <- ms02_singlerun(ppi)[, 1:2]
  names(random_pairs) <- c("ORF1", "ORF2")
  diff.RLS(random_pairs)
}
p-value
# Flatten the per-iteration list returned by foreach() into one vector.
permutated.diff.RLS <- unlist(permutated.diff.RLS)

summary(permutated.diff.RLS)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   14.03   14.20   14.22   14.22   14.25   14.37

# Empirical p-value: fraction of randomized networks whose statistic is below
# the observed one. A printed 0 means none were, i.e. p < 1/Nsims.
sub <- permutated.diff.RLS[permutated.diff.RLS < diff.RLS.obs]
paste("pvalue = ", length(sub) / Nsims)
## [1] "pvalue =  0"
hist(permutated.diff.RLS)
stop_time <- Sys.time()
start_time
## [1] "2017-11-29 10:01:30 EST"
stop_time
## [1] "2017-11-29 13:04:53 EST"
# format() keeps the difftime unit; pasting the raw difference printed a bare
# number whose unit (secs/mins/hours) changes with the length of the run.
paste("running time = ", format(stop_time - start_time))
## [1] "running time =  3.056384 hours"

Wednesday, November 29, 2017

p=0.001 ridgeside, rls pairwise difference in yeast biogrid PPI


# Start from an empty workspace and run from the analysis directory.
rm(list = ls())
#setwd("~/github/0.network.aging.ms02/1.Fraser02")
setwd("/home/hqin/github/network.aging.configuration/1.Fraser02")
# Shared helpers; presumably this is where ms02_singlerun() comes from -- TODO confirm.
source("../network.r")
set.seed(2017)
debug <- 1
start_time <- Sys.time()

# Show which input files are available.
list.files("../data/")
## [1] "ken-RLS-byORF.csv"                             
## [2] "SummaryRegressionHetHomFactorized2015Oct13.csv"
## [3] "unique_biogrid_ScePPI.csv"

# Input tables: lifespan by ORF, the BioGRID interaction list, and the fitness summary.
rls <- read.csv("../data/ken-RLS-byORF.csv")
biogrid <- read.csv("../data/unique_biogrid_ScePPI.csv")
fit <- read.csv("../data/SummaryRegressionHetHomFactorized2015Oct13.csv")

# Reduce the interaction table to its two systematic-name columns, renamed ORF1/ORF2.
ppi <- setNames(
  biogrid[c("Systematic.Name.Interactor.A", "Systematic.Name.Interactor.B")],
  c("ORF1", "ORF2")
)
#' Mean absolute difference in RLS (avgLS) between the two ORFs of each pair.
#'
#' For every row of `inpairs`, the `avgLS` value of ORF1 and of ORF2 is looked
#' up in the RLS table; ORFs flagged "essential" in the fitness table get 0
#' instead. The result is the mean of |rls1 - rls2| with NA differences removed.
#'
#' Note: an ORF that is absent from the fitness table has an NA flag, so
#' ifelse() yields NA for it and that pair is dropped by `na.rm = TRUE`.
#'
#' @param inpairs Data frame with columns `ORF1` and `ORF2`.
#' @param rls_table Table with columns `ORF` and `avgLS`; defaults to the
#'   global `rls`, which keeps existing one-argument calls working.
#' @param fit_table Table with columns `orf` and `essenflag`; defaults to the
#'   global `fit`.
#' @return A single numeric value (returned visibly).
 diff.RLS <- function(inpairs, rls_table = rls, fit_table = fit) {
   # Lifespan per ORF, with essential ORFs forced to 0.
   lifespan_of <- function(orfs) {
     avg_ls <- rls_table$avgLS[match(orfs, rls_table$ORF)]
     essen <- fit_table$essenflag[match(orfs, fit_table$orf)]
     ifelse(essen == "essential", 0, avg_ls)
   }

   # TRUE (not T): T is an ordinary variable that can be reassigned.
   mean(abs(lifespan_of(inpairs$ORF1) - lifespan_of(inpairs$ORF2)), na.rm = TRUE)
 }
 # Observed statistic: mean |RLS1 - RLS2| over the real interaction pairs.
 diff.RLS.obs <- diff.RLS(ppi)
 paste( "Observed deltaRLS = ", diff.RLS.obs)
## [1] "Observed deltaRLS =  12.759632586095"

 # Null distribution: the same RLS statistic on randomized networks.
 Nsims <- 1000 # number of permutations

library(foreach)
library(doMC)
## Loading required package: iterators
## Loading required package: parallel
registerDoMC(cores = 8) # Intel i7 has 6 cores, Xeon E5-2603 @ridgeside has 8 cores

# Each iteration builds one randomized network and scores it with diff.RLS().
# foreach() without .combine returns a list with one element per iteration,
# so no preallocated result vector is needed.
# NOTE(review): the set.seed() call earlier in the script may not make the
# %dopar% workers reproducible -- TODO confirm the RNG setup for doMC.
permutated.diff.RLS <- foreach(i = seq_len(Nsims)) %dopar% {
  # Generate a new MS02 random network and keep its first two columns.
  random_pairs <- ms02_singlerun(ppi)[, 1:2]
  names(random_pairs) <- c("ORF1", "ORF2")
  diff.RLS(random_pairs)
}
p-value
# Flatten the per-iteration list returned by foreach() into one vector.
permutated.diff.RLS <- unlist(permutated.diff.RLS)

summary(permutated.diff.RLS)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   14.09   14.20   14.22   14.22   14.25   14.39

# Empirical p-value: fraction of randomized networks whose statistic is below
# the observed one. A printed 0 means none were, i.e. p < 1/Nsims.
sub <- permutated.diff.RLS[permutated.diff.RLS < diff.RLS.obs]
paste("pvalue = ", length(sub) / Nsims)
## [1] "pvalue =  0"
hist(permutated.diff.RLS)
stop_time <- Sys.time()
# format() keeps the difftime unit; pasting the raw difference printed a bare
# number whose unit (secs/mins/hours) changes with the length of the run.
paste("running time = ", format(stop_time - start_time))
## [1] "running time =  18.92678 mins"