Open Notebook: *** Output merged yeast GIN+PPI data merged_PPIGIN

Monday, January 20, 2014

*** Output merged yeast GIN+PPI data merged_PPIGIN_2014Jan20.tab

20171026: bug found. ORF1 and ORF2 order are not checked, so pairs may not be unique.

In 20131221.DIPGIN.sim.aging_v2.R

Move 'merged_PPIGIN_2014Jan20.tab" to "~/projects/0.ginppi.reliability.simulation/data".

# todo,
# permutation effect on aging?
# lambda ~ 1/connectivity of nodes

# 2013 Dec 20, merge DIP PPI and Genetic Inxt Net -> Multi-net approach
rm(list=ls())

#require(date)
require('flexsurv')
#source("/Users/hongqin/lib/R/lifespan.r")
source("lifespan.r")

#setwd("~/projects/0.network.aging.prj/0.ppi.reliability.simulation")
list.files(path='data', )

debug = 0;

#yeast PPI
#pairs = read.csv('data/pairs.csv', colClasses=c('character','character'))
#this yeast ppi dataset is consistent with Taiwan group's report.
dip = read.csv("data/yeastDIP.csv")
pairsPPI = dip[,c(1,2)]
pairsPPI$ORF1 = as.character(pairsPPI$ORF1)
pairsPPI$ORF2 = as.character(pairsPPI$ORF2)
pairsPPI = pairsPPI[ pairsPPI$ORF1 != pairsPPI$ORF2, ]#remove self-intxns

# yeast genetic network
#gPairs = read.csv("data/sgadata_costanzo2009_stringentCutoff_101120.csv", header=F)
gPairs = read.csv("data/sgadata_costanzo2009_lenientCutoff_101120.csv", header=F)
names(gPairs) = c("ORF1", "Name1", "ORF2", "Name2", NA, NA, NA)
gPairs$ORF1 = as.character( gPairs$ORF1 )
gPairs$ORF2 = as.character( gPairs$ORF2 )

#merge PPI and GIN
pairs = rbind(pairsPPI[,c("ORF1","ORF2")], gPairs[,c("ORF1","ORF2")])

# remove self-intxns
pairs = pairs[ pairs$ORF1 != pairs$ORF2, ]
# 96851 pairs for DIP+GIN.strigentCutoff
# 786118 for DIP+GIN.lenientCutoff

#essential gene info
essenTb = read.csv('data/SummaryRegressionHetHom2013Oct29.csv', colClasses=rep('character', 9))

#######################
# How do the two data set overlap? DIP seems to contain some questionable orfs
uniq.orf.from.pairs = unique(c(pairs$ORF1, pairs$ORF2)) #4207 ORF
matches = uniq.orf.from.pairs %in% unique(essenTb$orf)
table(matches)
#FALSE TRUE
# 720 5507

unmatchedORF = uniq.orf.from.pairs[! matches]

matches = uniq.orf.from.pairs %in% unique(essenTb$orf[essenTb$essenflag=='essential'])
table(matches)
#FALSE TRUE
#5171 1056
#This is a good coverage

matches = uniq.orf.from.pairs %in% unique(essenTb$orf[essenTb$essenflag=='nonessential'])
table(matches)
# FALSE TRUE
# 1839 4388 #this is amazingly consistent with Taiwan group's report.

#remove unmatched orfs from pairs
pairs$Removeflag = ifelse( pairs$ORF1 %in%unmatchedORF | pairs$ORF2 %in%unmatchedORF, T,F )
table(pairs$Removeflag)
# FALSE TRUE
# 572221 213897

#So, the updated DIP has 19770 intxn
#So, the DIP+GIN.lenior lead to 572221 intxns

pairs = pairs[! pairs$Removeflag, ]
table(pairs$Removeflag)
pairs = pairs[,1:2] ##This set of pairs is read for analysis

#write.table(pairs, "merged_PPIGIN_2014Jan20.tab", quote=F, row.names=F, col.names=F, sep='\t')

Open Notebook

Monday, January 20, 2014

*** Output merged yeast GIN+PPI data merged_PPIGIN_2014Jan20.tab

No comments:

Post a Comment