Tuesday, December 17, 2013

PPI aging simulation, mactower

I logmein into mactower, run the simulation, and then use Dropbox to send back the results.

20131216.ppi.sim.aging_v2_helen.R #for cutoff=4 for degree
20131216.ppi.sim.aging_v2_helen_cutoff5.R


# todo, 
# permutation effect on aging? 
# lambda ~ 1/connectivity of nodes

rm(list=ls())

require('flexsurv')
#source("/Users/hongqin/lib/R/lifespan.r")
source("lifespan.r")

#setwd("~/projects/0.network.aging.prj/0.ppi.reliability.simulation")
list.files(path='data', )

debug = 0; 

#yeast PPI
pairs = read.csv('data/pairs.csv', colClasses=c('character','character'))
#this yeast ppi dataset is consistent with Taiwan group's report.

#essential gene info
essenTb = read.csv('data/SummaryRegressionHetHom2013Oct29.csv', colClasses=rep('character', 9))

#######################
# How do the two data set overlap? DIP seems to contain some questionable orfs
uniq.orf.from.pairs = unique(c(pairs$ORF1, pairs$ORF2)) 
matches = uniq.orf.from.pairs %in% unique(essenTb$orf)
table(matches)
#FALSE  TRUE 
# 261  4217
# YDL026W dubious, YAR009C transposable element
dubiousORF = uniq.orf.from.pairs[! matches]

matches = uniq.orf.from.pairs %in% unique(essenTb$orf[essenTb$essenflag=='essential'])
table(matches)
#FALSE  TRUE 
#3506   972  #this is amazingly consistent with the Taiwan group's report.

matches = uniq.orf.from.pairs %in% unique(essenTb$orf[essenTb$essenflag=='nonessential'])
table(matches)
# FALSE  TRUE 
# 1287  3191  #this is amazingly consistent with Taiwan group's report.

#remove dubious orfs from PPI
pairs$Removeflag = ifelse( pairs$ORF1 %in%dubiousORF | pairs$ORF2 %in%dubiousORF, T,F   )
table(pairs$Removeflag)
#FALSE  TRUE 
#12180  1487 

pairs = pairs[! pairs$Removeflag, ]
table(pairs$Removeflag)
pairs = pairs[,1:2]  ##This set of pairs is read for analysis

###############################
# label essential nodes, remove nonesse-nonessen pairs
essentialORFs = essenTb$orf[essenTb$essenflag=='essential']
pairs$essen1 = pairs$ORF1 %in% essentialORFs
pairs$essen2 = pairs$ORF2 %in% essentialORFs

head(pairs)

#remove nonessen <-> nonessen intxn because they do not affect aging. 
pairs$remove = ifelse( pairs$essen1==F & pairs$essen2==F, T, F  )
pairs= pairs[! pairs$remove,1:4 ]  #only 6279 intxn left

#remove self-intxn, just to make sure
pairs = pairs[ pairs$ORF1 != pairs$ORF2, ]

#how many essen <--> essen intxn? 
pairs$inxnEE = pairs$essen1 & pairs$essen2
table(pairs$inxnEE)
# FALSE  TRUE 
# 4521  1758 

#How many essen genes? 
tmp = essentialORFs %in% unique(c(pairs$ORF1, pairs$ORF2)) 
table(tmp)
#FALSE  TRUE 
#194   958  #So, intxn among essential genes appear to be suppressed. 

essentialORFsPPI = essentialORFs[tmp]

#get connectivities per node
degreeTb = data.frame( table(c(pairs[,1], pairs[,2])))
summary(degreeTb)
degreeTb$ORF = as.character( degreeTb[,1])

degreeCutoff = 4; #######!!!!!!
tmp = essentialORFsPPI %in% degreeTb$ORF[degreeTb$Freq>degreeCutoff]
GooddEssentialORFsPPI = essentialORFsPPI[tmp]

###########################

if(debug >= 5){GooddEssentialORFsPPI = GooddEssentialORFsPPI[1:100]  }

###########################
# simulate aging
# -> exponential age to all pairs
# -> maximal age for each essential gene
# -> minimal age for all essential modules

set.seed(2013)

lambda_v = 1 / 3^seq(2,5)
#lambda_v = 1 / s

#p_v = seq(0.1, 1.0, by=0.1)  ; #the chance that each gene interaction is active at t=0
p_v = c(0.7, 0.7, 0.7,  0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0, 1.0)  ; #the chance that each gene interaction is active at t=0

sim_names = c( "degreeCutoff","p", "lambda", "meanLS", "medianLS", "R","G", "GompAIC", "WeibAIC")
sim       = t( c(NA,     NA,   NA,       NA,       NA,        NA,  NA,   NA,      NA))
sim = data.frame(sim)
names(sim) = sim_names


################################## network simulations
# for debug, lambda = 1/10
# lambda = 1/35; p=0.9

for(lambda in lambda_v) {
  # p=0.9, #for debug
  for( p in p_v) {  
    popSize = 100 #too small pop size and too small p can lead to very few living individuals
    popAges = numeric(popSize)
    time1 = date()
    for( j in 1:popSize) {
      runningORFs = GooddEssentialORFsPPI  
      
      #I could add stochasticity into pairs here.  
      pairs$active = runif(length(pairs[,1]))
      # tmp = pairs$active > 1-p
      # table(tmp) / length(tmp)  ; #double-check, very good. 
      
      ModuleTb = data.frame(runningORFs)
      pairs$age = rexp( length(pairs[,1]), rate=lambda )  #exponential ages for pairs
      pairs$age = ifelse(pairs$active > (1-p), pairs$age, NA )
      
      #loop every essential genes to identify the module age
      for (i in 1:length(runningORFs)) {
        myORF = runningORFs[i]
        pos1 = grep(myORF, pairs$ORF1)
        pos2 = grep(myORF, pairs$ORF2)
        if( length( c(pos1,pos2))>=1 ) {
          ModuleTb$age.m[i] = max( pairs$age[c(pos1,pos2)], na.rm=T )   #maximal intxn age -> module age
        } else {
          ModuleTb$age.m[i] = NA; 
        }
      }
      #head(ModuleTb); 
      summary(ModuleTb)
      ModuleTb$age.m[ ModuleTb$age.m== -Inf] = 0; #dead births occur when links are not active
      currentNetworkAge = min(ModuleTb$age.m)
      popAges[j] = currentNetworkAge      
    }# end of j loop, population loop
    
    time2 = date()
    hist(popAges)
    summary(popAges)
    popAges = popAges[popAges>0]; #remove dead-births, which can occur when p is low
    
    #time1; time2; 
    #s.tb = calculate.s ( popAges )
    #plot( s.tb$s ~ s.tb$t ) 
    #plot( s.tb$s ~ s.tb$t, type='l', log='x' ) 
        
    lifespanGomp = flexsurvreg(formula = Surv(popAges) ~ 1, dist = 'gompertz') ### Use the flexsurvreg package to fit lifespan data to gompertz or weibull distribution
    lifespanWeib = flexsurvreg(formula = Surv(popAges) ~ 1, dist = 'weibull')  
    c(lifespanWeib$AIC, lifespanGomp$AIC, lifespanWeib$AIC - lifespanGomp$AIC )
    sOject = Surv(popAges)
    
    #sim_names = c( "cutoff","p", "lambda", "meanLS", "medianLS", "R","G", "GompAIC", "WeibAIC")
    sim = rbind(sim, c( degreeCutoff, p, lambda, mean(popAges), median(popAges), 
                        lifespanGomp$res[1,1], lifespanGomp$res[2,1], lifespanGomp$AIC, lifespanWeib$AIC))
    
  } # end of p-loop
  
  write.csv(sim, file="scePPIaging_sim_20131217_v2_tmp.csv", row.names=F)
  
} #end of lambda loop

#conditions$avgLS[r] = mean(lifespansTemp)
#conditions$medianLS[r] = median(lifespansTemp)
#conditions$gompShape[r] = lifespanGomp$res[1,1]
#conditions$gompRate[r] = lifespanGomp$res[2,1]
#conditions$gompLogLik[r] = lifespanGomp$loglik
#conditions$gompAIC[r] = lifespanGomp$AIC
#conditions$weibShape[r] = lifespanWeib$res[1,1]
#conditions$weibScale[r] = lifespanWeib$res[2,1]
#conditions$weibLogLik[r] = lifespanWeib$loglik
#conditions$weibAIC[r] = lifespanWeib$AIC    

write.csv(sim, file="scePPIaging_sim_20131217_v2_end.csv", row.names=F)

#quite('yes')



No comments:

Post a Comment