Showing posts with label ms02star. Show all posts
Showing posts with label ms02star. Show all posts

Monday, September 20, 2021

ms02 randomness verification

 

use network with known theoretical random permutation


this direction might be too theoretic and has very little practical importance. 


Friday, December 21, 2018

rlsratio ~rlsratio, yeast PIN

Conclusion: longest RLS are highly modular (interaction among themselves). Can we use an interaction index to gauge its modularity?

rm(list=ls())
start = Sys.time()
debug = 0
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
pairs= read.csv("Data/yeast.pin.csv", colClasses = c("character", "character"))
if (debug > 9) {
  pairs = pairs[1E3:4E3,]
}

RLS data preprocessing

rls.klab = read.csv("Data/RLSset.sandbox/rls_BY4742YPD30C_20181221.csv")
rls.klab = rls.klab[,c("set_ORF", "set_lifespan_mean","set_vs_ref_RLS_ratio")]
names(rls.klab) = c("Gene", "RLS", "RLS.ratio")
rls.essen = read.csv("Data/RLSset.sandbox/RLS.essential.mean.csv")
rls.essen$RLS.ratio = 0;
names(rls.essen) = c("Gene", "RLS", "RLS.ratio")
rls.tb = rbind( rls.klab, rls.essen)
rls.tb$Gene = as.character(rls.tb$Gene)
paste("There are ", length(rls.essen[,1])/ length(rls.tb[,1]), "essential genes")
## [1] "There are  0.189597315436242 essential genes"
#summary(rls.tb$RLS.ratio)

Percentiles of RLS.ratio

myq.rls = quantile( rls.tb$RLS.ratio, 
              prob = c( 0.19, 0.3, 0.6, 0.9, 0.95, 0.96, 0.97, 0.98, 0.99, 0.995, 0.999, 1))
d.myq.rls = data.frame(myq.rls)
d.myq.rls$Q = rownames(d.myq.rls)
names(d.myq.rls) = c("RLS", "Q")

RLS2Q = function( x ){
  if ( max(d.myq.rls$RLS ) > x) {
    sub = d.myq.rls[ d.myq.rls$RLS >= x, ]
    return (  sub$Q[ sub$RLS == min(sub$RLS)] )
  } else {
    return("100%")
  }
}

RLS2Q(0)
## [1] "19%"
RLS2Q(0.2)
## [1] "30%"
RLS2Q(1.5)
## [1] "97%"
rls.tb$Q = unlist(lapply( rls.tb$RLS.ratio, RLS2Q))
g.rls.Q = rls.tb$Q
names(g.rls.Q) = rls.tb$Gene

F.obs

pairs$cat1 = as.character( g.rls.Q[pairs$geneA] )
pairs$cat2 = as.character( g.rls.Q[pairs$geneB] )
pairs$cat1[is.na(pairs$cat1)] = "NA"
pairs$cat2[is.na(pairs$cat2)] = "NA"

pairs[ pairs$geneA == "YAL002W", ] 
##       geneA   geneB cat1 cat2
## 233 YAL002W YAL021C   NA   NA
## 234 YAL002W YBR245C   NA  60%
## 235 YAL002W YDL077C   NA  30%
## 236 YAL002W YDR495C   NA  60%
## 237 YAL002W YGL212W   NA  30%
## 238 YAL002W YGR218W   NA  19%
## 239 YAL002W YKL193C   NA  19%
## 240 YAL002W YKL203C   NA  19%
## 241 YAL002W YKR014C   NA  95%
## 242 YAL002W YKR026C   NA  60%
## 243 YAL002W YLR148W   NA  30%
## 244 YAL002W YLR396C   NA  60%
## 245 YAL002W YMR231W   NA  30%
## 246 YAL002W YNL093W   NA  95%
## 247 YAL002W YOR036W   NA  60%
## 248 YAL002W YOR089C   NA  90%
## 249 YAL002W YOR359W   NA  90%
## 250 YAL002W YPL045W   NA  30%
rls.tb[rls.tb$Gene=="YAL002W",] #visual check passed
## [1] Gene      RLS       RLS.ratio Q        
## <0 rows> (or 0-length row.names)
tags = t(apply(pairs[,c("cat1", "cat2")], 1, sort))
pairs$tag = paste( tags[,1], tags[,2], sep='_')
F.obs = data.frame( table(pairs$tag))
names(F.obs) = c("tag", "freq")
F.obs [1:10,]
##         tag  freq
## 1  100%_19%    41
## 2  100%_30%    18
## 3  100%_60%    30
## 4  100%_90%    26
## 5  100%_95%     7
## 6  100%_97%     1
## 7  100%_98%     3
## 8   100%_NA    14
## 9   19%_19% 11068
## 10  19%_30%  5456
#load MS02 null networks
ms02files = list.files(path='yeastMS02')
if (debug > 0 ) {ms02files = ms02files[1: 3] }
F.ms02 = data.frame(matrix(data=NA, nrow=1, ncol=3)) #null distributions
names(F.ms02) = c('tag', 'freq', 'file')

# file = "ms02.1.csv" #debug
for (file in ms02files ){
  if ( debug > 0 ) { print(file) }
  ms02_pairs= read.csv(paste("yeastMS02/", file, sep=''),
                       colClasses = c("character", "character"))
  ms02_pairs = ms02_pairs[,1:2]
  
  ms02_pairs$cat1 = as.character( g.rls.Q[ms02_pairs$id1] )
  ms02_pairs$cat2 = as.character( g.rls.Q[ms02_pairs$id2] )
  ms02_pairs$cat1[is.na(ms02_pairs$cat1)] = "NA"
  ms02_pairs$cat2[is.na(ms02_pairs$cat2)] = "NA" 
 
  tags2 = t(apply(ms02_pairs[,c("cat1", "cat2")], 1, sort))
  ms02_pairs$tag = paste( tags2[,1], tags2[,2], sep='_')
  F.ms02current = data.frame( table(ms02_pairs$tag))
  F.ms02current$file = file
  names(F.ms02current) = c('tag', 'freq', 'file')
  F.ms02 =  data.frame( rbind(F.ms02, data.frame(F.ms02current)) )
}
F.ms02 = F.ms02[ !is.na(F.ms02$tag), ]
summary(F.ms02)
##      tag                 freq           file          
##  Length:8309        Min.   :    1   Length:8309       
##  Class :character   1st Qu.:    7   Class :character  
##  Mode  :character   Median :   96   Mode  :character  
##                     Mean   : 1237                     
##                     3rd Qu.:  293                     
##                     Max.   :13621
allCombinationsOfTwoVectors = function (els1, els2 ) {
  tagbuffer = c();
  for (e1 in els1) {
    for (e2 in els2) {
         tmp = sort(c(e1, e2));
         current_tag = paste(tmp[1], tmp[2], sep="_")
         tagbuffer = c(tagbuffer, current_tag)
    }
  }
  return( tagbuffer)
}

Initialize the Z-score matrix

all_tags = unique( allCombinationsOfTwoVectors(d.myq.rls$Q, d.myq.rls$Q) )
Zs = data.frame(all_tags)
names(Zs) = c('tag')
Zs$freq = ifelse( all_tags %in% Zs$tag, F.obs$freq[ match( Zs$tag , F.obs$tag) ], 0)
Zs$freq[is.na(Zs$freq)] = 0; 
summary(Zs)
##         tag          freq         
##  100%_100%: 1   Min.   :    0.00  
##  100%_19% : 1   1st Qu.:    2.25  
##  100%_30% : 1   Median :   45.00  
##  100%_60% : 1   Mean   : 1078.13  
##  100%_90% : 1   3rd Qu.:  237.50  
##  100%_95% : 1   Max.   :11832.00  
##  (Other)  :72

calculate Z-score. This take a few minutes. Need be modified by multicore

for (i in 1 : length(Zs$tag)) {
#i = 2
  sub = F.ms02[ F.ms02$tag == Zs$tag[i], ]
  if ( length(sub[,1])> 0) {
    Zs$Z[i] = ( Zs$freq[i] - mean(sub$freq) ) / max(sd(sub$freq), 0.5)
    if(debug>1 ){
       print( paste( Zs$tag[i],"lenthg(sub[,1]):",length(sub[,1]), "mean:", mean(sub$freq), "sd:", sd(sub$freq) ))  
       sub
    } 
  } else {
    #Zs$Z[i] = ( Zs$freq[i] - 0 ) / 1E-10  #never observed in ms02 nulls?? what to do??
    Zs$Z[i] = 999  #never observed in ms02 nulls?? what to do??
    if(debug>0 ){
       print( paste( Zs$tag[i],"lenthg(sub[,1]):",length(sub[,1]), "mean:", mean(sub$freq), "sd:", sd(sub$freq) ))  
    } 
  }
}
summary(Zs$Z)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -16.4052  -1.5414  -0.3094  -0.1638   1.4189  21.1630

generate Z matrix

#unique categories
cats =  as.character( d.myq.rls$Q ) # do not sort this
Zmat = matrix(NA, ncol=length(cats), nrow=length(cats))
colnames(Zmat) = cats; 
rownames(Zmat) = rev(cats); 
for (i in 1:length(cats)){#row
  for (  j in 1:length(cats)) { #column
    tmp = sort(c(cats[i], rev(cats)[j]))
    mytag = paste(tmp[1], tmp[2], sep="_")
    if( mytag %in% Zs$tag) {
       Zmat[i,j] = Zs$Z[ Zs$tag == mytag ]
    } else {
       Zmat[i,j] = NA
    }
    if (debug >1 ) {
      print (paste(mytag, Zmat[i,j] ) )
    }
  }
}
Zmat
##              19%         30%        60%        90%         95%         96%
## 100%  -0.3374710 -2.98414372  2.3235392  2.0608708 -4.23851323 -1.90232853
## 99.9%  1.0230630  1.45716519 -0.7505994  0.9348178  0.02110673  0.78224205
## 99.5% -0.3329221 -0.44519845 -0.2438553 -1.0608464  2.43787700  0.70660598
## 99%   -1.0484267  3.42917241 -1.4274895  0.9409399  1.76150685  1.66871627
## 98%    1.3039715  0.28361270 -0.3089387 -0.9359136  1.60508218 -2.07010524
## 97%   -1.7682671  0.03234569 -1.3554500 -1.5043696 -0.27568083 -0.04170095
## 96%   -0.4962805 -0.78314504  2.4161518 -1.9884977 -0.31448341 -0.81163759
## 95%    2.9171375 -0.82933909 -1.5537491  0.3403722  0.65086870 -0.31448341
## 90%   -2.4000000 -0.91854343 -0.2736136 -1.7155966  0.34037218 -1.98849771
## 60%   -2.3334640 -1.88705683 -1.6129650 -0.2736136 -1.55374914  2.41615177
## 30%   -2.0000000 -2.16000000 -1.8870568 -0.9185434 -0.82933909 -0.78314504
## 19%   -2.0000000 -2.00000000 -2.3334640 -2.4000000  2.91713746 -0.49628053
##               97%        98%         99%        99.5%        99.9%
## 100%  -2.07716272 -4.2478001 -16.4051980 -14.37641566 -11.30265608
## 99.9%  1.22857918 -0.3098392   3.6972350   3.68534922   3.48007596
## 99.5% -0.02549391  1.9143634   8.1611295   3.70626606   3.68534922
## 99%    1.93606068  3.1689002   4.5016691   8.16112949   3.69723500
## 98%   -1.28504475 -0.2397023   3.1689002   1.91436341  -0.30983922
## 97%   -1.13304432 -1.2850447   1.9360607  -0.02549391   1.22857918
## 96%   -0.04170095 -2.0701052   1.6687163   0.70660598   0.78224205
## 95%   -0.27568083  1.6050822   1.7615068   2.43787700   0.02110673
## 90%   -1.50436956 -0.9359136   0.9409399  -1.06084640   0.93481777
## 60%   -1.35544999 -0.3089387  -1.4274895  -0.24385530  -0.75059938
## 30%    0.03234569  0.2836127   3.4291724  -0.44519845   1.45716519
## 19%   -1.76826710  1.3039715  -1.0484267  -0.33292209   1.02306298
##             100%
## 100%   21.163024
## 99.9% -11.302656
## 99.5% -14.376416
## 99%   -16.405198
## 98%    -4.247800
## 97%    -2.077163
## 96%    -1.902329
## 95%    -4.238513
## 90%     2.060871
## 60%     2.323539
## 30%    -2.984144
## 19%    -0.337471
#heatmap
Boundary = 5
for( i in 1:length(Zmat[1,])) {
 for( j in 1:length(Zmat[,1])){
   Zmat[i,j] =  ifelse( Zmat[i,j] > Boundary  , Boundary, Zmat[i,j] )
   Zmat[i,j] =  ifelse( Zmat[i,j] < -Boundary, -Boundary, Zmat[i,j] )
 }
}
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
#colors = c(seq(min(Zmat),-10.1,length=100),seq(-9.9,9.9,length=100),seq(10.1,max(Zmat),length=100))
my_palette <- colorRampPalette(c("blue2", "white", "red2"))(n = 99)

heatmap.2( as.matrix(Zmat), col=my_palette, scale="none",
          # margins = c(5,4), key.title = NA, 
           ,key.xlab="Z-score", key.ylab=NA,
          dendrogram='none', Rowv=FALSE, Colv=FALSE,trace='none'
          )
end = Sys.time()
runtime = end - start
print(runtime)
## Time difference of 7.280587 mins