yeast PIN RLS ratio ~ RLS ratio Z-score
H Qin
2018-12-21
rm(list=ls())
start = Sys.time()
debug = 0
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
pairs= read.csv("Data/yeast.pin.csv", colClasses = c("character", "character"))
if (debug > 9) {
pairs = pairs[1E3:4E3,]
}
RLS data preprocessing
rls.klab = read.csv("Data/RLSset.sandbox/rls_BY4742YPD30C_20181221.csv")
rls.klab = rls.klab[,c("set_ORF", "set_lifespan_mean","set_vs_ref_RLS_ratio")]
names(rls.klab) = c("Gene", "RLS", "RLS.ratio")
rls.essen = read.csv("Data/RLSset.sandbox/RLS.essential.mean.csv")
rls.essen$RLS.ratio = 0;
names(rls.essen) = c("Gene", "RLS", "RLS.ratio")
rls.tb = rbind( rls.klab, rls.essen)
rls.tb$Gene = as.character(rls.tb$Gene)
paste("There are ", length(rls.essen[,1])/ length(rls.tb[,1]), "essential genes")
## [1] "There are 0.189597315436242 essential genes"
#summary(rls.tb$RLS.ratio)
Percentiles of RLS.ratio
myq.rls = quantile( rls.tb$RLS.ratio,
prob = c( 0.19, 0.3, 0.6, 0.9, 0.95, 0.96, 0.97, 0.98, 0.99, 0.995, 0.999, 1))
d.myq.rls = data.frame(myq.rls)
d.myq.rls$Q = rownames(d.myq.rls)
names(d.myq.rls) = c("RLS", "Q")
RLS2Q = function( x ){
if ( max(d.myq.rls$RLS ) > x) {
sub = d.myq.rls[ d.myq.rls$RLS >= x, ]
return ( sub$Q[ sub$RLS == min(sub$RLS)] )
} else {
return("100%")
}
}
RLS2Q(0)
## [1] "19%"
RLS2Q(0.2)
## [1] "30%"
RLS2Q(1.5)
## [1] "97%"
rls.tb$Q = unlist(lapply( rls.tb$RLS.ratio, RLS2Q))
g.rls.Q = rls.tb$Q
names(g.rls.Q) = rls.tb$Gene
F.obs
pairs$cat1 = as.character( g.rls.Q[pairs$geneA] )
pairs$cat2 = as.character( g.rls.Q[pairs$geneB] )
pairs$cat1[is.na(pairs$cat1)] = "NA"
pairs$cat2[is.na(pairs$cat2)] = "NA"
pairs[ pairs$geneA == "YAL002W", ]
## geneA geneB cat1 cat2
## 233 YAL002W YAL021C NA NA
## 234 YAL002W YBR245C NA 60%
## 235 YAL002W YDL077C NA 30%
## 236 YAL002W YDR495C NA 60%
## 237 YAL002W YGL212W NA 30%
## 238 YAL002W YGR218W NA 19%
## 239 YAL002W YKL193C NA 19%
## 240 YAL002W YKL203C NA 19%
## 241 YAL002W YKR014C NA 95%
## 242 YAL002W YKR026C NA 60%
## 243 YAL002W YLR148W NA 30%
## 244 YAL002W YLR396C NA 60%
## 245 YAL002W YMR231W NA 30%
## 246 YAL002W YNL093W NA 95%
## 247 YAL002W YOR036W NA 60%
## 248 YAL002W YOR089C NA 90%
## 249 YAL002W YOR359W NA 90%
## 250 YAL002W YPL045W NA 30%
rls.tb[rls.tb$Gene=="YAL002W",] #visual check passed
## [1] Gene RLS RLS.ratio Q
## <0 rows> (or 0-length row.names)
tags = t(apply(pairs[,c("cat1", "cat2")], 1, sort))
pairs$tag = paste( tags[,1], tags[,2], sep='_')
F.obs = data.frame( table(pairs$tag))
names(F.obs) = c("tag", "freq")
F.obs [1:10,]
## tag freq
## 1 100%_19% 41
## 2 100%_30% 18
## 3 100%_60% 30
## 4 100%_90% 26
## 5 100%_95% 7
## 6 100%_97% 1
## 7 100%_98% 3
## 8 100%_NA 14
## 9 19%_19% 11068
## 10 19%_30% 5456
#load MS02 null networks
ms02files = list.files(path='yeastMS02')
if (debug > 0 ) {ms02files = ms02files[1: 3] }
F.ms02 = data.frame(matrix(data=NA, nrow=1, ncol=3)) #null distributions
names(F.ms02) = c('tag', 'freq', 'file')
# file = "ms02.1.csv" #debug
for (file in ms02files ){
if ( debug > 0 ) { print(file) }
ms02_pairs= read.csv(paste("yeastMS02/", file, sep=''),
colClasses = c("character", "character"))
ms02_pairs = ms02_pairs[,1:2]
ms02_pairs$cat1 = as.character( g.rls.Q[ms02_pairs$id1] )
ms02_pairs$cat2 = as.character( g.rls.Q[ms02_pairs$id2] )
ms02_pairs$cat1[is.na(ms02_pairs$cat1)] = "NA"
ms02_pairs$cat2[is.na(ms02_pairs$cat2)] = "NA"
tags2 = t(apply(ms02_pairs[,c("cat1", "cat2")], 1, sort))
ms02_pairs$tag = paste( tags2[,1], tags2[,2], sep='_')
F.ms02current = data.frame( table(ms02_pairs$tag))
F.ms02current$file = file
names(F.ms02current) = c('tag', 'freq', 'file')
F.ms02 = data.frame( rbind(F.ms02, data.frame(F.ms02current)) )
}
F.ms02 = F.ms02[ !is.na(F.ms02$tag), ]
summary(F.ms02)
## tag freq file
## Length:8309 Min. : 1 Length:8309
## Class :character 1st Qu.: 7 Class :character
## Mode :character Median : 96 Mode :character
## Mean : 1237
## 3rd Qu.: 293
## Max. :13621
allCombinationsOfTwoVectors = function (els1, els2 ) {
tagbuffer = c();
for (e1 in els1) {
for (e2 in els2) {
tmp = sort(c(e1, e2));
current_tag = paste(tmp[1], tmp[2], sep="_")
tagbuffer = c(tagbuffer, current_tag)
}
}
return( tagbuffer)
}
Initialize the Z-score matrix
all_tags = unique( allCombinationsOfTwoVectors(d.myq.rls$Q, d.myq.rls$Q) )
Zs = data.frame(all_tags)
names(Zs) = c('tag')
Zs$freq = ifelse( all_tags %in% Zs$tag, F.obs$freq[ match( Zs$tag , F.obs$tag) ], 0)
Zs$freq[is.na(Zs$freq)] = 0;
summary(Zs)
## tag freq
## 100%_100%: 1 Min. : 0.00
## 100%_19% : 1 1st Qu.: 2.25
## 100%_30% : 1 Median : 45.00
## 100%_60% : 1 Mean : 1078.13
## 100%_90% : 1 3rd Qu.: 237.50
## 100%_95% : 1 Max. :11832.00
## (Other) :72
calculate Z-score. This take a few minutes. Need be modified by multicore
for (i in 1 : length(Zs$tag)) {
#i = 2
sub = F.ms02[ F.ms02$tag == Zs$tag[i], ]
if ( length(sub[,1])> 0) {
Zs$Z[i] = ( Zs$freq[i] - mean(sub$freq) ) / max(sd(sub$freq), 0.5)
if(debug>1 ){
print( paste( Zs$tag[i],"lenthg(sub[,1]):",length(sub[,1]), "mean:", mean(sub$freq), "sd:", sd(sub$freq) ))
sub
}
} else {
#Zs$Z[i] = ( Zs$freq[i] - 0 ) / 1E-10 #never observed in ms02 nulls?? what to do??
Zs$Z[i] = 999 #never observed in ms02 nulls?? what to do??
if(debug>0 ){
print( paste( Zs$tag[i],"lenthg(sub[,1]):",length(sub[,1]), "mean:", mean(sub$freq), "sd:", sd(sub$freq) ))
}
}
}
summary(Zs$Z)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -16.4052 -1.5414 -0.3094 -0.1638 1.4189 21.1630
generate Z matrix
#unique categories
cats = as.character( d.myq.rls$Q ) # do not sort this
Zmat = matrix(NA, ncol=length(cats), nrow=length(cats))
colnames(Zmat) = cats;
rownames(Zmat) = rev(cats);
for (i in 1:length(cats)){#row
for ( j in 1:length(cats)) { #column
tmp = sort(c(cats[i], rev(cats)[j]))
mytag = paste(tmp[1], tmp[2], sep="_")
if( mytag %in% Zs$tag) {
Zmat[i,j] = Zs$Z[ Zs$tag == mytag ]
} else {
Zmat[i,j] = NA
}
if (debug >1 ) {
print (paste(mytag, Zmat[i,j] ) )
}
}
}
Zmat
## 19% 30% 60% 90% 95% 96%
## 100% -0.3374710 -2.98414372 2.3235392 2.0608708 -4.23851323 -1.90232853
## 99.9% 1.0230630 1.45716519 -0.7505994 0.9348178 0.02110673 0.78224205
## 99.5% -0.3329221 -0.44519845 -0.2438553 -1.0608464 2.43787700 0.70660598
## 99% -1.0484267 3.42917241 -1.4274895 0.9409399 1.76150685 1.66871627
## 98% 1.3039715 0.28361270 -0.3089387 -0.9359136 1.60508218 -2.07010524
## 97% -1.7682671 0.03234569 -1.3554500 -1.5043696 -0.27568083 -0.04170095
## 96% -0.4962805 -0.78314504 2.4161518 -1.9884977 -0.31448341 -0.81163759
## 95% 2.9171375 -0.82933909 -1.5537491 0.3403722 0.65086870 -0.31448341
## 90% -2.4000000 -0.91854343 -0.2736136 -1.7155966 0.34037218 -1.98849771
## 60% -2.3334640 -1.88705683 -1.6129650 -0.2736136 -1.55374914 2.41615177
## 30% -2.0000000 -2.16000000 -1.8870568 -0.9185434 -0.82933909 -0.78314504
## 19% -2.0000000 -2.00000000 -2.3334640 -2.4000000 2.91713746 -0.49628053
## 97% 98% 99% 99.5% 99.9%
## 100% -2.07716272 -4.2478001 -16.4051980 -14.37641566 -11.30265608
## 99.9% 1.22857918 -0.3098392 3.6972350 3.68534922 3.48007596
## 99.5% -0.02549391 1.9143634 8.1611295 3.70626606 3.68534922
## 99% 1.93606068 3.1689002 4.5016691 8.16112949 3.69723500
## 98% -1.28504475 -0.2397023 3.1689002 1.91436341 -0.30983922
## 97% -1.13304432 -1.2850447 1.9360607 -0.02549391 1.22857918
## 96% -0.04170095 -2.0701052 1.6687163 0.70660598 0.78224205
## 95% -0.27568083 1.6050822 1.7615068 2.43787700 0.02110673
## 90% -1.50436956 -0.9359136 0.9409399 -1.06084640 0.93481777
## 60% -1.35544999 -0.3089387 -1.4274895 -0.24385530 -0.75059938
## 30% 0.03234569 0.2836127 3.4291724 -0.44519845 1.45716519
## 19% -1.76826710 1.3039715 -1.0484267 -0.33292209 1.02306298
## 100%
## 100% 21.163024
## 99.9% -11.302656
## 99.5% -14.376416
## 99% -16.405198
## 98% -4.247800
## 97% -2.077163
## 96% -1.902329
## 95% -4.238513
## 90% 2.060871
## 60% 2.323539
## 30% -2.984144
## 19% -0.337471
#heatmap
Boundary = 5
for( i in 1:length(Zmat[1,])) {
for( j in 1:length(Zmat[,1])){
Zmat[i,j] = ifelse( Zmat[i,j] > Boundary , Boundary, Zmat[i,j] )
Zmat[i,j] = ifelse( Zmat[i,j] < -Boundary, -Boundary, Zmat[i,j] )
}
}
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
#colors = c(seq(min(Zmat),-10.1,length=100),seq(-9.9,9.9,length=100),seq(10.1,max(Zmat),length=100))
my_palette <- colorRampPalette(c("blue2", "white", "red2"))(n = 99)
heatmap.2( as.matrix(Zmat), col=my_palette, scale="none",
# margins = c(5,4), key.title = NA,
,key.xlab="Z-score", key.ylab=NA,
dendrogram='none', Rowv=FALSE, Colv=FALSE,trace='none'
)
end = Sys.time()
runtime = end - start
print(runtime)
## Time difference of 7.280587 mins
No comments:
Post a Comment