use network with known theoretical random permutation
this direction might be too theoretic and has very little practical importance.
This site is to serve as my note-book and to effectively communicate with my students and collaborators. Every now and then, a blog may be of interest to other researchers or teachers. Views in this blog are my own. All rights of research results and findings on this blog are reserved. See also http://youtube.com/c/hongqin @hongqin
use network with known theoretical random permutation
this direction might be too theoretic and has very little practical importance.
rm(list=ls())
start = Sys.time()
debug = 0
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
pairs= read.csv("Data/yeast.pin.csv", colClasses = c("character", "character"))
if (debug > 9) {
pairs = pairs[1E3:4E3,]
}
rls.klab = read.csv("Data/RLSset.sandbox/rls_BY4742YPD30C_20181221.csv")
rls.klab = rls.klab[,c("set_ORF", "set_lifespan_mean","set_vs_ref_RLS_ratio")]
names(rls.klab) = c("Gene", "RLS", "RLS.ratio")
rls.essen = read.csv("Data/RLSset.sandbox/RLS.essential.mean.csv")
rls.essen$RLS.ratio = 0;
names(rls.essen) = c("Gene", "RLS", "RLS.ratio")
rls.tb = rbind( rls.klab, rls.essen)
rls.tb$Gene = as.character(rls.tb$Gene)
paste("There are ", length(rls.essen[,1])/ length(rls.tb[,1]), "essential genes")
## [1] "There are 0.189597315436242 essential genes"
#summary(rls.tb$RLS.ratio)
myq.rls = quantile( rls.tb$RLS.ratio,
prob = c( 0.19, 0.3, 0.6, 0.9, 0.95, 0.96, 0.97, 0.98, 0.99, 0.995, 0.999, 1))
d.myq.rls = data.frame(myq.rls)
d.myq.rls$Q = rownames(d.myq.rls)
names(d.myq.rls) = c("RLS", "Q")
RLS2Q = function( x ){
if ( max(d.myq.rls$RLS ) > x) {
sub = d.myq.rls[ d.myq.rls$RLS >= x, ]
return ( sub$Q[ sub$RLS == min(sub$RLS)] )
} else {
return("100%")
}
}
RLS2Q(0)
## [1] "19%"
RLS2Q(0.2)
## [1] "30%"
RLS2Q(1.5)
## [1] "97%"
rls.tb$Q = unlist(lapply( rls.tb$RLS.ratio, RLS2Q))
g.rls.Q = rls.tb$Q
names(g.rls.Q) = rls.tb$Gene
pairs$cat1 = as.character( g.rls.Q[pairs$geneA] )
pairs$cat2 = as.character( g.rls.Q[pairs$geneB] )
pairs$cat1[is.na(pairs$cat1)] = "NA"
pairs$cat2[is.na(pairs$cat2)] = "NA"
pairs[ pairs$geneA == "YAL002W", ]
## geneA geneB cat1 cat2
## 233 YAL002W YAL021C NA NA
## 234 YAL002W YBR245C NA 60%
## 235 YAL002W YDL077C NA 30%
## 236 YAL002W YDR495C NA 60%
## 237 YAL002W YGL212W NA 30%
## 238 YAL002W YGR218W NA 19%
## 239 YAL002W YKL193C NA 19%
## 240 YAL002W YKL203C NA 19%
## 241 YAL002W YKR014C NA 95%
## 242 YAL002W YKR026C NA 60%
## 243 YAL002W YLR148W NA 30%
## 244 YAL002W YLR396C NA 60%
## 245 YAL002W YMR231W NA 30%
## 246 YAL002W YNL093W NA 95%
## 247 YAL002W YOR036W NA 60%
## 248 YAL002W YOR089C NA 90%
## 249 YAL002W YOR359W NA 90%
## 250 YAL002W YPL045W NA 30%
rls.tb[rls.tb$Gene=="YAL002W",] #visual check passed
## [1] Gene RLS RLS.ratio Q
## <0 rows> (or 0-length row.names)
tags = t(apply(pairs[,c("cat1", "cat2")], 1, sort))
pairs$tag = paste( tags[,1], tags[,2], sep='_')
F.obs = data.frame( table(pairs$tag))
names(F.obs) = c("tag", "freq")
F.obs [1:10,]
## tag freq
## 1 100%_19% 41
## 2 100%_30% 18
## 3 100%_60% 30
## 4 100%_90% 26
## 5 100%_95% 7
## 6 100%_97% 1
## 7 100%_98% 3
## 8 100%_NA 14
## 9 19%_19% 11068
## 10 19%_30% 5456
ms02files = list.files(path='yeastMS02')
if (debug > 0 ) {ms02files = ms02files[1: 3] }
F.ms02 = data.frame(matrix(data=NA, nrow=1, ncol=3)) #null distributions
names(F.ms02) = c('tag', 'freq', 'file')
# file = "ms02.1.csv" #debug
for (file in ms02files ){
if ( debug > 0 ) { print(file) }
ms02_pairs= read.csv(paste("yeastMS02/", file, sep=''),
colClasses = c("character", "character"))
ms02_pairs = ms02_pairs[,1:2]
ms02_pairs$cat1 = as.character( g.rls.Q[ms02_pairs$id1] )
ms02_pairs$cat2 = as.character( g.rls.Q[ms02_pairs$id2] )
ms02_pairs$cat1[is.na(ms02_pairs$cat1)] = "NA"
ms02_pairs$cat2[is.na(ms02_pairs$cat2)] = "NA"
tags2 = t(apply(ms02_pairs[,c("cat1", "cat2")], 1, sort))
ms02_pairs$tag = paste( tags2[,1], tags2[,2], sep='_')
F.ms02current = data.frame( table(ms02_pairs$tag))
F.ms02current$file = file
names(F.ms02current) = c('tag', 'freq', 'file')
F.ms02 = data.frame( rbind(F.ms02, data.frame(F.ms02current)) )
}
F.ms02 = F.ms02[ !is.na(F.ms02$tag), ]
summary(F.ms02)
## tag freq file
## Length:8309 Min. : 1 Length:8309
## Class :character 1st Qu.: 7 Class :character
## Mode :character Median : 96 Mode :character
## Mean : 1237
## 3rd Qu.: 293
## Max. :13621
allCombinationsOfTwoVectors = function (els1, els2 ) {
tagbuffer = c();
for (e1 in els1) {
for (e2 in els2) {
tmp = sort(c(e1, e2));
current_tag = paste(tmp[1], tmp[2], sep="_")
tagbuffer = c(tagbuffer, current_tag)
}
}
return( tagbuffer)
}
all_tags = unique( allCombinationsOfTwoVectors(d.myq.rls$Q, d.myq.rls$Q) )
Zs = data.frame(all_tags)
names(Zs) = c('tag')
Zs$freq = ifelse( all_tags %in% Zs$tag, F.obs$freq[ match( Zs$tag , F.obs$tag) ], 0)
Zs$freq[is.na(Zs$freq)] = 0;
summary(Zs)
## tag freq
## 100%_100%: 1 Min. : 0.00
## 100%_19% : 1 1st Qu.: 2.25
## 100%_30% : 1 Median : 45.00
## 100%_60% : 1 Mean : 1078.13
## 100%_90% : 1 3rd Qu.: 237.50
## 100%_95% : 1 Max. :11832.00
## (Other) :72
for (i in 1 : length(Zs$tag)) {
#i = 2
sub = F.ms02[ F.ms02$tag == Zs$tag[i], ]
if ( length(sub[,1])> 0) {
Zs$Z[i] = ( Zs$freq[i] - mean(sub$freq) ) / max(sd(sub$freq), 0.5)
if(debug>1 ){
print( paste( Zs$tag[i],"lenthg(sub[,1]):",length(sub[,1]), "mean:", mean(sub$freq), "sd:", sd(sub$freq) ))
sub
}
} else {
#Zs$Z[i] = ( Zs$freq[i] - 0 ) / 1E-10 #never observed in ms02 nulls?? what to do??
Zs$Z[i] = 999 #never observed in ms02 nulls?? what to do??
if(debug>0 ){
print( paste( Zs$tag[i],"lenthg(sub[,1]):",length(sub[,1]), "mean:", mean(sub$freq), "sd:", sd(sub$freq) ))
}
}
}
summary(Zs$Z)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -16.4052 -1.5414 -0.3094 -0.1638 1.4189 21.1630
#unique categories
cats = as.character( d.myq.rls$Q ) # do not sort this
Zmat = matrix(NA, ncol=length(cats), nrow=length(cats))
colnames(Zmat) = cats;
rownames(Zmat) = rev(cats);
for (i in 1:length(cats)){#row
for ( j in 1:length(cats)) { #column
tmp = sort(c(cats[i], rev(cats)[j]))
mytag = paste(tmp[1], tmp[2], sep="_")
if( mytag %in% Zs$tag) {
Zmat[i,j] = Zs$Z[ Zs$tag == mytag ]
} else {
Zmat[i,j] = NA
}
if (debug >1 ) {
print (paste(mytag, Zmat[i,j] ) )
}
}
}
Zmat
## 19% 30% 60% 90% 95% 96%
## 100% -0.3374710 -2.98414372 2.3235392 2.0608708 -4.23851323 -1.90232853
## 99.9% 1.0230630 1.45716519 -0.7505994 0.9348178 0.02110673 0.78224205
## 99.5% -0.3329221 -0.44519845 -0.2438553 -1.0608464 2.43787700 0.70660598
## 99% -1.0484267 3.42917241 -1.4274895 0.9409399 1.76150685 1.66871627
## 98% 1.3039715 0.28361270 -0.3089387 -0.9359136 1.60508218 -2.07010524
## 97% -1.7682671 0.03234569 -1.3554500 -1.5043696 -0.27568083 -0.04170095
## 96% -0.4962805 -0.78314504 2.4161518 -1.9884977 -0.31448341 -0.81163759
## 95% 2.9171375 -0.82933909 -1.5537491 0.3403722 0.65086870 -0.31448341
## 90% -2.4000000 -0.91854343 -0.2736136 -1.7155966 0.34037218 -1.98849771
## 60% -2.3334640 -1.88705683 -1.6129650 -0.2736136 -1.55374914 2.41615177
## 30% -2.0000000 -2.16000000 -1.8870568 -0.9185434 -0.82933909 -0.78314504
## 19% -2.0000000 -2.00000000 -2.3334640 -2.4000000 2.91713746 -0.49628053
## 97% 98% 99% 99.5% 99.9%
## 100% -2.07716272 -4.2478001 -16.4051980 -14.37641566 -11.30265608
## 99.9% 1.22857918 -0.3098392 3.6972350 3.68534922 3.48007596
## 99.5% -0.02549391 1.9143634 8.1611295 3.70626606 3.68534922
## 99% 1.93606068 3.1689002 4.5016691 8.16112949 3.69723500
## 98% -1.28504475 -0.2397023 3.1689002 1.91436341 -0.30983922
## 97% -1.13304432 -1.2850447 1.9360607 -0.02549391 1.22857918
## 96% -0.04170095 -2.0701052 1.6687163 0.70660598 0.78224205
## 95% -0.27568083 1.6050822 1.7615068 2.43787700 0.02110673
## 90% -1.50436956 -0.9359136 0.9409399 -1.06084640 0.93481777
## 60% -1.35544999 -0.3089387 -1.4274895 -0.24385530 -0.75059938
## 30% 0.03234569 0.2836127 3.4291724 -0.44519845 1.45716519
## 19% -1.76826710 1.3039715 -1.0484267 -0.33292209 1.02306298
## 100%
## 100% 21.163024
## 99.9% -11.302656
## 99.5% -14.376416
## 99% -16.405198
## 98% -4.247800
## 97% -2.077163
## 96% -1.902329
## 95% -4.238513
## 90% 2.060871
## 60% 2.323539
## 30% -2.984144
## 19% -0.337471
Boundary = 5
for( i in 1:length(Zmat[1,])) {
for( j in 1:length(Zmat[,1])){
Zmat[i,j] = ifelse( Zmat[i,j] > Boundary , Boundary, Zmat[i,j] )
Zmat[i,j] = ifelse( Zmat[i,j] < -Boundary, -Boundary, Zmat[i,j] )
}
}
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
#colors = c(seq(min(Zmat),-10.1,length=100),seq(-9.9,9.9,length=100),seq(10.1,max(Zmat),length=100))
my_palette <- colorRampPalette(c("blue2", "white", "red2"))(n = 99)
heatmap.2( as.matrix(Zmat), col=my_palette, scale="none",
# margins = c(5,4), key.title = NA,
,key.xlab="Z-score", key.ylab=NA,
dendrogram='none', Rowv=FALSE, Colv=FALSE,trace='none'
)
end = Sys.time()
runtime = end - start
print(runtime)
## Time difference of 7.280587 mins