bug on multiple entries of NAME-to-ORF mapping. This seems to be 8 bugs in file "SceORF_name.csv".
It seems that probably four ORFs share the same gene NAMES, probably due to merged annotations. I need to revisit this annotation problem.
The for-loop parsing of 50681 rows took about 2.5 hours on Byte, an apple laptop.
Impressive. There 4101 single-gene RLS measures?! Visual check of ORF seems to confirm this. These covered almost all the essential genes.
write.csv(tb,"rls_20150103.csv", quote=T, row.names=F )
#Visual check in Excel. All fields seem correct.
R script is: "_find_LL_20150104.R", pwd ~/0.network.aging.prj/9.ken/
######## "_find_LL_20150104.R"
setwd("~/projects/0.network.aging.prj/9.ken")
rm(list=ls())
list.files(pattern='csv')
#get sce ORF and names
tb.sce = read.csv("SceORF_name.csv",header=F, colClasses = c(rep('character',2)))
str(tb.sce)
names(tb.sce)= c('ORF','NAME')
set_sce_names = unique(c(tb.sce$ORF, tb.sce$NAME)) #this is the dictionary
tb = read.csv("rls.csv", colClasses=c("integer", rep("character",8),
rep("numeric",5), rep("character",8),
rep("numeric",5),'character',
rep('numeric',3), 'character')
)
str(tb)
tb$setRLS_vs_ref_RLS = tb$set_lifespan_mean / tb$ref_lifespan_mean
hist(tb$setRLS_vs_ref_RLS, br = 30)
summary(tb$ranksum_p)
tb$LL = ifelse( tb$ranksum_p <0.05 & tb$setRLS_vs_ref_RLS>1, 'LL', 'NLL' )
table(tb$LL)
sort( unique(toupper(tb$set_genotype[tb$LL=='LL'])) )
# need to pick up single-gene mutants from tb$set_genotype
tb$ORF = NA; tb$NAME = NA; tb$single_gene_flag = 0;
for( i in 1:length(tb[,1])) {
#for( i in 1:100) {
print (paste("i=",i))
elements = strsplit( tb$set_genotype[i], '\\s+',)
current_name = toupper(elements[[1]][1])
if (( current_name %in% set_sce_names) & (length(elements)==1 )){
if (current_name %in% tb.sce$NAME) {
tb$ORF[i] = tb.sce$ORF[tb.sce$NAME == current_name]
tb$NAME[i] = current_name;
tb$single_gene_flag[i] = 1;
} else if (current_name %in% tb.sce$ORF ) {
tb$NAME[i] = tb.sce$NAME[tb.sce$ORF == current_name]
tb$ORF[i] = current_name;
tb$single_gene_flag[i] = 1;
} else {
print ("Excuse me. NAME or ORF seems weird at line 42. tb$single_gene_flag set to -1.")
tb$single_gene_flag[i] = -1;
}
}
} #run from 12:38- about 3pm.
save.image("_20150104.RData")
table( tb$single_gene_flag)
length( unique(tb$ORF[tb$single_gene_flag == 1]) )
write.csv(tb,"rls_20150104.csv", quote=T, row.names=F )
######## end of "_find_LL_20150104.R"
No comments:
Post a Comment