Sunday, January 4, 2015

parse single mutant rows in rls.csv in R

Parse rls.csv in R to pick single-gene mutant rows.

bug on multiple entries of NAME-to-ORF mapping. This seems to be 8 bugs in file "SceORF_name.csv".



It seems that probably four ORFs share the same gene NAMES, probably due to merged annotations.  I need to revisit this annotation problem. 


The for-loop parsing of 50681 rows took about 2.5 hours on Byte, an apple laptop. 


Impressive. There 4101 single-gene RLS measures?! Visual check of ORF seems to confirm this. These covered almost all the essential genes. 

write.csv(tb,"rls_20150103.csv", quote=T, row.names=F )
#Visual check in Excel. All fields seem correct. 

R script is: "_find_LL_20150104.R", pwd ~/0.network.aging.prj/9.ken/

########  "_find_LL_20150104.R"
setwd("~/projects/0.network.aging.prj/9.ken")
rm(list=ls())

list.files(pattern='csv')

#get sce ORF and names
tb.sce = read.csv("SceORF_name.csv",header=F, colClasses = c(rep('character',2)))
str(tb.sce)
names(tb.sce)= c('ORF','NAME')
set_sce_names = unique(c(tb.sce$ORF, tb.sce$NAME)) #this is the dictionary


tb = read.csv("rls.csv", colClasses=c("integer", rep("character",8),
                                      rep("numeric",5), rep("character",8),
                                      rep("numeric",5),'character',
                                      rep('numeric',3), 'character')
              )
str(tb)

tb$setRLS_vs_ref_RLS = tb$set_lifespan_mean / tb$ref_lifespan_mean
hist(tb$setRLS_vs_ref_RLS, br = 30)
summary(tb$ranksum_p)
tb$LL = ifelse( tb$ranksum_p <0.05 & tb$setRLS_vs_ref_RLS>1, 'LL', 'NLL' )
table(tb$LL)

sort( unique(toupper(tb$set_genotype[tb$LL=='LL'])) ) 

# need to pick up single-gene mutants from tb$set_genotype 
tb$ORF = NA; tb$NAME = NA; tb$single_gene_flag = 0;
for( i in 1:length(tb[,1])) { 
#for( i in 1:100) {
  print (paste("i=",i))
  elements = strsplit(  tb$set_genotype[i], '\\s+',)
  current_name = toupper(elements[[1]][1])
  if (( current_name %in% set_sce_names) & (length(elements)==1 )){
    if (current_name %in% tb.sce$NAME) {
      tb$ORF[i] = tb.sce$ORF[tb.sce$NAME == current_name]
      tb$NAME[i] = current_name;
      tb$single_gene_flag[i] = 1;
    } else if (current_name %in% tb.sce$ORF ) {
      tb$NAME[i] = tb.sce$NAME[tb.sce$ORF == current_name]
      tb$ORF[i] = current_name;
      tb$single_gene_flag[i] = 1;  
    } else {
      print ("Excuse me. NAME or ORF seems weird at line 42. tb$single_gene_flag set to -1.")
      tb$single_gene_flag[i] = -1;
    }     
  }
} #run from 12:38- about 3pm. 

save.image("_20150104.RData")
table( tb$single_gene_flag)
length( unique(tb$ORF[tb$single_gene_flag == 1]) )

write.csv(tb,"rls_20150104.csv", quote=T, row.names=F )
######## end of  "_find_LL_20150104.R"














No comments:

Post a Comment