Open Notebook: parse single mutant rows in rls.csv in R

Sunday, January 4, 2015

parse single mutant rows in rls.csv in R

Parse rls.csv in R to pick single-gene mutant rows.

bug on multiple entries of NAME-to-ORF mapping. This seems to be 8 bugs in file "SceORF_name.csv".

It seems that probably four ORFs share the same gene NAMES, probably due to merged annotations. I need to revisit this annotation problem.

The for-loop parsing of 50681 rows took about 2.5 hours on Byte, an apple laptop.

Impressive. There 4101 single-gene RLS measures?! Visual check of ORF seems to confirm this. These covered almost all the essential genes.

write.csv(tb,"rls_20150103.csv", quote=T, row.names=F )

#Visual check in Excel. All fields seem correct.

R script is: "_find_LL_20150104.R", pwd ~/0.network.aging.prj/9.ken/

######## "_find_LL_20150104.R"

setwd("~/projects/0.network.aging.prj/9.ken")

rm(list=ls())

list.files(pattern='csv')

#get sce ORF and names

tb.sce = read.csv("SceORF_name.csv",header=F, colClasses = c(rep('character',2)))

str(tb.sce)

names(tb.sce)= c('ORF','NAME')

set_sce_names = unique(c(tb.sce$ORF, tb.sce$NAME)) #this is the dictionary

tb = read.csv("rls.csv", colClasses=c("integer", rep("character",8),

rep("numeric",5), rep("character",8),

rep("numeric",5),'character',

rep('numeric',3), 'character')

)

str(tb)

tb$setRLS_vs_ref_RLS = tb$set_lifespan_mean / tb$ref_lifespan_mean

hist(tb$setRLS_vs_ref_RLS, br = 30)

summary(tb$ranksum_p)

tb$LL = ifelse( tb$ranksum_p <0.05 & tb$setRLS_vs_ref_RLS>1, 'LL', 'NLL' )

table(tb$LL)

sort( unique(toupper(tb$set_genotype[tb$LL=='LL'])) )

# need to pick up single-gene mutants from tb$set_genotype

tb$ORF = NA; tb$NAME = NA; tb$single_gene_flag = 0;

for( i in 1:length(tb[,1])) {

#for( i in 1:100) {

print (paste("i=",i))

elements = strsplit( tb$set_genotype[i], '\\s+',)

current_name = toupper(elements[[1]][1])

if (( current_name %in% set_sce_names) & (length(elements)==1 )){

if (current_name %in% tb.sce$NAME) {

tb$ORF[i] = tb.sce$ORF[tb.sce$NAME == current_name]

tb$NAME[i] = current_name;

tb$single_gene_flag[i] = 1;

} else if (current_name %in% tb.sce$ORF ) {

tb$NAME[i] = tb.sce$NAME[tb.sce$ORF == current_name]

tb$ORF[i] = current_name;

tb$single_gene_flag[i] = 1;

} else {

print ("Excuse me. NAME or ORF seems weird at line 42. tb$single_gene_flag set to -1.")

tb$single_gene_flag[i] = -1;

}

} #run from 12:38- about 3pm.

save.image("_20150104.RData")

table( tb$single_gene_flag)

length( unique(tb$ORF[tb$single_gene_flag == 1]) )

write.csv(tb,"rls_20150104.csv", quote=T, row.names=F )

######## end of "_find_LL_20150104.R"

Open Notebook

Sunday, January 4, 2015

parse single mutant rows in rls.csv in R

No comments:

Post a Comment