Wednesday, November 22, 2017

yeast growth fitness analysis, essentiality

applejack:0.yeast.growth.fitness hqin$ ll *Rmd

-rw-r--r--  1 hqin  staff   3.9K Nov 22 22:24 analyze_growth_fitness.Rmd
Based on 20131028 code and its 20151012 corrections of comma in gene names. For deletion fitness data, download from http://www-deletion.stanford.edu/YDPM/YDPM_index.html
“In the Deletion Project, strains from the deletion collection were monitored under 9 different media conditions selected for the study of mitochondrial function. 5791 heterozygous diploid and 4706 homozygous diploid deletion strains were monitored in parallel using molecular barcodes on fermentable (YPD, YPDGE) and non-fermentable substrates (YPG, YPE, YPL). The YDPM database contains both the raw data and growth rates calculated for each strain in each media condition. Strains can be searched by ORF or Gene name to access growth measurements and data plots for each strain.”
What are the genotypes in the 4 yeast fitness files?
# Set up the workspace and load the four YDPM regression tables
# (Tc1/Tc2, heterozygous "het" and homozygous "hom" deletion strains).
# NOTE(review): rm(list = ls()) and setwd() are kept so the script behaves as
# before, but they tie it to one machine layout -- consider removing them.
rm(list = ls())
setwd("~/github/yeast.growth.fitness")
datapath <- "data"
my.files <- c(
  "Regression_Tc1_het.txt",
  "Regression_Tc1_hom.txt",
  "Regression_Tc2_het.txt",
  "Regression_Tc2_hom.txt"
)

# Build every path from `datapath` so the data directory is configured in one
# place (it was previously defined but bypassed by a hard-coded 'data/').
# fill = TRUE pads short rows with NA instead of failing on ragged lines.
tb1het <- read.table(file.path(datapath, my.files[1]), header = TRUE, sep = "\t", fill = TRUE)
tb2hom <- read.table(file.path(datapath, my.files[2]), header = TRUE, sep = "\t", fill = TRUE)
tb3het <- read.table(file.path(datapath, my.files[3]), header = TRUE, sep = "\t", fill = TRUE)
tb4hom <- read.table(file.path(datapath, my.files[4]), header = TRUE, sep = "\t", fill = TRUE)
Essential genes are expected to have a YPD measurement in the heterozygous tables (tb1het and tb3het), but not in the homozygous tables (tb2hom and tb4hom).
# Label every row of each table by whether its YPD column holds a value (not NA),
# then tabulate the labels. Tables 1 and 3 are heterozygous; 2 and 4 homozygous.
tb1het$YPDFlag1 <- ifelse(
  !is.na(tb1het$YPD), "HasYPDMeasurement", "noYPDMeasurement"
)
table(tb1het$YPDFlag1)
## 
## HasYPDMeasurement  noYPDMeasurement 
##              5744               174
tb3het$YPDFlag3 <- ifelse(
  !is.na(tb3het$YPD), "HasYPDMeasurement", "noYPDMeasurement"
)
table(tb3het$YPDFlag3)
## 
## HasYPDMeasurement  noYPDMeasurement 
##              5766               152
tb2hom$YPDFlag2 <- ifelse(
  !is.na(tb2hom$YPD), "HasYPDMeasurement", "noYPDMeasurement"
)
table(tb2hom$YPDFlag2)
## 
## HasYPDMeasurement  noYPDMeasurement 
##              4659              1259
tb4hom$YPDFlag4 <- ifelse(
  !is.na(tb4hom$YPD), "HasYPDMeasurement", "noYPDMeasurement"
)
table(tb4hom$YPDFlag4)
## 
## HasYPDMeasurement  noYPDMeasurement 
##              4718              1200
# Join each heterozygous table to its homozygous counterpart on the ORF id,
# then keep only the first two columns (orf and gene.x in the str() output
# below) together with every column whose name contains "Flag".
tb12 <- merge(tb1het, tb2hom, by = "orf")
tb12 <- tb12[, c(1:2, grep("Flag", names(tb12)))]

tb34 <- merge(tb3het, tb4hom, by = "orf")
tb34 <- tb34[, c(1:2, grep("Flag", names(tb34)))]

#names(tb12) = c("orf",'gene','Anno1','Growth1')
#names(tb34) = c('orf','gene','Anno2','Growth2')
str(tb12)
## 'data.frame':    5918 obs. of  4 variables:
##  $ orf     : Factor w/ 5918 levels "YAL001C","YAL002W",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ gene.x  : Factor w/ 5915 levels "37165","AAC1",..: 3133 3395 750 3424 2971 803 906 2912 1470 3425 ...
##  $ YPDFlag1: chr  "HasYPDMeasurement" "HasYPDMeasurement" "HasYPDMeasurement" "HasYPDMeasurement" ...
##  $ YPDFlag2: chr  "HasYPDMeasurement" "HasYPDMeasurement" "noYPDMeasurement" "HasYPDMeasurement" ...
str(tb34)
## 'data.frame':    5918 obs. of  4 variables:
##  $ orf     : Factor w/ 5918 levels "YAL001C","YAL002W",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ gene.x  : Factor w/ 5915 levels "37165","AAC1",..: 3133 3395 750 3424 2971 803 906 2912 1470 3425 ...
##  $ YPDFlag3: chr  "HasYPDMeasurement" "HasYPDMeasurement" "HasYPDMeasurement" "HasYPDMeasurement" ...
##  $ YPDFlag4: chr  "HasYPDMeasurement" "HasYPDMeasurement" "noYPDMeasurement" "HasYPDMeasurement" ...
# Combine the two paired tables into one table with all four flags per ORF;
# the duplicated gene.x columns get the .x/.y suffixes that merge() adds.
tb <- merge(tb12, tb34, by = "orf")
head(tb)
##       orf gene.x.x          YPDFlag1          YPDFlag2 gene.x.y
## 1 YAL001C     TFC3 HasYPDMeasurement HasYPDMeasurement     TFC3
## 2 YAL002W     VPS8 HasYPDMeasurement HasYPDMeasurement     VPS8
## 3 YAL003W     EFB1 HasYPDMeasurement  noYPDMeasurement     EFB1
## 4 YAL004W  YAL004W HasYPDMeasurement HasYPDMeasurement  YAL004W
## 5 YAL005C     ssa1 HasYPDMeasurement HasYPDMeasurement     ssa1
## 6 YAL007C     ERP2 HasYPDMeasurement HasYPDMeasurement     ERP2
##            YPDFlag3          YPDFlag4
## 1 HasYPDMeasurement HasYPDMeasurement
## 2 HasYPDMeasurement HasYPDMeasurement
## 3 HasYPDMeasurement  noYPDMeasurement
## 4 HasYPDMeasurement HasYPDMeasurement
## 5 HasYPDMeasurement HasYPDMeasurement
## 6 HasYPDMeasurement HasYPDMeasurement
Consistency check for sanity.
# Sanity check: the gene label carried over from the first merge (gene.x.x)
# should equal the one from the second merge (gene.x.y) for every ORF.
# Compare as character: `==` on two factors stops with "level sets of factors
# are different" when their levels differ, which would abort the check instead
# of reporting the mismatches. The comparison already yields a logical vector,
# so no ifelse() wrapper is needed.
tb$name.check <- as.character(tb$gene.x.x) == as.character(tb$gene.x.y)
# useNA = "ifany" surfaces rows where a gene label is missing, which table()
# would otherwise drop silently.
table(tb$name.check, useNA = "ifany")
## 
## TRUE 
## 5918
# Classify every ORF from its four YPD flags. Everything starts as "abnormal"
# and is relabelled in two passes; rows matching neither rule stay "abnormal".
#tb$essentiality = NA;
tb$essentiality <- "abnormal"

# Pass 1: measured in both homozygous deletion tables (2 and 4) -> nonessential.
tb$essentiality <- ifelse(
  tb$YPDFlag2 == "HasYPDMeasurement" &
    tb$YPDFlag4 == "HasYPDMeasurement",
  "nonessential",
  tb$essentiality
)
table(tb$essentiality)
## 
##     abnormal nonessential 
##         1264         4654
# Pass 2: measured in both heterozygous tables (1 and 3) but in neither
# homozygous table (2 and 4) -> essential.
tb$essentiality <- ifelse(
  tb$YPDFlag1 == "HasYPDMeasurement" &
    tb$YPDFlag3 == "HasYPDMeasurement" &
    tb$YPDFlag2 == "noYPDMeasurement" &
    tb$YPDFlag4 == "noYPDMeasurement",
  "essential",
  tb$essentiality
)

table(tb$essentiality)
## 
##     abnormal    essential nonessential 
##          122         1142         4654
# The three class counts should add up to the number of rows in tb.
sum(table(tb$essentiality))
## [1] 5918
# Save the combined flag table, including the essentiality call, without row names.
write.csv(tb, file = "data/SummaryRegressionHetHom20171122.csv", row.names = FALSE)
#what was I doing here? 
#tbReal$essenflag = ifelse( tbReal$Growth1=='growth1' & tbReal$Growth2=='growth2', 'nonessential', 
#                       ifelse(tbReal$Growth1=='nogrowth1' & tbReal$Growth2=='nogrowth2', 'essential', 'abnormal') )
#head(tbReal)
#table(tbReal$essenflag) 
# abnormal    essential nonessential 
# 68         1152         4552 
#Good, consistent results. Use these for further analysis. 2013 Oct 29
# 20151012, found two "TRUE" in orfs
#length(tbReal$orf) #5772
#length(unique(tbReal$orf)) #5772

#write.csv(tbReal, "SummaryRegressionHetHom2015Oct12.csv", row.names = F ) #change 20151012
#write.csv(tbReal, "SummaryRegressionHetHom2015Oct12.csv",  quote=F, row.names=F )

No comments:

Post a Comment