Conver 20131028.growth.fitness.R to Rmd. Output is here. Retro added on Nov 20, 2017.
Analyze growth fitness
H Qin
11/16/2017
Based on 20131028 code and its 20151012 corrections of comma in gene names. For deletion fitness data, download from http://www-deletion.stanford.edu/YDPM/YDPM_index.html
“In the Deletion Project, strains from the deletion collection were monitored under 9 different media conditions selected for the study of mitochondrial function. 5791 heterozygous diploid and 4706 homozygous diploid deletion strains were monitored in parallel using molecular barcodes on fermentable (YPD, YPDGE) and non-fermentable substrates (YPG, YPE, YPL). The YDPM database contains both the raw data and growth rates calculated for each strain in each media condition. Strains can be searched by ORF or Gene name to access growth measurements and data plots for each strain.”
What are the genotypes in the 4 yeast fitness files?
rm(list=ls())
datapath = "data"
my.files = c("Regression_Tc1_het.txt",
"Regression_Tc1_hom.txt",
"Regression_Tc2_het.txt",
"Regression_Tc2_hom.txt")
tb1 = read.table(paste('data/',my.files[1],sep=''), header=T, sep='\t', fill=T)
tb2 = read.table(paste('data/',my.files[2],sep=''), header=T, sep='\t', fill=T)
tb3 = read.table(paste('data/',my.files[3],sep=''), header=T, sep='\t', fill=T)
tb4 = read.table(paste('data/',my.files[4],sep=''), header=T, sep='\t', fill=T)
tb1$AnnotationFlag = ifelse( is.na(tb1$YPD), 'questionable1', 'verified1' )
table(tb1$AnnotationFlag)
##
## questionable1 verified1
## 174 5744
# questional verified
# 174 5744
x = table(tb1$orf); str(x)
## 'table' int [1:5918(1d)] 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "dimnames")=List of 1
## ..$ : chr [1:5918] "YAL001C" "YAL002W" "YAL003W" "YAL004W" ...
tb3$AnnotationFlag = ifelse( is.na(tb3$YPD), 'questionable2', 'verified2' )
table(tb3$AnnotationFlag)
##
## questionable2 verified2
## 152 5766
# quetionable verified
# 152 5766
hist(tb3$YPD)
x = table(tb3$orf); str(x)
## 'table' int [1:5918(1d)] 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "dimnames")=List of 1
## ..$ : chr [1:5918] "YAL001C" "YAL002W" "YAL003W" "YAL004W" ...
tb2$GrowthFlag = ifelse( is.na(tb2$YPD), 'nogrowth1', 'growth1' )
table(tb2$GrowthFlag)
##
## growth1 nogrowth1
## 4659 1259
# growth nogrowth
# 4659 1259
x = table(tb2$orf); str(x)
## 'table' int [1:5918(1d)] 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "dimnames")=List of 1
## ..$ : chr [1:5918] "YAL001C" "YAL002W" "YAL003W" "YAL004W" ...
tb4$GrowthFlag = ifelse( is.na(tb4$YPD), 'nogrowth2', 'growth2' )
table(tb4$GrowthFlag)
##
## growth2 nogrowth2
## 4718 1200
# growth nogrowth
# 4718 1200
x = table(tb4$orf); str(x)
## 'table' int [1:5918(1d)] 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "dimnames")=List of 1
## ..$ : chr [1:5918] "YAL001C" "YAL002W" "YAL003W" "YAL004W" ...
tb12 = merge(tb1,tb2, by.x='orf', by.y='orf')
tb12 = tb12[, c(1,2,grep('Flag',names(tb12)))]
tb34 = merge(tb3,tb4, by.x='orf', by.y='orf')
tb34 = tb34[, c(1,2,grep('Flag',names(tb34)))]
names(tb12) = c("orf",'gene','Anno1','Growth1')
names(tb34) = c('orf','gene','Anno2','Growth2')
str(tb12)
## 'data.frame': 5918 obs. of 4 variables:
## $ orf : Factor w/ 5918 levels "YAL001C","YAL002W",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ gene : Factor w/ 5915 levels "37165","AAC1",..: 3133 3395 750 3424 2971 803 906 2912 1470 3425 ...
## $ Anno1 : chr "verified1" "verified1" "verified1" "verified1" ...
## $ Growth1: chr "growth1" "growth1" "nogrowth1" "growth1" ...
str(tb34)
## 'data.frame': 5918 obs. of 4 variables:
## $ orf : Factor w/ 5918 levels "YAL001C","YAL002W",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ gene : Factor w/ 5915 levels "37165","AAC1",..: 3133 3395 750 3424 2971 803 906 2912 1470 3425 ...
## $ Anno2 : chr "verified2" "verified2" "verified2" "verified2" ...
## $ Growth2: chr "growth2" "growth2" "nogrowth2" "growth2" ...
tb = merge(tb12,tb34, by.x='orf', by.y='orf')
head(tb)
## orf gene.x Anno1 Growth1 gene.y Anno2 Growth2
## 1 YAL001C TFC3 verified1 growth1 TFC3 verified2 growth2
## 2 YAL002W VPS8 verified1 growth1 VPS8 verified2 growth2
## 3 YAL003W EFB1 verified1 nogrowth1 EFB1 verified2 nogrowth2
## 4 YAL004W YAL004W verified1 growth1 YAL004W verified2 growth2
## 5 YAL005C ssa1 verified1 growth1 ssa1 verified2 growth2
## 6 YAL007C ERP2 verified1 growth1 ERP2 verified2 growth2
tb$name.check= ifelse(tb$gene.x == tb$gene.y, T, F)
table(tb$name.check)
##
## TRUE
## 5918
# tb[ tb$name.check==F, ]
length( grep('question', tb$Anno1) ) #174
## [1] 174
length( grep('question', tb$Anno2) ) #152
## [1] 152
#tbReal = tb[ tb$Anno1=='verified1' & tb$Anno2=='verified2', ]
tbReal = tb[ tb$Anno1=='verified1' | tb$Anno2=='verified2', ] #At least one Het growth. The other one might be an error?!
length( grep('question', tbReal$Anno1) ) #28
## [1] 28
length( grep('question', tbReal$Anno2) ) #6
## [1] 6
table(tbReal$Growth1, tbReal$Growth2)
##
## growth2 nogrowth2
## growth1 4552 5
## nogrowth1 63 1152
# growth2 nogrowth2
# growth1 4552 5
# nogrowth1 63 1152
# Verify some manually on SGD.
tbReal[tbReal$Growth1=='nogrowth1' & tbReal$Growth2=='nogrowth2', ] [1:10,]
## orf gene.x Anno1 Growth1 gene.y Anno2 Growth2
## 3 YAL003W EFB1 verified1 nogrowth1 EFB1 verified2 nogrowth2
## 15 YAL016W tpd3 verified1 nogrowth1 tpd3 verified2 nogrowth2
## 24 YAL025C mak16 verified1 nogrowth1 mak16 verified2 nogrowth2
## 31 YAL032C PRP45 verified1 nogrowth1 PRP45 verified2 nogrowth2
## 32 YAL033W POP5 verified1 nogrowth1 POP5 verified2 nogrowth2
## 34 YAL034W-A MTW1 verified1 nogrowth1 MTW1 verified2 nogrowth2
## 35 YAL035C-A YAL035C-A verified1 nogrowth1 YAL035C-A verified2 nogrowth2
## 39 YAL038W cdc19 verified1 nogrowth1 cdc19 verified2 nogrowth2
## 42 YAL041W cdc24 verified1 nogrowth1 cdc24 verified2 nogrowth2
## 44 YAL043C pta1 verified1 nogrowth1 pta1 verified2 nogrowth2
## name.check
## 3 TRUE
## 15 TRUE
## 24 TRUE
## 31 TRUE
## 32 TRUE
## 34 TRUE
## 35 TRUE
## 39 TRUE
## 42 TRUE
## 44 TRUE
tbReal[tbReal$Growth1=='growth1' & tbReal$Growth2=='nogrowth2', ]
## orf gene.x Anno1 Growth1 gene.y Anno2 Growth2
## 956 YDR050C TPI1 verified1 growth1 TPI1 verified2 nogrowth2
## 1448 YEL002C WBP1 verified1 growth1 WBP1 verified2 nogrowth2
## 2685 YIL078W THS1 verified1 growth1 THS1 verified2 nogrowth2
## 4152 YML126C ERG13 verified1 growth1 ERG13 verified2 nogrowth2
## 4383 YMR218C TRS130 verified1 growth1 TRS130 verified2 nogrowth2
## name.check
## 956 TRUE
## 1448 TRUE
## 2685 TRUE
## 4152 TRUE
## 4383 TRUE
#what was I doing here?
tbReal$essenflag = ifelse( tbReal$Growth1=='growth1' & tbReal$Growth2=='growth2', 'nonessential',
ifelse(tbReal$Growth1=='nogrowth1' & tbReal$Growth2=='nogrowth2', 'essential', 'abnormal') )
head(tbReal)
## orf gene.x Anno1 Growth1 gene.y Anno2 Growth2
## 1 YAL001C TFC3 verified1 growth1 TFC3 verified2 growth2
## 2 YAL002W VPS8 verified1 growth1 VPS8 verified2 growth2
## 3 YAL003W EFB1 verified1 nogrowth1 EFB1 verified2 nogrowth2
## 4 YAL004W YAL004W verified1 growth1 YAL004W verified2 growth2
## 5 YAL005C ssa1 verified1 growth1 ssa1 verified2 growth2
## 6 YAL007C ERP2 verified1 growth1 ERP2 verified2 growth2
## name.check essenflag
## 1 TRUE nonessential
## 2 TRUE nonessential
## 3 TRUE essential
## 4 TRUE nonessential
## 5 TRUE nonessential
## 6 TRUE nonessential
table(tbReal$essenflag)
##
## abnormal essential nonessential
## 68 1152 4552
# abnormal essential nonessential
# 68 1152 4552
#Good, consistent results. Use these for further analysis. 2013 Oct 29
# 20151012, found two "TRUE" in orfs
length(tbReal$orf) #5772
## [1] 5772
length(unique(tbReal$orf)) #5772
## [1] 5772
#write.csv(tbReal, "SummaryRegressionHetHom2015Oct12.csv", row.names = F ) #change 20151012
#write.csv(tbReal, "SummaryRegressionHetHom2015Oct12.csv", quote=F, row.names=F )
No comments:
Post a Comment