This site is to serve as my note-book and to effectively communicate with my students and collaborators. Every now and then, a blog may be of interest to other researchers or teachers. Views in this blog are my own. All rights of research results and findings on this blog are reserved. See also http://youtube.com/c/hongqin @hongqin
Monday, October 28, 2013
Verify essential genes parsing results
Results parsed from YPD growth measures from the Deletion collection data.
File: /0.ppi.reliability.simulation/20131028.growth.fitness.R
tbReal[tbReal$Growth1=='nogrowth1' & tbReal$Growth2=='nogrowth2', ] [1:10,]
orf gene.x Anno1 Growth1 gene.y Anno2 Growth2 name.check
3 YAL003W EFB1 verified1 nogrowth1 EFB1 verified2 nogrowth2 TRUE
15 YAL016W tpd3 verified1 nogrowth1 tpd3 verified2 nogrowth2 TRUE
24 YAL025C mak16 verified1 nogrowth1 mak16 verified2 nogrowth2 TRUE
31 YAL032C PRP45 verified1 nogrowth1 PRP45 verified2 nogrowth2 TRUE
32 YAL033W POP5 verified1 nogrowth1 POP5 verified2 nogrowth2 TRUE
34 YAL034W-A MTW1 verified1 nogrowth1 MTW1 verified2 nogrowth2 TRUE
35 YAL035C-A YAL035C-A verified1 nogrowth1 YAL035C-A verified2 nogrowth2 TRUE
39 YAL038W cdc19 verified1 nogrowth1 cdc19 verified2 nogrowth2 TRUE
42 YAL041W cdc24 verified1 nogrowth1 cdc24 verified2 nogrowth2 TRUE
44 YAL043C pta1 verified1 nogrowth1 pta1 verified2 nogrowth2 TRUE
YAL025C is inviable on SGD.
YAL035C-A is inviable on SGD, but is a dubious orf.
YAL041W cdc24 is inviable on SGD, but also have some other phenotypes.
> tbReal[tbReal$Growth1=='growth1' & tbReal$Growth2=='nogrowth2', ]
orf gene.x Anno1 Growth1 gene.y Anno2 Growth2 name.check
956 YDR050C TPI1 verified1 growth1 TPI1 verified2 nogrowth2 TRUE
1448 YEL002C WBP1 verified1 growth1 WBP1 verified2 nogrowth2 TRUE
2685 YIL078W THS1 verified1 growth1 THS1 verified2 nogrowth2 TRUE
4152 YML126C ERG13 verified1 growth1 ERG13 verified2 nogrowth2 TRUE
4383 YMR218C TRS130 verified1 growth1 TRS130 verified2 nogrowth2 TRUE
YDR050C TPI1, sgd inviable, but with mixed phenotypes
YEL002C, inviable
YMR218C, inviable, but with mixed phenotypes
--------------------------------------------
> rm(list=ls())
> setwd("~/projects/0.network.aging.prj/0.ppi.reliability.simulation")
> list.files(path="data/", pattern="Regression")
[1] "Regression_Tc1_het.txt" "Regression_Tc1_hom.txt" "Regression_Tc2_het.txt"
[4] "Regression_Tc2_hom.txt"
> my.files = c("Regression_Tc1_het.txt", "Regression_Tc1_hom.txt", "Regression_Tc2_het.txt", "Regression_Tc2_hom.txt")
> # I removed the single quotes in these files
>
> tb1 = read.table(paste('data/',my.files[1],sep=''), header=T, sep='\t', fill=T)
> tb2 = read.table(paste('data/',my.files[2],sep=''), header=T, sep='\t', fill=T)
> tb3 = read.table(paste('data/',my.files[3],sep=''), header=T, sep='\t', fill=T)
> tb4 = read.table(paste('data/',my.files[4],sep=''), header=T, sep='\t', fill=T)
>
> tb1$AnnotationFlag = ifelse( is.na(tb1$YPD), 'questionable1', 'verified1' )
> table(tb1$AnnotationFlag)
questionable1 verified1
174 5744
> # questional verified
> # 174 5744
>
> tb3$AnnotationFlag = ifelse( is.na(tb3$YPD), 'questionable2', 'verified2' )
> table(tb3$AnnotationFlag)
questionable2 verified2
152 5766
> # quetionable verified
> # 152 5766
> hist(tb3$YPD)
>
> tb2$GrowthFlag = ifelse( is.na(tb2$YPD), 'nogrowth1', 'growth1' )
> table(tb2$GrowthFlag)
growth1 nogrowth1
4659 1259
> # growth nogrowth
> # 4659 1259
>
> tb4$GrowthFlag = ifelse( is.na(tb4$YPD), 'nogrowth2', 'growth2' )
> table(tb4$GrowthFlag)
growth2 nogrowth2
4718 1200
> # growth nogrowth
> # 4718 1200
>
> tb12 = merge(tb1,tb2, by.x='orf', by.y='orf')
> tb12 = tb12[, c(1,2,grep('Flag',names(tb12)))]
>
> tb34 = merge(tb3,tb4, by.x='orf', by.y='orf')
> tb34 = tb34[, c(1,2,grep('Flag',names(tb34)))]
>
> names(tb12) = c("orf",'gene','Anno1','Growth1')
> names(tb34) = c('orf','gene','Anno2','Growth2')
> str(tb12)
'data.frame': 5918 obs. of 4 variables:
$ orf : Factor w/ 5918 levels "YAL001C","YAL002W",..: 1 2 3 4 5 6 7 8 9 10 ...
$ gene : Factor w/ 5915 levels "37165","AAC1",..: 3133 3395 750 3424 2971 803 906 2912 1470 3425 ...
$ Anno1 : chr "verified1" "verified1" "verified1" "verified1" ...
$ Growth1: chr "growth1" "growth1" "nogrowth1" "growth1" ...
> str(tb34)
'data.frame': 5918 obs. of 4 variables:
$ orf : Factor w/ 5918 levels "YAL001C","YAL002W",..: 1 2 3 4 5 6 7 8 9 10 ...
$ gene : Factor w/ 5915 levels "37165","AAC1",..: 3133 3395 750 3424 2971 803 906 2912 1470 3425 ...
$ Anno2 : chr "verified2" "verified2" "verified2" "verified2" ...
$ Growth2: chr "growth2" "growth2" "nogrowth2" "growth2" ...
>
> tb = merge(tb12,tb34, by.x='orf', by.y='orf')
> head(tb)
orf gene.x Anno1 Growth1 gene.y Anno2 Growth2
1 YAL001C TFC3 verified1 growth1 TFC3 verified2 growth2
2 YAL002W VPS8 verified1 growth1 VPS8 verified2 growth2
3 YAL003W EFB1 verified1 nogrowth1 EFB1 verified2 nogrowth2
4 YAL004W YAL004W verified1 growth1 YAL004W verified2 growth2
5 YAL005C ssa1 verified1 growth1 ssa1 verified2 growth2
6 YAL007C ERP2 verified1 growth1 ERP2 verified2 growth2
> tb$name.check= ifelse(tb$gene.x == tb$gene.y, T, F)
> table(tb$name.check)
TRUE
5918
> # tb[ tb$name.check==F, ]
> length( grep('question', tb$Anno1) ) #174
[1] 174
> length( grep('question', tb$Anno2) ) #152
[1] 152
>
> #tbReal = tb[ tb$Anno1=='verified1' & tb$Anno2=='verified2', ]
> tbReal = tb[ tb$Anno1=='verified1' | tb$Anno2=='verified2', ] #At least one Het growth. The other one might be an error?!
> length( grep('question', tbReal$Anno1) ) #28
[1] 28
> length( grep('question', tbReal$Anno2) ) #6
[1] 6
>
> table(tbReal$Growth1, tbReal$Growth2)
growth2 nogrowth2
growth1 4552 5
nogrowth1 63 1152
> # growth2 nogrowth2
> # growth1 4552 5
> # nogrowth1 63 1152
>
> # Verify some manually on SGD.
> tbReal[tbReal$Growth1=='nogrowth1' & tbReal$Growth2=='nogrowth2', ] [1:10,]
orf gene.x Anno1 Growth1 gene.y Anno2 Growth2
3 YAL003W EFB1 verified1 nogrowth1 EFB1 verified2 nogrowth2
15 YAL016W tpd3 verified1 nogrowth1 tpd3 verified2 nogrowth2
24 YAL025C mak16 verified1 nogrowth1 mak16 verified2 nogrowth2
31 YAL032C PRP45 verified1 nogrowth1 PRP45 verified2 nogrowth2
32 YAL033W POP5 verified1 nogrowth1 POP5 verified2 nogrowth2
34 YAL034W-A MTW1 verified1 nogrowth1 MTW1 verified2 nogrowth2
35 YAL035C-A YAL035C-A verified1 nogrowth1 YAL035C-A verified2 nogrowth2
39 YAL038W cdc19 verified1 nogrowth1 cdc19 verified2 nogrowth2
42 YAL041W cdc24 verified1 nogrowth1 cdc24 verified2 nogrowth2
44 YAL043C pta1 verified1 nogrowth1 pta1 verified2 nogrowth2
name.check
3 TRUE
15 TRUE
24 TRUE
31 TRUE
32 TRUE
34 TRUE
35 TRUE
39 TRUE
42 TRUE
44 TRUE
> tbReal[tbReal$Growth1=='growth1' & tbReal$Growth2=='nogrowth2', ]
orf gene.x Anno1 Growth1 gene.y Anno2 Growth2 name.check
956 YDR050C TPI1 verified1 growth1 TPI1 verified2 nogrowth2 TRUE
1448 YEL002C WBP1 verified1 growth1 WBP1 verified2 nogrowth2 TRUE
2685 YIL078W THS1 verified1 growth1 THS1 verified2 nogrowth2 TRUE
4152 YML126C ERG13 verified1 growth1 ERG13 verified2 nogrowth2 TRUE
4383 YMR218C TRS130 verified1 growth1 TRS130 verified2 nogrowth2 TRUE
>
> tbReal$essenflag = ifelse( tbReal$Growth1=='growth1' & tbReal$Growth2=='growth2', 'nonessential',
+ ifelse(tbReal$Growth1=='nogrowth1' & tbReal$Growth2=='nogrowth2', 'essential', 'abnormal') )
> head(tbReal)
orf gene.x Anno1 Growth1 gene.y Anno2 Growth2 name.check
1 YAL001C TFC3 verified1 growth1 TFC3 verified2 growth2 TRUE
2 YAL002W VPS8 verified1 growth1 VPS8 verified2 growth2 TRUE
3 YAL003W EFB1 verified1 nogrowth1 EFB1 verified2 nogrowth2 TRUE
4 YAL004W YAL004W verified1 growth1 YAL004W verified2 growth2 TRUE
5 YAL005C ssa1 verified1 growth1 ssa1 verified2 growth2 TRUE
6 YAL007C ERP2 verified1 growth1 ERP2 verified2 growth2 TRUE
essenflag
1 nonessential
2 nonessential
3 essential
4 nonessential
5 nonessential
6 nonessential
> table(tbReal$essenflag)
abnormal essential nonessential
68 1152 4552
> # abnormal essential nonessential
> # 68 1152 4552
> #Good, consistent results. Use these for further analysis. 2013 Oct 29
>
> write.csv(tbReal, "SummaryRegressionHetHom2013Oct29.csv", quote=F, row.names=F )
>
>
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment