# Convert DIP IDs to SGD ORFs.
# Step 1: load the ORF <-> DIP dictionary and the DIP interaction table from
# the current working directory.
# setwd("~/databases/DIP/sandbox")
list.files()

# Tab-separated, two-column lookup table; both columns are kept as character.
# `colClasses` is written out in full: the abbreviated `colClass` only worked
# through R's partial argument matching.
dic <- read.table("_matchedSGD2DIP_IDs.txt", sep = "\t",
                  colClasses = c("character", "character"))
names(dic) <- c("ORF", "DIP")
str(dic)

# Interaction table; 18 character columns are declared for it here.
DIP.tb <- read.csv("Scere20131031.csv",
                   colClasses = rep("character", 18))

# Quick interactive look at what was loaded.
str(DIP.tb)
names(DIP.tb)
head(DIP.tb)
summary(DIP.tb)
#There are non-Sce (non-yeast) entries in the DIP data. They still need to be removed from the yeast PPI.
# TODO: filter out the non-Sce entries
# DIP identifiers are written differently in the interaction file and in the
# sequence file, so both sides are reduced to the leading DIP accession.

# Remove the first "|" and everything that follows it from each identifier.
strip_xrefs <- function(ids) {
  gsub(pattern = "\\|.+", replacement = "", ids)
}

# Interaction-table side: raw identifiers look like this.
DIP.tb$ID.interactor.A[1:10]
#[1] "DIP-844N|refseq:NP_010949|uniprotkb:P40020" "DIP-777N|refseq:NP_010710|uniprotkb:P32578"
#[3] "DIP-814N|refseq:NP_009655|uniprotkb:P22219" "DIP-33N|refseq:NP_010513|uniprotkb:P11978"
DIP.tb$A.DIP <- strip_xrefs(DIP.tb$ID.interactor.A)
head(DIP.tb$A.DIP)
DIP.tb$B.DIP <- strip_xrefs(DIP.tb$ID.interactor.B)
head(DIP.tb$B.DIP)

# Dictionary side: identifiers carry an extra "dip:" prefix.
dic$DIP[1:5]
#[1] "dip:DIP-6088N|uniprot:P98002"                  "dip:DIP-8242N|refseq:NP_009310|uniprot:P03875"
#[3] "dip:DIP-3040N|refseq:NP_009312|uniprot:P00856" "dip:DIP-3038N|refseq:NP_009313|uniprot:P00854"
#[5] "dip:DIP-350N|refseq:YP_209217|uniprot:P00157"
# DIP2 = prefix removed; DIP3 = prefix and cross-references removed.
dic$DIP2 <- gsub(pattern = "dip:", replacement = "", dic$DIP)
head(dic$DIP2)
dic$DIP3 <- strip_xrefs(dic$DIP2)
head(dic$DIP3)

# Sanity check: the cleaned identifiers from both sources should overlap.
intersect(dic$DIP3, DIP.tb$A.DIP)[1:10] #it worked.
intersect(dic$DIP3, DIP.tb$B.DIP)[1:10] #it worked.
# MAP DIP to SGD ORFs
# match() gives, for each interactor, the position of the first dictionary
# row with the same cleaned DIP id (NA when there is none), so interactors
# missing from the dictionary end up with ORF = NA.
DIP.tb$ORF1 <- dic$ORF[match(DIP.tb$A.DIP, dic$DIP3)]
head(DIP.tb$ORF1)
summary(DIP.tb$ORF1)
# summary() of a character vector does not report NAs, so count them here.
sum(is.na(DIP.tb$ORF1)) # interactor-A ids with no ORF in the dictionary

DIP.tb$ORF2 <- dic$ORF[match(DIP.tb$B.DIP, dic$DIP3)]
head(DIP.tb$ORF2)
summary(DIP.tb$ORF2)
sum(is.na(DIP.tb$ORF2)) # interactor-B ids with no ORF in the dictionary
head(DIP.tb[, 1:10])

# Put the derived columns first, followed by the original columns in their
# original order. Columns are selected by name rather than by position so the
# result stays correct if the input table does not have exactly 18 columns.
front_cols <- c("ORF1", "ORF2", "A.DIP", "B.DIP")
DIP.tb <- DIP.tb[, c(front_cols, setdiff(names(DIP.tb), front_cols))]
head(DIP.tb[, 1:6])

# FALSE is spelled out because `F` is an ordinary variable that can be reassigned.
write.csv(DIP.tb, "_SceDIP_withORFs.20131219.csv", row.names = FALSE)
 
# No comments:
# Post a Comment