Thursday, December 19, 2013

Convert DIP IDs to SGD ORFs, code only

# Convert DIP IDs to SGD ORFs: load the ID dictionary and the DIP interactions.

# setwd("~/databases/DIP/sandbox")
list.files()

# Tab-separated lookup table with two columns, both read as character:
# the SGD ORF name and the matching DIP identifier string.
dic <- read.table(
  "_matchedSGD2DIP_IDs.txt",
  sep = "\t",
  colClasses = c("character", "character")
)
names(dic) <- c("ORF", "DIP")
str(dic)

# DIP interaction table. Every column is forced to character so that
# identifiers are never converted to factors or numbers (18 columns expected).
DIP.tb <- read.csv("Scere20131031.csv", colClasses = rep("character", 18))
str(DIP.tb)
names(DIP.tb)
head(DIP.tb)
summary(DIP.tb)

# There are non-Sce entries in the DIP data; they need to be removed from the yeast PPI.
# ... to do

# The interaction file and the sequence file write DIP IDs in different formats.
# Raw interactor IDs carry cross-references after the DIP accession:
DIP.tb$ID.interactor.A[1:10]
#[1] "DIP-844N|refseq:NP_010949|uniprotkb:P40020" "DIP-777N|refseq:NP_010710|uniprotkb:P32578"
#[3] "DIP-814N|refseq:NP_009655|uniprotkb:P22219" "DIP-33N|refseq:NP_010513|uniprotkb:P11978"

# Keep only the leading DIP accession by deleting everything from the first
# "|" to the end of the string, for both interactors.
DIP.tb$A.DIP <- gsub("\\|.+", "", DIP.tb$ID.interactor.A)
DIP.tb$B.DIP <- gsub("\\|.+", "", DIP.tb$ID.interactor.B)

# Eyeball the cleaned accessions.
head(DIP.tb$A.DIP)
head(DIP.tb$B.DIP)

# Dictionary IDs carry a "dip:" prefix plus trailing cross-references:
dic$DIP[1:5]
#[1] "dip:DIP-6088N|uniprot:P98002"                  "dip:DIP-8242N|refseq:NP_009310|uniprot:P03875"
#[3] "dip:DIP-3040N|refseq:NP_009312|uniprot:P00856" "dip:DIP-3038N|refseq:NP_009313|uniprot:P00854"
#[5] "dip:DIP-350N|refseq:YP_209217|uniprot:P00157"

# Step 1: remove the "dip:" prefix.
dic$DIP2 <- gsub("dip:", "", dic$DIP)
head(dic$DIP2)

# Step 2: cut everything from the first "|" onward, leaving the bare accession.
dic$DIP3 <- gsub("\\|.+", "", dic$DIP2)
head(dic$DIP3)

# Sanity check: the cleaned dictionary IDs should overlap with the cleaned
# interactor IDs on both sides of the interaction table.
intersect(dic$DIP3, DIP.tb$A.DIP)[1:10] # it worked.
intersect(dic$DIP3, DIP.tb$B.DIP)[1:10] # it worked.

# Map DIP accessions to SGD ORFs.
# match() returns the position of the first dictionary entry equal to each
# accession, or NA when the accession is not in the dictionary, so interactors
# without a known ORF end up as NA in ORF1 / ORF2.
DIP.tb$ORF1 <- dic$ORF[match(DIP.tb$A.DIP, dic$DIP3)]
head(DIP.tb$ORF1)
summary(DIP.tb$ORF1)
# summary() of a character vector shows only length/class/mode, so count the
# unmapped interactors explicitly.
sum(is.na(DIP.tb$ORF1))

DIP.tb$ORF2 <- dic$ORF[match(DIP.tb$B.DIP, dic$DIP3)]
head(DIP.tb$ORF2)
summary(DIP.tb$ORF2)
sum(is.na(DIP.tb$ORF2))

head(DIP.tb[, 1:10])

# Move the derived columns (ORFs first, then the cleaned DIP accessions) to the
# front and keep the remaining columns in their existing order. Selecting by
# name rather than by position keeps this correct even if the input table does
# not have exactly 18 columns.
lead_cols <- c("ORF1", "ORF2", "A.DIP", "B.DIP")
stopifnot(all(lead_cols %in% names(DIP.tb)))
DIP.tb <- DIP.tb[, c(lead_cols, setdiff(names(DIP.tb), lead_cols))]
head(DIP.tb[, 1:6])

# Write the annotated interaction table without the row-number column.
write.csv(DIP.tb, "_SceDIP_withORFs.20131219.csv", row.names = FALSE)

No comments:

Post a Comment