$ R -f _20131219.convertDIPtoSGD.R
> # Convert DIP IDs to SGD ORFs.
>
> # setwd("~/databases/DIP/sandbox")
> list.files()
[1] "_20131219.convertDIPtoSGD.R" "_matchedSGD2DIP_IDs.txt"
[3] "_out_DIP2SGD.txt" "_out_SGD2DIP.txt"
[5] "_test.faa" "_tmp.txt"
[7] "fasta20131201.seq" "fasta20131201.seq.phr"
[9] "fasta20131201.seq.pin" "fasta20131201.seq.psq"
[11] "formatdb.log" "s288c-prot.faa"
[13] "s288c-prot.faa.phr" "s288c-prot.faa.pin"
[15] "s288c-prot.faa.psq" "Scere20131031.csv"
>
> dic = read.table("_matchedSGD2DIP_IDs.txt", sep="\t", colClass=c("character","character"))
> names(dic) = c("ORF","DIP")
> str(dic)
'data.frame': 5160 obs. of 2 variables:
$ ORF: chr "Q0045" "Q0050" "Q0080" "Q0085" ...
$ DIP: chr "dip:DIP-6088N|uniprot:P98002" "dip:DIP-8242N|refseq:NP_009310|uniprot:P03875" "dip:DIP-3040N|refseq:NP_009312|uniprot:P00856" "dip:DIP-3038N|refseq:NP_009313|uniprot:P00854" ...
>
> DIP.tb = read.csv("Scere20131031.csv", colClass=c(rep("character",18)))
> str(DIP.tb)
'data.frame': 22584 obs. of 18 variables:
$ ID.interactor.A : chr "DIP-844N|refseq:NP_010949|uniprotkb:P40020" "DIP-777N|refseq:NP_010710|uniprotkb:P32578" "DIP-814N|refseq:NP_009655|uniprotkb:P22219" "DIP-33N|refseq:NP_010513|uniprotkb:P11978" ...
$ ID.interactor.B : chr "DIP-871N|refseq:NP_010481|uniprotkb:P42073" "DIP-18N|refseq:NP_010765|uniprotkb:P06782" "DIP-97N|refseq:NP_013341|uniprotkb:P22543" "DIP-33N|refseq:NP_010513|uniprotkb:P11978" ...
$ Alt..ID.interactor.A : chr " -" " -" " -" "-" ...
$ Alt..ID.interactor.B : chr "-" "-" "-" "-" ...
$ Alias.es..interactor.A : chr "-" "-" "-" "-" ...
$ Alias.es..interactor.B : chr "-" "-" "-" "-" ...
$ Interaction.detection.method.s.: chr "MI:0018(two hybrid)" "MI:0018(two hybrid)|MI:0018(two hybrid)|MI:0019(coimmunoprecipitation)|MI:0018(two hybrid)|MI:0004(affinity chromatography tech"| __truncated__ "MI:0019(coimmunoprecipitation)|MI:0030(cross-linking study)" "MI:0018(two hybrid)" ...
$ Publication.1st.author.s. : chr "-" "-" "-" "-" ...
$ Publication.Identifier.s. : chr "pubmed:9196079|pubmed:DIP-356S" "pubmed:1496382|pubmed:DIP-448S|pubmed:9121458|pubmed:DIP-723S|pubmed:1496382|pubmed:DIP-448S|pubmed:7813428|pubmed:DIP-724S|pub"| __truncated__ "pubmed:8387919|pubmed:DIP-391S|pubmed:8387919|pubmed:DIP-391S" "pubmed:1946372|pubmed:DIP-50S" ...
$ Taxid.interactor.A : chr "taxid:4932(Saccharomyces cerevisiae)" "taxid:4932(Saccharomyces cerevisiae)" "taxid:4932(Saccharomyces cerevisiae)" "taxid:4932(Saccharomyces cerevisiae)" ...
$ Taxid.interactor.B : chr "taxid:4932(Saccharomyces cerevisiae)" "taxid:4932(Saccharomyces cerevisiae)" "taxid:4932(Saccharomyces cerevisiae)" "taxid:4932(Saccharomyces cerevisiae)" ...
$ Interaction.type.s. : chr "MI:0218(physical interaction)" "MI:0218(physical interaction)|MI:0218(physical interaction)|MI:0218(physical interaction)|MI:0218(physical interaction)|MI:0218"| __truncated__ "MI:0218(physical interaction)|MI:0407(direct interaction)" "MI:0218(physical interaction)" ...
$ Source.database.s. : chr "MI:0465(dip)" "MI:0465(dip)" "MI:0465(dip)" "MI:0465(dip)" ...
$ Interaction.identifier.s. : chr "DIP-536E" "DIP-539E" "DIP-540E" "DIP-541E" ...
$ Confidence.value.s. : chr "dip-quality-status:core" "dip-quality-status:core" "dip-quality-status:core" "dip-quality-status:core" ...
$ Processing.Status : chr "dip:0002(small scale)" "dip:0002(small scale)|dip:0002(small scale)|dip:0002(small scale)|dip:0002(small scale)|dip:0002(small scale)|dip:0002(small sc"| __truncated__ "dip:0002(small scale)|dip:0002(small scale)" "dip:0002(small scale)" ...
$ X : chr "" "" "" "" ...
$ X.1 : chr "-" "-" "-" "-" ...
> names(DIP.tb)
[1] "ID.interactor.A" "ID.interactor.B"
[3] "Alt..ID.interactor.A" "Alt..ID.interactor.B"
[5] "Alias.es..interactor.A" "Alias.es..interactor.B"
[7] "Interaction.detection.method.s." "Publication.1st.author.s."
[9] "Publication.Identifier.s." "Taxid.interactor.A"
[11] "Taxid.interactor.B" "Interaction.type.s."
[13] "Source.database.s." "Interaction.identifier.s."
[15] "Confidence.value.s." "Processing.Status"
[17] "X" "X.1"
> head(DIP.tb)
ID.interactor.A
1 DIP-844N|refseq:NP_010949|uniprotkb:P40020
2 DIP-777N|refseq:NP_010710|uniprotkb:P32578
3 DIP-814N|refseq:NP_009655|uniprotkb:P22219
4 DIP-33N|refseq:NP_010513|uniprotkb:P11978
5 DIP-698N|refseq:NP_010270|uniprotkb:P15646
6 DIP-982N|refseq:NP_011399|uniprotkb:P26309
ID.interactor.B Alt..ID.interactor.A
1 DIP-871N|refseq:NP_010481|uniprotkb:P42073 -
2 DIP-18N|refseq:NP_010765|uniprotkb:P06782 -
3 DIP-97N|refseq:NP_013341|uniprotkb:P22543 -
4 DIP-33N|refseq:NP_010513|uniprotkb:P11978 -
5 DIP-746N|refseq:NP_013090|uniprotkb:P33750 -
6 DIP-293N|refseq:NP_011429|uniprotkb:P40957 -
Alt..ID.interactor.B Alias.es..interactor.A Alias.es..interactor.B
1 - - -
2 - - -
3 - - -
4 - - -
5 - - -
6 - - -
Interaction.detection.method.s.
1 MI:0018(two hybrid)
2 MI:0018(two hybrid)|MI:0018(two hybrid)|MI:0019(coimmunoprecipitation)|MI:0018(two hybrid)|MI:0004(affinity chromatography technology)|MI:0019(coimmunoprecipitation)|MI:0676(tandem affinity purification)
3 MI:0019(coimmunoprecipitation)|MI:0030(cross-linking study)
4 MI:0018(two hybrid)
5 MI:0019(coimmunoprecipitation)
6 MI:0018(two hybrid)|MI:0018(two hybrid)
Publication.1st.author.s.
1 -
2 -
3 -
4 -
5 -
6 -
Publication.Identifier.s.
1 pubmed:9196079|pubmed:DIP-356S
2 pubmed:1496382|pubmed:DIP-448S|pubmed:9121458|pubmed:DIP-723S|pubmed:1496382|pubmed:DIP-448S|pubmed:7813428|pubmed:DIP-724S|pubmed:9121458|pubmed:DIP-723S|pubmed:7813428|pubmed:DIP-724S|pubmed:11805826|pubmed:DIP-1768S
3 pubmed:8387919|pubmed:DIP-391S|pubmed:8387919|pubmed:DIP-391S
4 pubmed:1946372|pubmed:DIP-50S
5 pubmed:8508778|pubmed:DIP-198S
6 pubmed:10848588|pubmed:DIP-1428S|pubmed:9461437|pubmed:DIP-189S
Taxid.interactor.A Taxid.interactor.B
1 taxid:4932(Saccharomyces cerevisiae) taxid:4932(Saccharomyces cerevisiae)
2 taxid:4932(Saccharomyces cerevisiae) taxid:4932(Saccharomyces cerevisiae)
3 taxid:4932(Saccharomyces cerevisiae) taxid:4932(Saccharomyces cerevisiae)
4 taxid:4932(Saccharomyces cerevisiae) taxid:4932(Saccharomyces cerevisiae)
5 taxid:4932(Saccharomyces cerevisiae) taxid:4932(Saccharomyces cerevisiae)
6 taxid:4932(Saccharomyces cerevisiae) taxid:4932(Saccharomyces cerevisiae)
Interaction.type.s.
1 MI:0218(physical interaction)
2 MI:0218(physical interaction)|MI:0218(physical interaction)|MI:0218(physical interaction)|MI:0218(physical interaction)|MI:0218(physical interaction)|MI:0218(physical interaction)|MI:0915(physical association)
3 MI:0218(physical interaction)|MI:0407(direct interaction)
4 MI:0218(physical interaction)
5 MI:0218(physical interaction)
6 MI:0218(physical interaction)|MI:0218(physical interaction)
Source.database.s. Interaction.identifier.s. Confidence.value.s.
1 MI:0465(dip) DIP-536E dip-quality-status:core
2 MI:0465(dip) DIP-539E dip-quality-status:core
3 MI:0465(dip) DIP-540E dip-quality-status:core
4 MI:0465(dip) DIP-541E dip-quality-status:core
5 MI:0465(dip) DIP-542E dip-quality-status:core
6 MI:0465(dip) DIP-590E dip-quality-status:core
Processing.Status
1 dip:0002(small scale)
2 dip:0002(small scale)|dip:0002(small scale)|dip:0002(small scale)|dip:0002(small scale)|dip:0002(small scale)|dip:0002(small scale)|dip:0005(high throughput)
3 dip:0002(small scale)|dip:0002(small scale)
4 dip:0002(small scale)
5 dip:0002(small scale)
6 dip:0002(small scale)|dip:0002(small scale)
X X.1
1 -
2 -
3 -
4 -
5 -
6 -
> summary(DIP.tb)
ID.interactor.A ID.interactor.B Alt..ID.interactor.A
Length:22584 Length:22584 Length:22584
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
Alt..ID.interactor.B Alias.es..interactor.A Alias.es..interactor.B
Length:22584 Length:22584 Length:22584
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
Interaction.detection.method.s. Publication.1st.author.s.
Length:22584 Length:22584
Class :character Class :character
Mode :character Mode :character
Publication.Identifier.s. Taxid.interactor.A Taxid.interactor.B
Length:22584 Length:22584 Length:22584
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
Interaction.type.s. Source.database.s. Interaction.identifier.s.
Length:22584 Length:22584 Length:22584
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
Confidence.value.s. Processing.Status X X.1
Length:22584 Length:22584 Length:22584 Length:22584
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
>
> # There are non-Sce (non-S. cerevisiae) entries in the DIP data. They need to be removed from the yeast PPI set.
> # TODO: not done yet.
>
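> # DIP ID formats in the interaction and sequence files are not the same:
> # interaction IDs look like "DIP-844N|refseq:...", dictionary IDs look like "dip:DIP-6088N|uniprot:...".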
> DIP.tb$ID.interactor.A[1:10]
[1] "DIP-844N|refseq:NP_010949|uniprotkb:P40020"
[2] "DIP-777N|refseq:NP_010710|uniprotkb:P32578"
[3] "DIP-814N|refseq:NP_009655|uniprotkb:P22219"
[4] "DIP-33N|refseq:NP_010513|uniprotkb:P11978"
[5] "DIP-698N|refseq:NP_010270|uniprotkb:P15646"
[6] "DIP-982N|refseq:NP_011399|uniprotkb:P26309"
[7] "DIP-982N|refseq:NP_011399|uniprotkb:P26309"
[8] "DIP-948N|uniprotkb:Q80US8"
[9] "DIP-314N|refseq:NP_012619|uniprotkb:P18852"
[10] "DIP-954N|refseq:NP_014855|uniprotkb:P18851"
> #[1] "DIP-844N|refseq:NP_010949|uniprotkb:P40020" "DIP-777N|refseq:NP_010710|uniprotkb:P32578"
> #[3] "DIP-814N|refseq:NP_009655|uniprotkb:P22219" "DIP-33N|refseq:NP_010513|uniprotkb:P11978"
> DIP.tb$A.DIP = gsub(pattern="\\|.+", replacement="", DIP.tb$ID.interactor.A)
> head(DIP.tb$A.DIP)
[1] "DIP-844N" "DIP-777N" "DIP-814N" "DIP-33N" "DIP-698N" "DIP-982N"
> DIP.tb$B.DIP = gsub(pattern="\\|.+", replacement="", DIP.tb$ID.interactor.B)
> head(DIP.tb$B.DIP)
[1] "DIP-871N" "DIP-18N" "DIP-97N" "DIP-33N" "DIP-746N" "DIP-293N"
>
> dic$DIP[1:5]
[1] "dip:DIP-6088N|uniprot:P98002"
[2] "dip:DIP-8242N|refseq:NP_009310|uniprot:P03875"
[3] "dip:DIP-3040N|refseq:NP_009312|uniprot:P00856"
[4] "dip:DIP-3038N|refseq:NP_009313|uniprot:P00854"
[5] "dip:DIP-350N|refseq:YP_209217|uniprot:P00157"
> #[1] "dip:DIP-6088N|uniprot:P98002" "dip:DIP-8242N|refseq:NP_009310|uniprot:P03875"
> #[3] "dip:DIP-3040N|refseq:NP_009312|uniprot:P00856" "dip:DIP-3038N|refseq:NP_009313|uniprot:P00854"
> #[5] "dip:DIP-350N|refseq:YP_209217|uniprot:P00157"
>
> dic$DIP2 = gsub(pattern="dip:", replacement="", dic$DIP)
> head(dic$DIP2)
[1] "DIP-6088N|uniprot:P98002"
[2] "DIP-8242N|refseq:NP_009310|uniprot:P03875"
[3] "DIP-3040N|refseq:NP_009312|uniprot:P00856"
[4] "DIP-3038N|refseq:NP_009313|uniprot:P00854"
[5] "DIP-350N|refseq:YP_209217|uniprot:P00157"
[6] "DIP-3041N|refseq:NP_009319|uniprot:P61829"
>
> dic$DIP3 = gsub(pattern="\\|.+", replacement="", dic$DIP2)
> head(dic$DIP3)
[1] "DIP-6088N" "DIP-8242N" "DIP-3040N" "DIP-3038N" "DIP-350N" "DIP-3041N"
>
> intersect(dic$DIP3, DIP.tb$A.DIP)[1:10] #it worked.
[1] "DIP-3040N" "DIP-3038N" "DIP-3041N" "DIP-7592N" "DIP-7698N" "DIP-518N"
[7] "DIP-6298N" "DIP-6445N" "DIP-2750N" "DIP-2253N"
> intersect(dic$DIP3, DIP.tb$B.DIP)[1:10] #it worked.
[1] "DIP-8242N" "DIP-3040N" "DIP-3038N" "DIP-3041N" "DIP-7592N" "DIP-8133N"
[7] "DIP-7698N" "DIP-518N" "DIP-6739N" "DIP-6298N"
>
> # Map DIP IDs to SGD ORFs
> DIP.tb$ORF1 = dic$ORF[match(DIP.tb$A.DIP, dic$DIP3)]
> head(DIP.tb$ORF1)
[1] "YER032W" "YDR422C" "YBR097W" "YDR227W" "YDL014W" "YGL116W"
> summary(DIP.tb$ORF1) # it ran; note this summary shows only length/class, so unmatched IDs (NA from match) are not reported here.
Length Class Mode
22584 character character
>
> DIP.tb$ORF2 = dic$ORF[match(DIP.tb$B.DIP, dic$DIP3)]
> head(DIP.tb$ORF2)
[1] "YDR195W" "YDR477W" "YLR240W" "YDR227W" "YLL011W" "YGL086W"
> summary(DIP.tb$ORF2)
Length Class Mode
22584 character character
>
> head(DIP.tb[,1:10])
ID.interactor.A
1 DIP-844N|refseq:NP_010949|uniprotkb:P40020
2 DIP-777N|refseq:NP_010710|uniprotkb:P32578
3 DIP-814N|refseq:NP_009655|uniprotkb:P22219
4 DIP-33N|refseq:NP_010513|uniprotkb:P11978
5 DIP-698N|refseq:NP_010270|uniprotkb:P15646
6 DIP-982N|refseq:NP_011399|uniprotkb:P26309
ID.interactor.B Alt..ID.interactor.A
1 DIP-871N|refseq:NP_010481|uniprotkb:P42073 -
2 DIP-18N|refseq:NP_010765|uniprotkb:P06782 -
3 DIP-97N|refseq:NP_013341|uniprotkb:P22543 -
4 DIP-33N|refseq:NP_010513|uniprotkb:P11978 -
5 DIP-746N|refseq:NP_013090|uniprotkb:P33750 -
6 DIP-293N|refseq:NP_011429|uniprotkb:P40957 -
Alt..ID.interactor.B Alias.es..interactor.A Alias.es..interactor.B
1 - - -
2 - - -
3 - - -
4 - - -
5 - - -
6 - - -
Interaction.detection.method.s.
1 MI:0018(two hybrid)
2 MI:0018(two hybrid)|MI:0018(two hybrid)|MI:0019(coimmunoprecipitation)|MI:0018(two hybrid)|MI:0004(affinity chromatography technology)|MI:0019(coimmunoprecipitation)|MI:0676(tandem affinity purification)
3 MI:0019(coimmunoprecipitation)|MI:0030(cross-linking study)
4 MI:0018(two hybrid)
5 MI:0019(coimmunoprecipitation)
6 MI:0018(two hybrid)|MI:0018(two hybrid)
Publication.1st.author.s.
1 -
2 -
3 -
4 -
5 -
6 -
Publication.Identifier.s.
1 pubmed:9196079|pubmed:DIP-356S
2 pubmed:1496382|pubmed:DIP-448S|pubmed:9121458|pubmed:DIP-723S|pubmed:1496382|pubmed:DIP-448S|pubmed:7813428|pubmed:DIP-724S|pubmed:9121458|pubmed:DIP-723S|pubmed:7813428|pubmed:DIP-724S|pubmed:11805826|pubmed:DIP-1768S
3 pubmed:8387919|pubmed:DIP-391S|pubmed:8387919|pubmed:DIP-391S
4 pubmed:1946372|pubmed:DIP-50S
5 pubmed:8508778|pubmed:DIP-198S
6 pubmed:10848588|pubmed:DIP-1428S|pubmed:9461437|pubmed:DIP-189S
Taxid.interactor.A
1 taxid:4932(Saccharomyces cerevisiae)
2 taxid:4932(Saccharomyces cerevisiae)
3 taxid:4932(Saccharomyces cerevisiae)
4 taxid:4932(Saccharomyces cerevisiae)
5 taxid:4932(Saccharomyces cerevisiae)
6 taxid:4932(Saccharomyces cerevisiae)
> DIP.tb= DIP.tb[, c(21,22,19,20,1:18)]
> head(DIP.tb[,1:6])
ORF1 ORF2 A.DIP B.DIP ID.interactor.A
1 YER032W YDR195W DIP-844N DIP-871N DIP-844N|refseq:NP_010949|uniprotkb:P40020
2 YDR422C YDR477W DIP-777N DIP-18N DIP-777N|refseq:NP_010710|uniprotkb:P32578
3 YBR097W YLR240W DIP-814N DIP-97N DIP-814N|refseq:NP_009655|uniprotkb:P22219
4 YDR227W YDR227W DIP-33N DIP-33N DIP-33N|refseq:NP_010513|uniprotkb:P11978
5 YDL014W YLL011W DIP-698N DIP-746N DIP-698N|refseq:NP_010270|uniprotkb:P15646
6 YGL116W YGL086W DIP-982N DIP-293N DIP-982N|refseq:NP_011399|uniprotkb:P26309
ID.interactor.B
1 DIP-871N|refseq:NP_010481|uniprotkb:P42073
2 DIP-18N|refseq:NP_010765|uniprotkb:P06782
3 DIP-97N|refseq:NP_013341|uniprotkb:P22543
4 DIP-33N|refseq:NP_010513|uniprotkb:P11978
5 DIP-746N|refseq:NP_013090|uniprotkb:P33750
6 DIP-293N|refseq:NP_011429|uniprotkb:P40957
>
> write.csv(DIP.tb, "_SceDIP_withORFs.20131219.csv",row.names=F)
>
No comments:
Post a Comment