Thursday, December 19, 2013

Convert DIP IDs to SGD ORFs, code with runningn results

$ R -f _20131219.convertDIPtoSGD.R 


> #convert DIP IDs to SGD ORFs
>
> # setwd("~/databases/DIP/sandbox")
> list.files()
 [1] "_20131219.convertDIPtoSGD.R" "_matchedSGD2DIP_IDs.txt"   
 [3] "_out_DIP2SGD.txt"            "_out_SGD2DIP.txt"          
 [5] "_test.faa"                   "_tmp.txt"                  
 [7] "fasta20131201.seq"           "fasta20131201.seq.phr"     
 [9] "fasta20131201.seq.pin"       "fasta20131201.seq.psq"     
[11] "formatdb.log"                "s288c-prot.faa"            
[13] "s288c-prot.faa.phr"          "s288c-prot.faa.pin"        
[15] "s288c-prot.faa.psq"          "Scere20131031.csv"         
>
> dic = read.table("_matchedSGD2DIP_IDs.txt", sep="\t", colClass=c("character","character"))
> names(dic) = c("ORF","DIP")
> str(dic)
'data.frame':    5160 obs. of  2 variables:
 $ ORF: chr  "Q0045" "Q0050" "Q0080" "Q0085" ...
 $ DIP: chr  "dip:DIP-6088N|uniprot:P98002" "dip:DIP-8242N|refseq:NP_009310|uniprot:P03875" "dip:DIP-3040N|refseq:NP_009312|uniprot:P00856" "dip:DIP-3038N|refseq:NP_009313|uniprot:P00854" ...
>
> DIP.tb = read.csv("Scere20131031.csv", colClass=c(rep("character",18)))
> str(DIP.tb)
'data.frame':    22584 obs. of  18 variables:
 $ ID.interactor.A                : chr  "DIP-844N|refseq:NP_010949|uniprotkb:P40020" "DIP-777N|refseq:NP_010710|uniprotkb:P32578" "DIP-814N|refseq:NP_009655|uniprotkb:P22219" "DIP-33N|refseq:NP_010513|uniprotkb:P11978" ...
 $ ID.interactor.B                : chr  "DIP-871N|refseq:NP_010481|uniprotkb:P42073" "DIP-18N|refseq:NP_010765|uniprotkb:P06782" "DIP-97N|refseq:NP_013341|uniprotkb:P22543" "DIP-33N|refseq:NP_010513|uniprotkb:P11978" ...
 $ Alt..ID.interactor.A           : chr  " -" " -" " -" "-" ...
 $ Alt..ID.interactor.B           : chr  "-" "-" "-" "-" ...
 $ Alias.es..interactor.A         : chr  "-" "-" "-" "-" ...
 $ Alias.es..interactor.B         : chr  "-" "-" "-" "-" ...
 $ Interaction.detection.method.s.: chr  "MI:0018(two hybrid)" "MI:0018(two hybrid)|MI:0018(two hybrid)|MI:0019(coimmunoprecipitation)|MI:0018(two hybrid)|MI:0004(affinity chromatography tech"| __truncated__ "MI:0019(coimmunoprecipitation)|MI:0030(cross-linking study)" "MI:0018(two hybrid)" ...
 $ Publication.1st.author.s.      : chr  "-" "-" "-" "-" ...
 $ Publication.Identifier.s.      : chr  "pubmed:9196079|pubmed:DIP-356S" "pubmed:1496382|pubmed:DIP-448S|pubmed:9121458|pubmed:DIP-723S|pubmed:1496382|pubmed:DIP-448S|pubmed:7813428|pubmed:DIP-724S|pub"| __truncated__ "pubmed:8387919|pubmed:DIP-391S|pubmed:8387919|pubmed:DIP-391S" "pubmed:1946372|pubmed:DIP-50S" ...
 $ Taxid.interactor.A             : chr  "taxid:4932(Saccharomyces cerevisiae)" "taxid:4932(Saccharomyces cerevisiae)" "taxid:4932(Saccharomyces cerevisiae)" "taxid:4932(Saccharomyces cerevisiae)" ...
 $ Taxid.interactor.B             : chr  "taxid:4932(Saccharomyces cerevisiae)" "taxid:4932(Saccharomyces cerevisiae)" "taxid:4932(Saccharomyces cerevisiae)" "taxid:4932(Saccharomyces cerevisiae)" ...
 $ Interaction.type.s.            : chr  "MI:0218(physical interaction)" "MI:0218(physical interaction)|MI:0218(physical interaction)|MI:0218(physical interaction)|MI:0218(physical interaction)|MI:0218"| __truncated__ "MI:0218(physical interaction)|MI:0407(direct interaction)" "MI:0218(physical interaction)" ...
 $ Source.database.s.             : chr  "MI:0465(dip)" "MI:0465(dip)" "MI:0465(dip)" "MI:0465(dip)" ...
 $ Interaction.identifier.s.      : chr  "DIP-536E" "DIP-539E" "DIP-540E" "DIP-541E" ...
 $ Confidence.value.s.            : chr  "dip-quality-status:core" "dip-quality-status:core" "dip-quality-status:core" "dip-quality-status:core" ...
 $ Processing.Status              : chr  "dip:0002(small scale)" "dip:0002(small scale)|dip:0002(small scale)|dip:0002(small scale)|dip:0002(small scale)|dip:0002(small scale)|dip:0002(small sc"| __truncated__ "dip:0002(small scale)|dip:0002(small scale)" "dip:0002(small scale)" ...
 $ X                              : chr  "" "" "" "" ...
 $ X.1                            : chr  "-" "-" "-" "-" ...
> names(DIP.tb)
 [1] "ID.interactor.A"                 "ID.interactor.B"               
 [3] "Alt..ID.interactor.A"            "Alt..ID.interactor.B"          
 [5] "Alias.es..interactor.A"          "Alias.es..interactor.B"        
 [7] "Interaction.detection.method.s." "Publication.1st.author.s."     
 [9] "Publication.Identifier.s."       "Taxid.interactor.A"            
[11] "Taxid.interactor.B"              "Interaction.type.s."           
[13] "Source.database.s."              "Interaction.identifier.s."     
[15] "Confidence.value.s."             "Processing.Status"             
[17] "X"                               "X.1"                           
> head(DIP.tb)
                             ID.interactor.A
1 DIP-844N|refseq:NP_010949|uniprotkb:P40020
2 DIP-777N|refseq:NP_010710|uniprotkb:P32578
3 DIP-814N|refseq:NP_009655|uniprotkb:P22219
4  DIP-33N|refseq:NP_010513|uniprotkb:P11978
5 DIP-698N|refseq:NP_010270|uniprotkb:P15646
6 DIP-982N|refseq:NP_011399|uniprotkb:P26309
                             ID.interactor.B Alt..ID.interactor.A
1 DIP-871N|refseq:NP_010481|uniprotkb:P42073                    -
2  DIP-18N|refseq:NP_010765|uniprotkb:P06782                    -
3  DIP-97N|refseq:NP_013341|uniprotkb:P22543                    -
4  DIP-33N|refseq:NP_010513|uniprotkb:P11978                    -
5 DIP-746N|refseq:NP_013090|uniprotkb:P33750                    -
6 DIP-293N|refseq:NP_011429|uniprotkb:P40957                    -
  Alt..ID.interactor.B Alias.es..interactor.A Alias.es..interactor.B
1                    -                      -                      -
2                    -                      -                      -
3                    -                      -                      -
4                    -                      -                      -
5                    -                      -                      -
6                    -                      -                      -
                                                                                                                                                                              Interaction.detection.method.s.
1                                                                                                                                                                                         MI:0018(two hybrid)
2 MI:0018(two hybrid)|MI:0018(two hybrid)|MI:0019(coimmunoprecipitation)|MI:0018(two hybrid)|MI:0004(affinity chromatography technology)|MI:0019(coimmunoprecipitation)|MI:0676(tandem affinity purification)
3                                                                                                                                                 MI:0019(coimmunoprecipitation)|MI:0030(cross-linking study)
4                                                                                                                                                                                         MI:0018(two hybrid)
5                                                                                                                                                                              MI:0019(coimmunoprecipitation)
6                                                                                                                                                                     MI:0018(two hybrid)|MI:0018(two hybrid)
  Publication.1st.author.s.
1                         -
2                         -
3                         -
4                         -
5                         -
6                         -
                                                                                                                                                                                                   Publication.Identifier.s.
1                                                                                                                                                                                             pubmed:9196079|pubmed:DIP-356S
2 pubmed:1496382|pubmed:DIP-448S|pubmed:9121458|pubmed:DIP-723S|pubmed:1496382|pubmed:DIP-448S|pubmed:7813428|pubmed:DIP-724S|pubmed:9121458|pubmed:DIP-723S|pubmed:7813428|pubmed:DIP-724S|pubmed:11805826|pubmed:DIP-1768S
3                                                                                                                                                              pubmed:8387919|pubmed:DIP-391S|pubmed:8387919|pubmed:DIP-391S
4                                                                                                                                                                                              pubmed:1946372|pubmed:DIP-50S
5                                                                                                                                                                                             pubmed:8508778|pubmed:DIP-198S
6                                                                                                                                                            pubmed:10848588|pubmed:DIP-1428S|pubmed:9461437|pubmed:DIP-189S
                    Taxid.interactor.A                   Taxid.interactor.B
1 taxid:4932(Saccharomyces cerevisiae) taxid:4932(Saccharomyces cerevisiae)
2 taxid:4932(Saccharomyces cerevisiae) taxid:4932(Saccharomyces cerevisiae)
3 taxid:4932(Saccharomyces cerevisiae) taxid:4932(Saccharomyces cerevisiae)
4 taxid:4932(Saccharomyces cerevisiae) taxid:4932(Saccharomyces cerevisiae)
5 taxid:4932(Saccharomyces cerevisiae) taxid:4932(Saccharomyces cerevisiae)
6 taxid:4932(Saccharomyces cerevisiae) taxid:4932(Saccharomyces cerevisiae)
                                                                                                                                                                                                Interaction.type.s.
1                                                                                                                                                                                     MI:0218(physical interaction)
2 MI:0218(physical interaction)|MI:0218(physical interaction)|MI:0218(physical interaction)|MI:0218(physical interaction)|MI:0218(physical interaction)|MI:0218(physical interaction)|MI:0915(physical association)
3                                                                                                                                                         MI:0218(physical interaction)|MI:0407(direct interaction)
4                                                                                                                                                                                     MI:0218(physical interaction)
5                                                                                                                                                                                     MI:0218(physical interaction)
6                                                                                                                                                       MI:0218(physical interaction)|MI:0218(physical interaction)
  Source.database.s. Interaction.identifier.s.     Confidence.value.s.
1       MI:0465(dip)                  DIP-536E dip-quality-status:core
2       MI:0465(dip)                  DIP-539E dip-quality-status:core
3       MI:0465(dip)                  DIP-540E dip-quality-status:core
4       MI:0465(dip)                  DIP-541E dip-quality-status:core
5       MI:0465(dip)                  DIP-542E dip-quality-status:core
6       MI:0465(dip)                  DIP-590E dip-quality-status:core
                                                                                                                                              Processing.Status
1                                                                                                                                         dip:0002(small scale)
2 dip:0002(small scale)|dip:0002(small scale)|dip:0002(small scale)|dip:0002(small scale)|dip:0002(small scale)|dip:0002(small scale)|dip:0005(high throughput)
3                                                                                                                   dip:0002(small scale)|dip:0002(small scale)
4                                                                                                                                         dip:0002(small scale)
5                                                                                                                                         dip:0002(small scale)
6                                                                                                                   dip:0002(small scale)|dip:0002(small scale)
  X X.1
1     -
2     -
3     -
4     -
5     -
6     -
> summary(DIP.tb)
 ID.interactor.A    ID.interactor.B    Alt..ID.interactor.A
 Length:22584       Length:22584       Length:22584       
 Class :character   Class :character   Class :character   
 Mode  :character   Mode  :character   Mode  :character   
 Alt..ID.interactor.B Alias.es..interactor.A Alias.es..interactor.B
 Length:22584         Length:22584           Length:22584         
 Class :character     Class :character       Class :character     
 Mode  :character     Mode  :character       Mode  :character     
 Interaction.detection.method.s. Publication.1st.author.s.
 Length:22584                    Length:22584            
 Class :character                Class :character        
 Mode  :character                Mode  :character        
 Publication.Identifier.s. Taxid.interactor.A Taxid.interactor.B
 Length:22584              Length:22584       Length:22584     
 Class :character          Class :character   Class :character 
 Mode  :character          Mode  :character   Mode  :character 
 Interaction.type.s. Source.database.s. Interaction.identifier.s.
 Length:22584        Length:22584       Length:22584            
 Class :character    Class :character   Class :character        
 Mode  :character    Mode  :character   Mode  :character        
 Confidence.value.s. Processing.Status       X                 X.1          
 Length:22584        Length:22584       Length:22584       Length:22584     
 Class :character    Class :character   Class :character   Class :character 
 Mode  :character    Mode  :character   Mode  :character   Mode  :character 
>
> #There non-Sce entries in DIP data. I need to remove them from yeast PPI
> # ... to do
>
> #DIP id formats in interaction and sequence files are not the same
> DIP.tb$ID.interactor.A[1:10]
 [1] "DIP-844N|refseq:NP_010949|uniprotkb:P40020"
 [2] "DIP-777N|refseq:NP_010710|uniprotkb:P32578"
 [3] "DIP-814N|refseq:NP_009655|uniprotkb:P22219"
 [4] "DIP-33N|refseq:NP_010513|uniprotkb:P11978"
 [5] "DIP-698N|refseq:NP_010270|uniprotkb:P15646"
 [6] "DIP-982N|refseq:NP_011399|uniprotkb:P26309"
 [7] "DIP-982N|refseq:NP_011399|uniprotkb:P26309"
 [8] "DIP-948N|uniprotkb:Q80US8"                
 [9] "DIP-314N|refseq:NP_012619|uniprotkb:P18852"
[10] "DIP-954N|refseq:NP_014855|uniprotkb:P18851"
> #[1] "DIP-844N|refseq:NP_010949|uniprotkb:P40020" "DIP-777N|refseq:NP_010710|uniprotkb:P32578"
> #[3] "DIP-814N|refseq:NP_009655|uniprotkb:P22219" "DIP-33N|refseq:NP_010513|uniprotkb:P11978"
> DIP.tb$A.DIP = gsub(pattern="\\|.+", replacement="", DIP.tb$ID.interactor.A)
> head(DIP.tb$A.DIP)
[1] "DIP-844N" "DIP-777N" "DIP-814N" "DIP-33N"  "DIP-698N" "DIP-982N"
> DIP.tb$B.DIP = gsub(pattern="\\|.+", replacement="", DIP.tb$ID.interactor.B)
> head(DIP.tb$B.DIP)
[1] "DIP-871N" "DIP-18N"  "DIP-97N"  "DIP-33N"  "DIP-746N" "DIP-293N"
>
> dic$DIP[1:5]
[1] "dip:DIP-6088N|uniprot:P98002"                
[2] "dip:DIP-8242N|refseq:NP_009310|uniprot:P03875"
[3] "dip:DIP-3040N|refseq:NP_009312|uniprot:P00856"
[4] "dip:DIP-3038N|refseq:NP_009313|uniprot:P00854"
[5] "dip:DIP-350N|refseq:YP_209217|uniprot:P00157"
> #[1] "dip:DIP-6088N|uniprot:P98002"                  "dip:DIP-8242N|refseq:NP_009310|uniprot:P03875"
> #[3] "dip:DIP-3040N|refseq:NP_009312|uniprot:P00856" "dip:DIP-3038N|refseq:NP_009313|uniprot:P00854"
> #[5] "dip:DIP-350N|refseq:YP_209217|uniprot:P00157"
>
> dic$DIP2 = gsub(pattern="dip:", replacement="", dic$DIP)
> head(dic$DIP2)
[1] "DIP-6088N|uniprot:P98002"                
[2] "DIP-8242N|refseq:NP_009310|uniprot:P03875"
[3] "DIP-3040N|refseq:NP_009312|uniprot:P00856"
[4] "DIP-3038N|refseq:NP_009313|uniprot:P00854"
[5] "DIP-350N|refseq:YP_209217|uniprot:P00157"
[6] "DIP-3041N|refseq:NP_009319|uniprot:P61829"
>
> dic$DIP3 = gsub(pattern="\\|.+", replacement="", dic$DIP2)
> head(dic$DIP3)
[1] "DIP-6088N" "DIP-8242N" "DIP-3040N" "DIP-3038N" "DIP-350N"  "DIP-3041N"
>
> intersect(dic$DIP3, DIP.tb$A.DIP)[1:10] #it worked.
 [1] "DIP-3040N" "DIP-3038N" "DIP-3041N" "DIP-7592N" "DIP-7698N" "DIP-518N"
 [7] "DIP-6298N" "DIP-6445N" "DIP-2750N" "DIP-2253N"
> intersect(dic$DIP3, DIP.tb$B.DIP)[1:10] #it worked.
 [1] "DIP-8242N" "DIP-3040N" "DIP-3038N" "DIP-3041N" "DIP-7592N" "DIP-8133N"
 [7] "DIP-7698N" "DIP-518N"  "DIP-6739N" "DIP-6298N"
>
> # MAP DIP to SGD ORFs
> DIP.tb$ORF1 = dic$ORF[match(DIP.tb$A.DIP, dic$DIP3)]
> head(DIP.tb$ORF1)
[1] "YER032W" "YDR422C" "YBR097W" "YDR227W" "YDL014W" "YGL116W"
> summary(DIP.tb$ORF1) #it worked.
   Length     Class      Mode
    22584 character character
>
> DIP.tb$ORF2 = dic$ORF[match(DIP.tb$B.DIP, dic$DIP3)]
> head(DIP.tb$ORF2)
[1] "YDR195W" "YDR477W" "YLR240W" "YDR227W" "YLL011W" "YGL086W"
> summary(DIP.tb$ORF2)
   Length     Class      Mode
    22584 character character
>
> head(DIP.tb[,1:10])
                             ID.interactor.A
1 DIP-844N|refseq:NP_010949|uniprotkb:P40020
2 DIP-777N|refseq:NP_010710|uniprotkb:P32578
3 DIP-814N|refseq:NP_009655|uniprotkb:P22219
4  DIP-33N|refseq:NP_010513|uniprotkb:P11978
5 DIP-698N|refseq:NP_010270|uniprotkb:P15646
6 DIP-982N|refseq:NP_011399|uniprotkb:P26309
                             ID.interactor.B Alt..ID.interactor.A
1 DIP-871N|refseq:NP_010481|uniprotkb:P42073                    -
2  DIP-18N|refseq:NP_010765|uniprotkb:P06782                    -
3  DIP-97N|refseq:NP_013341|uniprotkb:P22543                    -
4  DIP-33N|refseq:NP_010513|uniprotkb:P11978                    -
5 DIP-746N|refseq:NP_013090|uniprotkb:P33750                    -
6 DIP-293N|refseq:NP_011429|uniprotkb:P40957                    -
  Alt..ID.interactor.B Alias.es..interactor.A Alias.es..interactor.B
1                    -                      -                      -
2                    -                      -                      -
3                    -                      -                      -
4                    -                      -                      -
5                    -                      -                      -
6                    -                      -                      -
                                                                                                                                                                              Interaction.detection.method.s.
1                                                                                                                                                                                         MI:0018(two hybrid)
2 MI:0018(two hybrid)|MI:0018(two hybrid)|MI:0019(coimmunoprecipitation)|MI:0018(two hybrid)|MI:0004(affinity chromatography technology)|MI:0019(coimmunoprecipitation)|MI:0676(tandem affinity purification)
3                                                                                                                                                 MI:0019(coimmunoprecipitation)|MI:0030(cross-linking study)
4                                                                                                                                                                                         MI:0018(two hybrid)
5                                                                                                                                                                              MI:0019(coimmunoprecipitation)
6                                                                                                                                                                     MI:0018(two hybrid)|MI:0018(two hybrid)
  Publication.1st.author.s.
1                         -
2                         -
3                         -
4                         -
5                         -
6                         -
                                                                                                                                                                                                   Publication.Identifier.s.
1                                                                                                                                                                                             pubmed:9196079|pubmed:DIP-356S
2 pubmed:1496382|pubmed:DIP-448S|pubmed:9121458|pubmed:DIP-723S|pubmed:1496382|pubmed:DIP-448S|pubmed:7813428|pubmed:DIP-724S|pubmed:9121458|pubmed:DIP-723S|pubmed:7813428|pubmed:DIP-724S|pubmed:11805826|pubmed:DIP-1768S
3                                                                                                                                                              pubmed:8387919|pubmed:DIP-391S|pubmed:8387919|pubmed:DIP-391S
4                                                                                                                                                                                              pubmed:1946372|pubmed:DIP-50S
5                                                                                                                                                                                             pubmed:8508778|pubmed:DIP-198S
6                                                                                                                                                            pubmed:10848588|pubmed:DIP-1428S|pubmed:9461437|pubmed:DIP-189S
                    Taxid.interactor.A
1 taxid:4932(Saccharomyces cerevisiae)
2 taxid:4932(Saccharomyces cerevisiae)
3 taxid:4932(Saccharomyces cerevisiae)
4 taxid:4932(Saccharomyces cerevisiae)
5 taxid:4932(Saccharomyces cerevisiae)
6 taxid:4932(Saccharomyces cerevisiae)
> DIP.tb= DIP.tb[, c(21,22,19,20,1:18)]
> head(DIP.tb[,1:6])
     ORF1    ORF2    A.DIP    B.DIP                            ID.interactor.A
1 YER032W YDR195W DIP-844N DIP-871N DIP-844N|refseq:NP_010949|uniprotkb:P40020
2 YDR422C YDR477W DIP-777N  DIP-18N DIP-777N|refseq:NP_010710|uniprotkb:P32578
3 YBR097W YLR240W DIP-814N  DIP-97N DIP-814N|refseq:NP_009655|uniprotkb:P22219
4 YDR227W YDR227W  DIP-33N  DIP-33N  DIP-33N|refseq:NP_010513|uniprotkb:P11978
5 YDL014W YLL011W DIP-698N DIP-746N DIP-698N|refseq:NP_010270|uniprotkb:P15646
6 YGL116W YGL086W DIP-982N DIP-293N DIP-982N|refseq:NP_011399|uniprotkb:P26309
                             ID.interactor.B
1 DIP-871N|refseq:NP_010481|uniprotkb:P42073
2  DIP-18N|refseq:NP_010765|uniprotkb:P06782
3  DIP-97N|refseq:NP_013341|uniprotkb:P22543
4  DIP-33N|refseq:NP_010513|uniprotkb:P11978
5 DIP-746N|refseq:NP_013090|uniprotkb:P33750
6 DIP-293N|refseq:NP_011429|uniprotkb:P40957
>
> write.csv(DIP.tb, "_SceDIP_withORFs.20131219.csv",row.names=F)
>

No comments:

Post a Comment