Wednesday, January 2, 2013

First try to enumerate truncated trees for bacilli

import warnings
import re
from Bio import Phylo

stree = "((Bha,Bcl), (Bpu,(Bli,(Bam,(Bmo,Bsu)))),  (Bwe,(Bce,(Ban,Bth))))"
stree2 = re.sub( r'[\(\)\s+]', '', stree)
taxa = re.split( ',', stree2)
outgrouptaxa = taxa[0:2]
bsutaxa = taxa[2:7]
bcetaxa = taxa[7:12]
bacillustaxa = bsutaxa + bcetaxa

#tmpfl = "/tmp/_tmpfile2013Jan2.nwk"
#tmpFH = open( tmpfl, 'w')
#tmpFH.write( stree + ";\n" )
#tmpFH.close()
#mastertree = Phylo.read(tmpfl, 'newick')


def lookup_by_names(tree):
    names = {}
    for clade in tree.get_terminals():
        if clade.name:
            if clade.name in names:
                raise ValueError("Duplicate key: %s" % clade.name)
            names[clade.name] = clade
    return names

from StringIO import StringIO
#mastertree = Phylo.read(StringIO(stree), 'newick')
#names = lookup_by_names(mastertree)

#single leaf-node deletion
for node in bacillustaxa:
    tree = Phylo.read(StringIO(stree), 'newick')
    names = lookup_by_names(tree)
    tree.prune(names[node])
    Phylo.write( tree, 'sandbox/-' + node + '.nwk', 'newick')

#double leaf node deletion
for i in range(len(bacillustaxa)):
    for j in range( (i+1), len(bacillustaxa)):
        print i, j
        tree = Phylo.read(StringIO(stree), 'newick')
        names = lookup_by_names(tree)
        tree.prune(names[bacillustaxa[i]])
        tree.prune(names[bacillustaxa[j]])
        Phylo.write( tree, 'sandbox/_' + bacillustaxa[i] + bacillustaxa[j]+'.nwk', 'newick')


$ pwd
~coat.protein/ortholog.analysis/coatpaml/python/



This is not bad for a one-hour coding on a first try. I should try to write multiple trees in a single file. I should try a generic loop or more likely, a recursive call.





No comments:

Post a Comment