Wednesday, January 19, 2011

Generate all possible proteins from ambiguous DNA

This had me stumped for a while, but this works pretty well.  It does NOT handle stop codons or gap characters like '-'.  Requires Biopython.


import itertools
from Bio.Seq import Seq
from Bio.Data import CodonTable
from Bio.Data import IUPACData
# Takes Bio.Seq.Seq object as input
# Returns list of all possible proteins
# Assumes sequence is in frame +1
def generateProtFromAmbiguousDNA(s):
   """Return every protein that an ambiguous DNA sequence could encode.

   The sequence is read in frame +1 and split into consecutive codons.
   Each codon is expanded, via the IUPAC ambiguity table, into the amino
   acids it may code for under the Standard codon table, and the
   per-codon alternatives are then combined in every possible way.

   Args:
      s: DNA sequence as a Bio.Seq.Seq object (a plain string also works,
         because the input is converted with str()).

   Returns:
      A list of protein strings, one per combination of the amino acids
      possible at each codon.  An empty sequence yields [""].

   Raises:
      ValueError: if the sequence length is not a multiple of 3.

   Stop codons and gap characters such as '-' are not handled; the
   underlying Biopython lookup is expected to raise for them.
   """
   std_nt = CodonTable.unambiguous_dna_by_name["Standard"]
   nonstd = IUPACData.ambiguous_dna_values
   # Convert once, outside the loop.  str() is used instead of
   # Seq.tostring(), which has been removed from current Biopython.
   dna = str(s)
   if len(dna) % 3 != 0:
      # A trailing partial codon cannot be translated; fail with a clear
      # message rather than an obscure unpacking error further down.
      raise ValueError(
         "sequence length (%d) is not a multiple of 3" % len(dna))
   # One list of candidate amino acids per codon, in sequence order.
   aa_trans = [
      CodonTable.list_possible_proteins(
         dna[i:i + 3], std_nt.forward_table, nonstd)
      for i in range(0, len(dna), 3)
   ]
   # The Cartesian product across codons enumerates every protein.
   return ["".join(combo) for combo in itertools.product(*aa_trans)]
def main():
   """Print the proteins encoded by a sample ambiguous DNA sequence."""
   a = Seq('ATGGCARTTGTAHAC')
   # Single-argument print(...) produces the same output under the
   # Python 2 print statement and the Python 3 print function.  str() is
   # used instead of Seq.tostring(), which has been removed from current
   # Biopython.  The two spaces reproduce the original output exactly.
   print("DNA:  " + str(a))
   print("Proteins:")
   for protein in generateProtFromAmbiguousDNA(a):
      print(protein)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
   main()

No comments:

Post a Comment