[BioPython] How to check codon usage for specific amino acid positions in a given set of CDS sequences

Thu Jan 15 23:46:52 EST 2009

Peter,
Wow! The code(for positional frequency of codons) works 4 me. Thanks a ton.
While we are at it please allow me to ask you another question related to
downloading CDS sequences. I have copied one script from mailing list for
downloading CDS given from Genbank record of protein sequence written by
Andrew Dalke. I modified it a little bit to include few more exceptions and
it work in most of the cases but it's still not bug free. Giving errors
frequently. I am copying both the script and errors. See if you can spot the
problem.. or can suggest better way of doing it.. 

----------------------------------------------------------------------------
----------------------------------------------------------------------------
-
from Bio import SeqIO
from Bio.Seq import Seq
from Bio import GenBank
from Bio.GenBank import LocationParser
from EUtils import DBIds, DBIdsClient
from Bio.SeqRecord import SeqRecord
import StringIO
from Bio import Entrez
from Bio.Alphabet import IUPAC
#Animesh Agrawal Email:animesh.agrawal at anu.edu.au

print "This program extracts the CDS of a given Genbank protein file\n"

File_Input = raw_input("Give the name of input file:\t")

File_Output = raw_input("Give the name of output file:\t")

gb_handle = open(File_Input, "r")

feature_parser = GenBank.FeatureParser ()

iterator = GenBank.Iterator (gb_handle, feature_parser)

Out_file= open(File_Output, "w")

def lookup(name, seq_start, seq_stop):
     h = DBIdsClient.from_dbids(DBIds(db = "nucleotide", ids = [name]))
     return h.efetch(retmode = "text", rettype = "fasta",
                     seq_start = seq_start, seq_stop = seq_stop).read()

def make_rc_record(record) :
    """Returns a new SeqRecord with the reverse complement sequence."""
    rc_rec = SeqRecord(seq = record.seq.reverse_complement(), \
                       id = "rc_" + record.id, \
                       name = "rc_" + record.name, \
                       description = "reverse complement")
    return rc_rec

while 1:
    cur_entry = iterator.next ()
    Genbank_entry = str(cur_entry)
    if cur_entry is None:
        break

    for feature in cur_entry.features :
        if feature.type == "CDS":
                 loc = feature.qualifiers["coded_by"][0]
                 Temp1=loc
                 Temp2 = Temp1.split('(')
                  # for genbank record like this
			# coded_by="complement(NC_001713.1:67323..68795)"
			if Temp2[0]=="complement":   
                      Temp3 = Temp2[1].replace(')', '')

parsed_loc_complement=LocationParser.parse(LocationParser.scan(Temp3))
                      assert isinstance(parsed_loc_complement,
LocationParser.AbsoluteLocation)
                      assert
isinstance(parsed_loc_complement.local_location, LocationParser.Range)
                      seq_start = parsed_loc_complement.local_location.low
                      seq_stop = parsed_loc_complement.local_location.high
                      assert isinstance(seq_start, LocationParser.Integer)
                      assert isinstance(seq_stop, LocationParser.Integer)
                      seq_start = seq_start.val
                      seq_stop = seq_stop.val
                      Temp4=lookup(parsed_loc_complement.path.accession,
seq_start, seq_stop)
                      fasta_handle = StringIO.StringIO(Temp4)
                      record = SeqIO.read(fasta_handle, "fasta")
                      record = make_rc_record(record)
                      Out_file.write(record.format("fasta"))
                      break
                  # for genbank record like this
			# coded_by="join(NC_008114.1:51934..52632,
NC_008114.1:54315..55043)"
			elif Temp2[0]=="join":
                      loc=loc.replace('join', '')
                      loc= loc.replace('(', '')
                      loc= loc.replace(')', '')
                      loc = loc.split(',')
                      loc1=loc[0].split(':')
                      loc2=loc[1].split(':')
                      loc3=loc1[1].split('..')
                      loc4=loc2[1].split('..')
                      loc5=int(loc3[0])-1
                      loc6=int(loc3[1])
                      loc7=int(loc4[0])-1
                      loc8=int(loc4[1])
                      handle = Entrez.efetch(db="nucleotide", id=loc1[0],
rettype="genbank")
                      record=SeqIO.read(handle, "genbank")
                      seq1 = record.seq[loc5:loc6]
                      seq2 = record.seq[loc7:loc8]
                      record.seq = seq1+seq2
                      Out_file.write(record.format("fasta"))
                      break
                  # for genbank record like this
			# coded_by="FM207547.1:<1..1443"
			else:

parsed_loc=LocationParser.parse(LocationParser.scan(loc))
                      assert isinstance(parsed_loc,
LocationParser.AbsoluteLocation)
                      assert isinstance(parsed_loc.local_location,
LocationParser.Range)
                      seq_start = parsed_loc.local_location.low
                      seq_stop = parsed_loc.local_location.high
                      assert isinstance(seq_start, LocationParser.Integer)
                      assert isinstance(seq_stop, LocationParser.Integer)
                      seq_start = seq_start.val
                      seq_stop = seq_stop.val
                      Out_file.write(lookup(parsed_loc.path.accession,
seq_start, seq_stop))
                      break
       # for swissprot entries in Genbank 
	 elif Genbank_entry.find('swissprot') >= 0:                 
           Entry = cur_entry.annotations
           Entry = str(Entry)
           Entry = Entry.split('xrefs')
           Entry1 =Entry[1].split(',')
           Entry2 = Entry1[0].split(':')
           handle = Entrez.efetch(db="nucleotide", id=Entry2[1],
rettype="genbank")
           data=SeqIO.read(handle, "genbank")
           for feature in data.features :
               if feature.type == "gene":
                  Gene_id= feature.qualifiers['gene'] [0]
                  if Gene_id == "rbcL":
                   temp = str(feature.location)
                   temp = temp.replace(':', '..')
                   temp = temp.replace('[', '')
                   temp = temp.replace(']', '')
                   if feature.strand == -1:
                      temp1 = data.id+':<'+temp
                      parsed_loc_complement =
LocationParser.parse(LocationParser.scan(temp1))    
                      assert isinstance(parsed_loc_complement,
LocationParser.AbsoluteLocation)
                      assert
isinstance(parsed_loc_complement.local_location, LocationParser.Range)
                      seq_start = parsed_loc_complement.local_location.low
                      seq_stop = parsed_loc_complement.local_location.high
                      assert isinstance(seq_start, LocationParser.Integer)
                      assert isinstance(seq_stop, LocationParser.Integer)
                      seq_start = seq_start.val+1
                      seq_stop = seq_stop.val
                      Temp4=lookup(parsed_loc_complement.path.accession,
seq_start, seq_stop)
                      fasta_handle = StringIO.StringIO(Temp4)
                      record = SeqIO.read(fasta_handle, "fasta")
                      record = make_rc_record(record)
                      #print (record.format("fasta"))
                      Out_file.write(record.format("fasta"))
                      break
                   else:
                      temp2 = data.id+':<'+temp

parsed_loc=LocationParser.parse(LocationParser.scan(temp2))
                      assert isinstance(parsed_loc,
LocationParser.AbsoluteLocation)
                      assert isinstance(parsed_loc.local_location,
LocationParser.Range)
                      seq_start = parsed_loc.local_location.low
                      seq_stop = parsed_loc.local_location.high
                      assert isinstance(seq_start, LocationParser.Integer)
                      assert isinstance(seq_stop, LocationParser.Integer)
                      seq_start = seq_start.val+1
                      seq_stop = seq_stop.val
                      #print (lookup(parsed_loc.path.accession, seq_start,
seq_stop))
                      Out_file.write(lookup(parsed_loc.path.accession,
seq_start, seq_stop))
                      break
           break

Out_file.close() 
----------------------------------------------------------------------------
----------------------------------------------------------------------------
-

Syntax error at or near `join' token
Traceback (most recent call last):
  File "C:\Documents and
Settings\Animesh\Desktop\sequences\Features_extraction_final.py", line 52,
in <module>
    parsed_loc_complement=LocationParser.parse(LocationParser.scan(Temp3))
  File "C:\Python25\lib\site-packages\Bio\GenBank\LocationParser.py", line
319, in parse
    return _cached_parser.parse(tokens)
  File "C:\Python25\Lib\site-packages\Bio\Parsers\spark.py", line 204, in
parse
    self.error(tokens[i-1])
  File "C:\Python25\Lib\site-packages\Bio\Parsers\spark.py", line 183, in
error
    raise SystemExit
SystemExit
----------------------------------------------------------------------------
----------------------------------------------------------------------------
-
Traceback (most recent call last):
  File "C:\Documents and
Settings\Animesh\Desktop\sequences\Features_extraction_final.py", line 76,
in <module>
    loc5=int(loc3[0])-1
ValueError: invalid literal for int() with base 10: '<1'
----------------------------------------------------------------------------
----------------------------------------------------------------------------
-

Animesh