[BioPython] How to check codon usage for specific amino acid positions in a given set of CDS sequences
Animesh Agrawal
animesh.agrawal at anu.edu.au
Fri Jan 16 04:46:52 UTC 2009
Peter,
Wow! The code(for positional frequency of codons) works 4 me. Thanks a ton.
While we are at it please allow me to ask you another question related to
downloading CDS sequences. I have copied one script from mailing list for
downloading CDS given from Genbank record of protein sequence written by
Andrew Dalke. I modified it a little bit to include few more exceptions and
it work in most of the cases but it's still not bug free. Giving errors
frequently. I am copying both the script and errors. See if you can spot the
problem.. or can suggest better way of doing it..
----------------------------------------------------------------------------
----------------------------------------------------------------------------
-
from Bio import SeqIO
from Bio.Seq import Seq
from Bio import GenBank
from Bio.GenBank import LocationParser
from EUtils import DBIds, DBIdsClient
from Bio.SeqRecord import SeqRecord
import StringIO
from Bio import Entrez
from Bio.Alphabet import IUPAC
#Animesh Agrawal Email:animesh.agrawal at anu.edu.au
print "This program extracts the CDS of a given Genbank protein file\n"
File_Input = raw_input("Give the name of input file:\t")
File_Output = raw_input("Give the name of output file:\t")
gb_handle = open(File_Input, "r")
feature_parser = GenBank.FeatureParser ()
iterator = GenBank.Iterator (gb_handle, feature_parser)
Out_file= open(File_Output, "w")
def lookup(name, seq_start, seq_stop):
h = DBIdsClient.from_dbids(DBIds(db = "nucleotide", ids = [name]))
return h.efetch(retmode = "text", rettype = "fasta",
seq_start = seq_start, seq_stop = seq_stop).read()
def make_rc_record(record) :
"""Returns a new SeqRecord with the reverse complement sequence."""
rc_rec = SeqRecord(seq = record.seq.reverse_complement(), \
id = "rc_" + record.id, \
name = "rc_" + record.name, \
description = "reverse complement")
return rc_rec
while 1:
cur_entry = iterator.next ()
Genbank_entry = str(cur_entry)
if cur_entry is None:
break
for feature in cur_entry.features :
if feature.type == "CDS":
loc = feature.qualifiers["coded_by"][0]
Temp1=loc
Temp2 = Temp1.split('(')
# for genbank record like this
# coded_by="complement(NC_001713.1:67323..68795)"
if Temp2[0]=="complement":
Temp3 = Temp2[1].replace(')', '')
parsed_loc_complement=LocationParser.parse(LocationParser.scan(Temp3))
assert isinstance(parsed_loc_complement,
LocationParser.AbsoluteLocation)
assert
isinstance(parsed_loc_complement.local_location, LocationParser.Range)
seq_start = parsed_loc_complement.local_location.low
seq_stop = parsed_loc_complement.local_location.high
assert isinstance(seq_start, LocationParser.Integer)
assert isinstance(seq_stop, LocationParser.Integer)
seq_start = seq_start.val
seq_stop = seq_stop.val
Temp4=lookup(parsed_loc_complement.path.accession,
seq_start, seq_stop)
fasta_handle = StringIO.StringIO(Temp4)
record = SeqIO.read(fasta_handle, "fasta")
record = make_rc_record(record)
Out_file.write(record.format("fasta"))
break
# for genbank record like this
# coded_by="join(NC_008114.1:51934..52632,
NC_008114.1:54315..55043)"
elif Temp2[0]=="join":
loc=loc.replace('join', '')
loc= loc.replace('(', '')
loc= loc.replace(')', '')
loc = loc.split(',')
loc1=loc[0].split(':')
loc2=loc[1].split(':')
loc3=loc1[1].split('..')
loc4=loc2[1].split('..')
loc5=int(loc3[0])-1
loc6=int(loc3[1])
loc7=int(loc4[0])-1
loc8=int(loc4[1])
handle = Entrez.efetch(db="nucleotide", id=loc1[0],
rettype="genbank")
record=SeqIO.read(handle, "genbank")
seq1 = record.seq[loc5:loc6]
seq2 = record.seq[loc7:loc8]
record.seq = seq1+seq2
Out_file.write(record.format("fasta"))
break
# for genbank record like this
# coded_by="FM207547.1:<1..1443"
else:
parsed_loc=LocationParser.parse(LocationParser.scan(loc))
assert isinstance(parsed_loc,
LocationParser.AbsoluteLocation)
assert isinstance(parsed_loc.local_location,
LocationParser.Range)
seq_start = parsed_loc.local_location.low
seq_stop = parsed_loc.local_location.high
assert isinstance(seq_start, LocationParser.Integer)
assert isinstance(seq_stop, LocationParser.Integer)
seq_start = seq_start.val
seq_stop = seq_stop.val
Out_file.write(lookup(parsed_loc.path.accession,
seq_start, seq_stop))
break
# for swissprot entries in Genbank
elif Genbank_entry.find('swissprot') >= 0:
Entry = cur_entry.annotations
Entry = str(Entry)
Entry = Entry.split('xrefs')
Entry1 =Entry[1].split(',')
Entry2 = Entry1[0].split(':')
handle = Entrez.efetch(db="nucleotide", id=Entry2[1],
rettype="genbank")
data=SeqIO.read(handle, "genbank")
for feature in data.features :
if feature.type == "gene":
Gene_id= feature.qualifiers['gene'] [0]
if Gene_id == "rbcL":
temp = str(feature.location)
temp = temp.replace(':', '..')
temp = temp.replace('[', '')
temp = temp.replace(']', '')
if feature.strand == -1:
temp1 = data.id+':<'+temp
parsed_loc_complement =
LocationParser.parse(LocationParser.scan(temp1))
assert isinstance(parsed_loc_complement,
LocationParser.AbsoluteLocation)
assert
isinstance(parsed_loc_complement.local_location, LocationParser.Range)
seq_start = parsed_loc_complement.local_location.low
seq_stop = parsed_loc_complement.local_location.high
assert isinstance(seq_start, LocationParser.Integer)
assert isinstance(seq_stop, LocationParser.Integer)
seq_start = seq_start.val+1
seq_stop = seq_stop.val
Temp4=lookup(parsed_loc_complement.path.accession,
seq_start, seq_stop)
fasta_handle = StringIO.StringIO(Temp4)
record = SeqIO.read(fasta_handle, "fasta")
record = make_rc_record(record)
#print (record.format("fasta"))
Out_file.write(record.format("fasta"))
break
else:
temp2 = data.id+':<'+temp
parsed_loc=LocationParser.parse(LocationParser.scan(temp2))
assert isinstance(parsed_loc,
LocationParser.AbsoluteLocation)
assert isinstance(parsed_loc.local_location,
LocationParser.Range)
seq_start = parsed_loc.local_location.low
seq_stop = parsed_loc.local_location.high
assert isinstance(seq_start, LocationParser.Integer)
assert isinstance(seq_stop, LocationParser.Integer)
seq_start = seq_start.val+1
seq_stop = seq_stop.val
#print (lookup(parsed_loc.path.accession, seq_start,
seq_stop))
Out_file.write(lookup(parsed_loc.path.accession,
seq_start, seq_stop))
break
break
Out_file.close()
----------------------------------------------------------------------------
----------------------------------------------------------------------------
-
Syntax error at or near `join' token
Traceback (most recent call last):
File "C:\Documents and
Settings\Animesh\Desktop\sequences\Features_extraction_final.py", line 52,
in <module>
parsed_loc_complement=LocationParser.parse(LocationParser.scan(Temp3))
File "C:\Python25\lib\site-packages\Bio\GenBank\LocationParser.py", line
319, in parse
return _cached_parser.parse(tokens)
File "C:\Python25\Lib\site-packages\Bio\Parsers\spark.py", line 204, in
parse
self.error(tokens[i-1])
File "C:\Python25\Lib\site-packages\Bio\Parsers\spark.py", line 183, in
error
raise SystemExit
SystemExit
----------------------------------------------------------------------------
----------------------------------------------------------------------------
-
Traceback (most recent call last):
File "C:\Documents and
Settings\Animesh\Desktop\sequences\Features_extraction_final.py", line 76,
in <module>
loc5=int(loc3[0])-1
ValueError: invalid literal for int() with base 10: '<1'
----------------------------------------------------------------------------
----------------------------------------------------------------------------
-
Animesh
More information about the Biopython
mailing list