[Biopython-dev] [Bug 2382] Generic FASTA parser
bugzilla-daemon at portal.open-bio.org
bugzilla-daemon at portal.open-bio.org
Tue Oct 16 17:26:29 EDT 2007
http://bugzilla.open-bio.org/show_bug.cgi?id=2382
------- Comment #3 from jflatow at northwestern.edu 2007-10-16 17:26 EST -------
On second thought, let me just rewrite all the code:
# The Bio.Fasta parser
class Fasta():  # or whatever
    """Placeholder for the proposed generic FASTA parser.

    Only the intended contract is described here; no parsing logic has
    been written yet.
    """

    @staticmethod
    def parse(file):
        """Intended to return an iterator over ``file`` as Bio.Fasta.Records.

        Intended contract: for each record, trim the newline from the header
        and do not do anything to the data.

        Raises:
            NotImplementedError: always, because only the contract has been
                specified so far.
        """
        # An explicit placeholder keeps the class importable; a body made
        # only of comments is a syntax error.
        raise NotImplementedError("Fasta.parse is only sketched, not implemented")
# The Bio.SeqIO.FastaIO wrapper for Bio.Fasta
class FastaIO():  # or however it's organized
    """Wrapper that turns generic ``Bio.Fasta`` records into SeqRecords.

    The header and data conversions are separate static methods so callers
    can reuse them on their own or swap in a different header parser.
    """

    @staticmethod
    def header_todict(header):
        """Convert a FASTA header line into SeqRecord keyword arguments.

        The id is the text before the first run of whitespace (a single ','
        or '|' directly before that whitespace is dropped); the description
        is the complete header.  A header without any whitespace is used
        whole as the id instead of being rejected.
        """
        parts = re.split(r'[,|]?\s+', header, maxsplit=1)
        return {'id': parts[0],
                'description': header}

    @staticmethod
    def data_toseq(data, alphabet):
        """Build a Seq from ``data`` with all whitespace removed."""
        return Seq(re.sub(r'\s+', '', data), alphabet)

    @staticmethod
    def parse(file, header_todict=None, alphabet=None):
        """Return a generator of SeqRecords for the records in ``file``.

        Args:
            file: passed straight through to ``Bio.Fasta.parse``.
            header_todict: callable mapping a header string to a dict of
                SeqRecord keyword arguments; defaults to
                ``FastaIO.header_todict``.
            alphabet: alphabet handed to ``data_toseq``; defaults to
                ``single_letter_alphabet``.
        """
        # Defaults are resolved at call time: the class is not yet bound
        # while its body executes, and this keeps the definition independent
        # of import order.
        if header_todict is None:
            header_todict = FastaIO.header_todict
        if alphabet is None:
            alphabet = single_letter_alphabet
        return (SeqRecord(seq=FastaIO.data_toseq(record.data, alphabet),
                          **header_todict(record.header))
                for record in Bio.Fasta.parse(file))
# Now to use these in my example I can do
# Build a lookup of the sequence records, then attach the integer quality
# scores parsed from the record with the same id in the quality file.
seq_dict = SeqIO.to_dict(SeqIO.FastaIO.parse(seq_file))
for record in Bio.Fasta.parse(qual_file):
    # Named record_id so the builtin id() is not shadowed.
    record_id = Bio.SeqIO.FastaIO.header_todict(record.header)['id']
    seq_dict[record_id].quality = [int(x) for x in record.data.split()]
# Suppose instead I have an alignment file, which looks like this:
>contigname
A A 10 64
T T 9 64
C C 9 64
...
# and so on, where the first column is a reference sequence, the second column
# is a consensus sequence, the third column is the number of reads aligned, and
# the fourth column is the combined quality score
# Now it's just as easy for me to parse this into an object
class ContigAlign():
    """Plain container for one contig's alignment columns.

    Attributes mirror the constructor arguments: the contig ``name`` plus
    the ``ref``, ``consensus``, ``numreads`` and ``qscore`` values, stored
    exactly as they are passed in.
    """

    def __init__(self, name, ref, consensus, numreads, qscore):
        self.name = name
        self.ref = ref
        self.consensus = consensus
        self.numreads = numreads
        self.qscore = qscore

    def __repr__(self):
        """Return a constructor-style representation for debugging."""
        return "%s(%r, %r, %r, %r, %r)" % (
            type(self).__name__, self.name, self.ref, self.consensus,
            self.numreads, self.qscore)
# I'll make a dictionary of my contigaligns
d = {}
for record in Bio.Fasta.parse(file):
    # Each non-blank data line is one alignment position made of four
    # whitespace-separated fields.  Split every line into its fields, then
    # transpose the rows with zip(*rows) to get one tuple per column;
    # zipping the list of lines directly would only wrap each whole line
    # in a 1-tuple.  A record with no data lines fails at the unpack with
    # ValueError.
    rows = [line.split() for line in record.data.splitlines() if line.strip()]
    (ref, consensus, numreads, qscore) = zip(*rows)
    d[record.header] = ContigAlign(record.header, ref, consensus, numreads,
                                   qscore)
# maybe I would turn ref and consensus into Seqs, but you get the point
--
Configure bugmail: http://bugzilla.open-bio.org/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are the assignee for the bug, or are watching the assignee.
More information about the Biopython-dev
mailing list