[Biopython-dev] [Bug 2382] Generic FASTA parser

Tue Oct 16 21:26:29 UTC 2007

http://bugzilla.open-bio.org/show_bug.cgi?id=2382


------- Comment #3 from jflatow at northwestern.edu  2007-10-16 17:26 EST -------
On second thought, let me just rewrite all the code:
# The Bio.Fasta parser
class Fasta(): # or whatever
    """Placeholder for the proposed low-level Bio.Fasta parser."""

    @staticmethod
    def parse(file):
        """Iterate over ``file``, yielding one Bio.Fasta.Record per entry.

        Intended behaviour, as described in the proposal: trim the
        trailing newline from each record's header and leave the record's
        data completely untouched.

        Raises:
            NotImplementedError: always; only the interface is sketched
                here, the parsing itself has not been written.
        """
        # A def with a comment-only body is a SyntaxError; fail explicitly
        # instead so the sketch stays importable.
        raise NotImplementedError("Fasta.parse is only sketched in this proposal")

# The Bio.SeqIO.FastaIO wrapper for Bio.Fasta
class FastaIO(): # or however it's organized
    """Sketch of the Bio.SeqIO.FastaIO wrapper around the Bio.Fasta parser.

    The raw parser yields records with ``header`` and ``data`` attributes;
    the static methods here turn those into SeqRecord constructor arguments.
    """

    @staticmethod
    def header_todict(header):
        """Split a FASTA header into SeqRecord keyword arguments.

        The id is everything before the first run of whitespace (with one
        optional ``,`` or ``|`` immediately before that whitespace
        dropped); the description is the complete, unmodified header.

        Args:
            header: header line text.

        Returns:
            A dict with the keys ``'id'`` and ``'description'``.
        """
        # maxsplit=1 yields one or two parts.  A header that is just an id
        # (no description) yields a single part and is still valid, so do
        # not insist on two parts.
        parts = re.split(r'[,|]?\s+', header, maxsplit=1)
        return {'id': parts[0],
                'description': header}

    @staticmethod
    def data_toseq(data, alphabet):
        """Build a Seq from raw record data with all whitespace removed.

        Args:
            data: raw sequence text of a record (may span several lines).
            alphabet: passed through unchanged as the second Seq argument.
        """
        return Seq(re.sub(r'\s+', '', data), alphabet)

    @staticmethod
    def parse(file, header_todict=None, alphabet=None):
        """Return a generator of SeqRecords for the records in ``file``.

        Args:
            file: handed unchanged to ``Bio.Fasta.parse``.
            header_todict: callable mapping a header string to a dict of
                SeqRecord keyword arguments; defaults to
                ``FastaIO.header_todict``.
            alphabet: alphabet given to ``data_toseq``; defaults to
                ``single_letter_alphabet``.
        """
        # Defaults are resolved at call time: the class's own static
        # methods cannot be named in the signature while the class body is
        # still being executed.
        if header_todict is None:
            header_todict = FastaIO.header_todict
        if alphabet is None:
            alphabet = single_letter_alphabet
        return (SeqRecord(seq=FastaIO.data_toseq(record.data, alphabet),
                          **header_todict(record.header))
                for record in Bio.Fasta.parse(file))

# Now to use these in my example I can do
seq_dict = SeqIO.to_dict(SeqIO.FastaIO.parse(seq_file))

# Attach the scores from the quality file to the matching sequence record,
# looked up by the id parsed out of each quality record's header.
for record in Bio.Fasta.parse(qual_file):
        # Named seq_id rather than id so the builtin id() is not shadowed.
        seq_id = Bio.SeqIO.FastaIO.header_todict(record.header)['id']
        # The record data is read as whitespace-separated integers.
        seq_dict[seq_id].quality = [int(x) for x in record.data.split()]

# Suppose instead I have an alignment file, which looks like this:
>contigname
A A 10 64
T T  9 64
C C 9 64
...
# and so on, where the first column is a reference sequence, the second column
# is a consensus sequence, the third column is the number of reads aligned, and
# the fourth column is the combined quality score.
# Now it's just as easy for me to parse this into an object:
class ContigAlign():
    """Per-contig alignment columns stored under the contig's name.

    Each constructor argument is kept, unchanged, as the attribute of the
    same name: ``name``, ``ref``, ``consensus``, ``numreads``, ``qscore``.
    """

    def __init__(self, name, ref, consensus, numreads, qscore):
        (self.name,
         self.ref,
         self.consensus,
         self.numreads,
         self.qscore) = (name, ref, consensus, numreads, qscore)

# I'll make a dictionary of my ContigAligns
# Map each record's header (the contig name) to a ContigAlign built from the
# four whitespace-separated columns of the record's data.
d = {}
for record in Bio.Fasta.parse(file):
    # One list of fields per non-blank data line.
    rows = [line.split() for line in record.data.splitlines() if line.strip()]
    # zip(*rows) transposes rows into columns.  Zipping the list of lines
    # itself would only wrap each line in a 1-tuple.  A record without data
    # gets four empty columns instead of failing to unpack.
    (ref, consensus, numreads, qscore) = zip(*rows) if rows else ((), (), (), ())
    d[record.header] = ContigAlign(record.header, ref, consensus, numreads,
                                   qscore)

# Maybe I would turn ref and consensus into Seqs, but you get the point.


-- 
Configure bugmail: http://bugzilla.open-bio.org/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are the assignee for the bug, or are watching the assignee.