[Biopython-dev] sequence format readers ?
Thomas Sicheritz-Ponten
thomas at cbs.dtu.dk
Tue Sep 11 20:43:43 EDT 2001
Brad, I made some changes to our initial SeqRecord and FastaReader/Write
classes in order to use it for inheritance.
Before I start defining rules for the other formats we should brainstorm
over possible drawbacks/pitfalls of the current implementation (e.g. alignments).
Any ideas/suggestions ?
cheers
-thomas
# ----- SNIP ----- SNAP ----- SNIP ----- SNAP -----
import sys
import string
import Bio.Alphabet
from Bio.Seq import Seq
#from Bio.SeqRecord import SeqRecord
class SeqRecord:
def __init__(self, seq, id = "<unknown id>", name = "<unknown name>",
description = "<unknown description>"):
self.seq = seq
self.id = id
self.name = name
self.description = description
# annotations about the whole sequence
self.annotations = {}
# annotations about parts of the sequence
self.features = []
def __str__(self):
res = ''
res += '%s %s' % (self.name, self.seq.data)
return res
class GenericFormat:
def __init__(self, instream=None, outstream=None, alphabet = Bio.Alphabet.generic_alphabet):
self.instream = instream
self.outstream = outstream
self.alphabet = alphabet
self._n = -1
self._lookahead = None
def find_start(self):
# find the start of data
pass
def next(self):
pass
def __getitem__(self, i):
# wrapper to the normal Python "for spam in list:" idiom
assert i == self._n # forward iteration only!
x = self.next()
if x is None:
raise IndexError, i
return x
def write(self, record):
pass
def write_records(self, records):
# In general, can assume homogenous records... useful?
for record in records:
self.write(record)
def close(self):
return self.outstream.close()
def flush(self):
return self.outstream.flush()
class FastaFormat(GenericFormat):
def __init__(self, instream=None, outstream=None, alphabet = Bio.Alphabet.generic_alphabet):
GenericFormat.__init__(self, instream, alphabet = Bio.Alphabet.generic_alphabet)
self.find_start()
def find_start(self):
line = self.instream.readline()
while line and line[0] != ">": line = self.instream.readline()
self._lookahead = line
self._n = 0
def next(self):
self._n = self._n + 1
line = self._lookahead
if not line: return None
x = string.split(line[1:-1], None, 1)
if len(x) == 1:
id = x
desc = ""
else:
id, desc = x
lines = []
line = self.instream.readline()
while line:
if line[0] == ">":
break
lines.append(line[:-1])
line = self.instream.readline()
self._lookahead = line
return SeqRecord(Seq(string.join(lines, ""), self.alphabet),
id = id, name = id, description = desc)
def write(self, record):
id = record.id
assert "\n" not in id
description = record.description
assert "\n" not in description
self.outstream.write(">%s %s\n" % (id, description))
data = record.seq.tostring()
for i in range(0, len(data), 60):
self.outstream.write(data[i:i+60] + "\n")
if __name__ == '__main__':
txt = """
>TM0001 hypothetical protein
MVYGKEGYGRSKNILLSECVCGIISLELNGFQYFLRGMETL
>TM0002 hypothetical protein
MSPEDWKRLICFHTSKEVLKQTLDDAQQNISDSVSIPLRKY
>TM0003 hypothetical protein
METVKAYEVEDIPAIGFNNSLEVWKLFPASSSRSTSSSFQ
>TM0004 hypothetical protein
MKDLYERFNNSLEVWKLVELFGTSIRIHLFQ
"""
from StringIO import StringIO
test = FastaFormat(instream = StringIO(txt))
while 1:
r = test.next()
if not r: break
print r
# ----- SNIP ----- SNAP ----- SNIP ----- SNAP -----
--
Sicheritz-Ponten Thomas, Ph.D CBS, Department of Biotechnology
thomas at biopython.org The Technical University of Denmark
CBS: +45 45 252489 Building 208, DK-2800 Lyngby
Fax +45 45 931585 http://www.cbs.dtu.dk/thomas
De Chelonian Mobile ... The Turtle Moves ...
More information about the Biopython-dev
mailing list