[Biopython-dev] Flat files indices
Brad Chapman
chapmanb at uga.edu
Fri Aug 8 18:08:23 EDT 2003
Hi Yves;
> Are the "Open-bio flat-file indexing systems" implemented soemwhere?
Yes, in Bio.Mindy. All of the components are there to be used,
although the documentation and "ease of use" of them is lagging
behind what has actually been finished. But, you can do useful work
with them.
I'm attaching a file of example code ripped from some work I've been
doing which hopefully demonstrates the using it. This current code
indexes a standard set of FASTA files downloaded from GenBank based
on GI numbers. It also has some support for version numbers and
other things tossed in which aren't use here, but which I have used
in other things (aaah, the ugliness of cut-n-paste code).
This uses exclusively the Martel based parsers, which allows it to
work on pretty darn huge FASTA files. A previous version used the
standard Fasta RecordParser which doesn't do well on huge entries (I
was doing rice work for someone and had entries like all of
chromosome 10 tossed in).
So, yeah -- it's all there but needs some work to make it more
user-friendly and documented. Volunteers are always welcome :-).
Hope this helps!
Brad
-------------- next part --------------
import cStringIO
from Bio import Fasta
from Bio import Mindy
from Bio.Mindy import SimpleSeqRecord
from Bio.expressions import fasta
from Martel.LAX import LAX
class MindyIndexOrganizer:
"""Organizer to deal with a set of Mindy indexes of Fasta (or other) files
"""
def __init__(self):
self._databases ={}
self._opened_mindydbs = {}
self._retrieved_seqs = {}
def _get_indexer(self):
"""Derive from and override this class to get different indexers.
"""
return _FastaIdIndexer()
def index(self, index_name, index_file, index_dir, force_index = 0):
"""Index, if necessary, the file and load the index into the organizer.
"""
if not(os.path.exists(index_dir)):
os.makedirs(index_dir)
full_index = os.path.join(index_dir, index_name)
if not(os.path.exists(full_index)) or force_index:
print "Indexing %s..." % index_file
indexer = self._get_indexer()
SimpleSeqRecord.create_berkeleydb([index_file], full_index,
indexer)
self._databases[index_name] = full_index
def retrieve(self, database_name, id_key):
"""Retrieve a sequence record from the given Mindy sequence db.
This returns a Record object, specified by the parser passed
to the class.
"""
# see if we've already retrieved this sequence -- save time with big
# sequences
try:
return self._retrieved_seqs[(database_name, id_key)]
except KeyError:
pass
# Get the sequence database
seq_db = self._get_opened_mindydb(database_name)
# first try retrieving by the base id
try:
seqs = seq_db.lookup(id = id_key)
# if we can't do that, we have to fetch by alias
# this deals with the problem of multiple sequence versions
except KeyError:
seqs = seq_db.lookup(aliases = id_key)
# easy case -- 1 sequence
if len(seqs) == 1:
seq_ref = seqs[0]
# harder case -- multiple sequence versions
else:
seq_ref = self._get_latest_version(seqs)
it_builder = fasta.format.make_iterator("record")
handle = cStringIO.StringIO(seq_ref.text)
iterator = it_builder.iterateFile(handle,
LAX(fields = ['bioformat:sequence']))
result = iterator.next()
seq_record = Fasta.Record()
seq_record.title = id_key
seq_record.sequence = "".join(result['bioformat:sequence'])
# add the retrieved sequence to an in-memory dictionary
self._retrieved_seqs[(database_name, id_key)] = seq_record
return seq_record
def _get_opened_mindydb(self, name):
"""Retrieve an open Mindy database with the given name.
This stores databases we already opened to try and prevent multiple
opens of the same database (and thus save time).
"""
try:
return self._opened_mindydbs[name]
except KeyError:
self._opened_mindydbs[name] = Mindy.open(self._databases[name])
return self._get_opened_mindydb(name)
def _get_latest_version(self, seqs):
"""Find the latest version of a sequence from a list of choices.
This deals with the problem of multiple versions of a sequence by
finding and returning the most recent version
"""
version_dict = {}
the_base_name = None # error checking
for seq in seqs:
text_parts = seq.text.split("\n")
first_line_parts = text_parts[0].split(" ")
seq_name = first_line_parts[0]
assert seq_name.find(".") >= 0, \
"Expected versioned seqs: %s" % seq_name
base_name, version = seq_name.split(".")
if the_base_name is None:
the_base_name = base_name
else:
assert base_name == the_base_name, "Different seqs"
version_dict[int(version)] = seq
versions = version_dict.keys()
versions.sort()
# use the most recent version, seems the best plan
return version_dict[max(versions)]
class _FastaIdIndexer(SimpleSeqRecord.BaseSeqRecordIndexer):
"""Simple indexer to index by GI values.
This assumes that the description of the sequence records is in the
standard GenBank-like format:
>gi|33304516|gb|AY339367.1| Oryza sativa (japonica cultivar-group)...
It indexes these based on the GI number of the sequence.
"""
def __init__(self):
SimpleSeqRecord.BaseSeqRecordIndexer.__init__(self)
def primary_key_name(self):
return "id"
def secondary_key_names(self):
return ["name", "aliases"]
def get_id_dictionary(self, seq_record):
parts = seq_record.description.split(" ")
record_id = parts[0]
record_parts = record_id.split("|")
sequence_id = record_parts[1] # GI number
aliases = []
if not(sequence_id):
raise ValueError("No sequence ID: %s" % seq_record.description)
id_info = {"id" : [sequence_id],
"name" : [],
"aliases" : aliases}
return id_info
More information about the Biopython-dev
mailing list