[Biopython-dev] Flat files indices

Fri Aug 8 18:08:23 EDT 2003

Hi Yves;

> Are the "Open-bio flat-file indexing systems" implemented soemwhere?

Yes, in Bio.Mindy. All of the components are there to be used,
although the documentation and "ease of use" of them is lagging
behind what has actually been finished. But, you can do useful work
with them. 

I'm attaching a file of example code ripped from some work I've been
doing which hopefully demonstrates the using it. This current code
indexes a standard set of FASTA files downloaded from GenBank based
on GI numbers. It also has some support for version numbers and
other things tossed in which aren't use here, but which I have used
in other things (aaah, the ugliness of cut-n-paste code). 

This uses exclusively the Martel based parsers, which allows it to
work on pretty darn huge FASTA files. A previous version used the
standard Fasta RecordParser which doesn't do well on huge entries (I
was doing rice work for someone and had entries like all of
chromosome 10 tossed in).

So, yeah -- it's all there but needs some work to make it more
user-friendly and documented. Volunteers are always welcome :-).

Hope this helps!
Brad
-------------- next part --------------
import cStringIO

from Bio import Fasta
from Bio import Mindy
from Bio.Mindy import SimpleSeqRecord
from Bio.expressions import fasta

from Martel.LAX import LAX

class MindyIndexOrganizer:
    """Organizer to deal with a set of Mindy indexes of Fasta (or other) files
    """
    def __init__(self):
        self._databases ={}
        self._opened_mindydbs = {}
        self._retrieved_seqs = {}

    def _get_indexer(self):
        """Derive from and override this class to get different indexers.
        """
        return _FastaIdIndexer()

    def index(self, index_name, index_file, index_dir, force_index = 0):
        """Index, if necessary, the file and load the index into the organizer.
        """
        if not(os.path.exists(index_dir)):
            os.makedirs(index_dir)

        full_index = os.path.join(index_dir, index_name)
        if not(os.path.exists(full_index)) or force_index:
            print "Indexing %s..." % index_file
            indexer = self._get_indexer()
            SimpleSeqRecord.create_berkeleydb([index_file], full_index,
                                              indexer)

        self._databases[index_name] = full_index

    def retrieve(self, database_name, id_key):
        """Retrieve a sequence record from the given Mindy sequence db.

        This returns a Record object, specified by the parser passed
        to the class.
        """
        # see if we've already retrieved this sequence -- save time with big
        # sequences
        try:
            return self._retrieved_seqs[(database_name, id_key)]
        except KeyError:
            pass

        # Get the sequence database
        seq_db = self._get_opened_mindydb(database_name)

        # first try retrieving by the base id
        try:
            seqs = seq_db.lookup(id = id_key)
        # if we can't do that, we have to fetch by alias
        # this deals with the problem of multiple sequence versions
        except KeyError:
            seqs = seq_db.lookup(aliases = id_key)

        # easy case -- 1 sequence
        if len(seqs) == 1:
            seq_ref = seqs[0]
        # harder case -- multiple sequence versions
        else:
            seq_ref = self._get_latest_version(seqs)

        it_builder = fasta.format.make_iterator("record")
        handle = cStringIO.StringIO(seq_ref.text)
        iterator = it_builder.iterateFile(handle,
                LAX(fields = ['bioformat:sequence']))
        result = iterator.next()

        seq_record = Fasta.Record()
        seq_record.title = id_key
        seq_record.sequence = "".join(result['bioformat:sequence'])

        # add the retrieved sequence to an in-memory dictionary
        self._retrieved_seqs[(database_name, id_key)] = seq_record

        return seq_record

    def _get_opened_mindydb(self, name):
        """Retrieve an open Mindy database with the given name.

        This stores databases we already opened to try and prevent multiple 
        opens of the same database (and thus save time).
        """
        try:
            return self._opened_mindydbs[name]
        except KeyError:
            self._opened_mindydbs[name] = Mindy.open(self._databases[name])
            return self._get_opened_mindydb(name)

    def _get_latest_version(self, seqs):
        """Find the latest version of a sequence from a list of choices.

        This deals with the problem of multiple versions of a sequence by
        finding and returning the most recent version
        """
        version_dict = {}
        the_base_name = None # error checking
        for seq in seqs:
            text_parts = seq.text.split("\n")
            first_line_parts = text_parts[0].split(" ")
            seq_name = first_line_parts[0]
            assert seq_name.find(".") >= 0, \
              "Expected versioned seqs: %s" % seq_name
            base_name, version = seq_name.split(".")
            if the_base_name is None:
                the_base_name = base_name
            else:
                assert base_name == the_base_name, "Different seqs"
            version_dict[int(version)] = seq
        versions = version_dict.keys()
        versions.sort()
        # use the most recent version, seems the best plan
        return version_dict[max(versions)]

class _FastaIdIndexer(SimpleSeqRecord.BaseSeqRecordIndexer):
    """Simple indexer to index by GI values.

    This assumes that the description of the sequence records is in the
    standard GenBank-like format:
    >gi|33304516|gb|AY339367.1| Oryza sativa (japonica cultivar-group)...

    It indexes these based on the GI number of the sequence.
    """
    def __init__(self):
        SimpleSeqRecord.BaseSeqRecordIndexer.__init__(self)

    def primary_key_name(self):
        return "id"

    def secondary_key_names(self):
        return ["name", "aliases"]

    def get_id_dictionary(self, seq_record):
        parts = seq_record.description.split(" ")
        record_id = parts[0]
        record_parts = record_id.split("|")
        sequence_id = record_parts[1] # GI number
        aliases = []
        if not(sequence_id):
            raise ValueError("No sequence ID: %s" % seq_record.description)

        id_info = {"id" : [sequence_id],
                   "name" : [],
                   "aliases" : aliases}
        return id_info