[Biopython] Indexing large sequence files

Peter biopython at maubp.freeserve.co.uk
Fri Jun 19 15:20:36 UTC 2009


On Fri, Jun 19, 2009 at 3:56 PM, Cedar Mckay<cmckay at u.washington.edu> wrote:
> Peter, I appreciate all this hard work you are doing for me. I won't be able
> to test any of it until I'm back in the office on Tuesday, but I'll let you
> know how it goes then.
>
> Best,
> Cedar
> Sent via phone

It isn't just for you ;)

You managed to come up with an interesting challenge that caught my
attention. I'm curious to see whether that solution works in practice; it
certainly seems to be OK on my machine.

If you can report back next week, we can resume this discussion then.

Regards,

Peter

P.S. Here is a rough version which works on more file formats. This tries to
use the record.id as the dictionary key, based on how the SeqIO parsers
work and the default behaviour of the Bio.SeqIO.to_dict() function.

In some cases (e.g. FASTA and FASTQ) this is easy to mimic (getting the
same string for the record.id). For SwissProt or GenBank files this is harder,
so the choice is either to parse each record (slow) or to mimic the record
header parsing done in Bio.SeqIO (fragile - we'd need good test coverage).
Something based on this code might be a worthwhile addition to Bio.SeqIO;
obviously it would need tests and documentation first.
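
For comparison, here is roughly the in-memory behaviour being mimicked,
using Bio.SeqIO.to_dict() ("example.fasta" is just a placeholder filename):

from Bio import SeqIO
#Loads every record into memory at once, keyed by record.id - fine for
#small files, but not for the multi-GB files discussed in this thread.
handle = open("example.fasta", "rU")
in_memory = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
handle.close()
print len(in_memory), "records loaded"

The class below gives the same keys, but only reads each record from
disk on demand: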

from Bio import SeqIO
import re
class SeqRecordDict(object) :
    """Read only dictionary interface to a sequential sequence file.

    Keeps the keys in memory, and re-reads the file on demand to return
    entries as SeqRecord objects, using Bio.SeqIO to parse them."""
    def __init__(self, filename, format, alphabet=None) :
        #TODO - Take a handle instead, provided it has seek and tell methods?
        markers = {"fasta" : ">",
                   "fastq" : "@",
                   "fastq-solexa" : "@",
                   "fastq-illumnia" : "@",
                   "genbank" : "LOCUS ",
                   "gb" : "LOCUS ",
                   "embl" : "ID   ",
                   "swiss": "ID   ",
                   }
        try :
            marker_offset = len(markers[format])
            marker = re.compile("^" + markers[format]) #caret means start of line
        except KeyError :
            raise ValueError("Indexing %s format not supported" % repr(format))
        self._index = dict()
        self._alphabet = alphabet
        self._format = format
        handle = open(filename, "rU")
        while True :
            pos = handle.tell()
            line = handle.readline()
            if not line : break #End of file
            if marker.match(line) :
                if self._format in ["fasta","fastq","fastq-solexa","fastq-illumina"]:
                    #Here we can assume the record.id is the first word after the
                    #marker. This isn't the case in say GenBank or SwissProt.
                    self._index[line[marker_offset:].rstrip().split(None,1)[0]] = pos
                elif self._format == "swiss" :
                    #The record.id for SwissProt comes from the first accession
                    #on the AC line, which follows the ID line.
                    line = handle.readline()
                    assert line.startswith("AC ")
                    self._index[line.split()[1].rstrip(";")] = pos
                else :
                    #Want to make sure we use the record.id as the key... the
                    #only general way to do this is to parse it now (slow) :(
                    handle.seek(pos)
                    record = SeqIO.parse(handle, format, alphabet).next()
                    self._index[record.id] = pos
                    #After SeqIO has used the handle, it may be pointing part
                    #way into the next record, so to be safe, rewind to the last
                    #known location...
                    handle.seek(pos)
                    handle.readline()
        handle.seek(0)
        self._handle = handle

    def keys(self) :
        return self._index.keys()

    def __len__(self) :
        return len(self._index)

    def __getitem__(self, index) :
        handle = self._handle
        handle.seek(self._index[index])
        return SeqIO.parse(handle, self._format, self._alphabet).next()

#Testing...
import time
start = time.time()
my_dict = SeqRecordDict("uniprot_sprot.fasta","fasta")
count = len(my_dict)
print my_dict["sp|Q197F8|002R_IIV3"].id #first
print my_dict["sp|B2ZDY1|Z_WWAVU"].id #last
print "%i Fasta took %0.2fs" % (count, time.time()-start)
#470369 Fasta took 7.01s, 210MB file.

start = time.time()
my_dict = SeqRecordDict("uniprot_sprot.dat","swiss")
count = len(my_dict)
print my_dict["Q197F8"].id #first
print my_dict["B2ZDY1"].id #last
print "%i swiss took %0.2fs" % (count, time.time()-start)
#470369 swiss took 61.90s, 1.9GB file.

start = time.time()
my_dict = SeqRecordDict("SRR001666_1.fastq", "fastq")
count = len(my_dict)
print my_dict["SRR001666.1"].id #first
print my_dict["SRR001666.7047668"].id #last
print "%i FASTQ took %0.2fs" % (count, time.time()-start)
#7051494 FASTQ took 52.32s, 1.3GB file.

start = time.time()
my_dict = SeqRecordDict("gbpln1.seq","gb")
count = len(my_dict)
print my_dict["AB000001.1"].id #first
print my_dict["AB433452.1"].id #last
print "%i GenBank took %0.2fs" % (count, time.time()-start)
#Takes a while, needs an optimisation like the one for "swiss"?
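
One possible shortcut, sketched below but untested: for GenBank files the
record.id normally matches the accession.version given on the VERSION line
(falling back to the LOCUS name), so a small lookahead like the SwissProt
branch could avoid the full parse. The helper name _genbank_key is just for
illustration.

def _genbank_key(handle, pos, marker_offset=len("LOCUS ")):
    """Guess the record.id for the GenBank entry starting at pos (sketch only).

    Tries to mimic Bio.SeqIO by taking accession.version from the VERSION
    line, falling back to the LOCUS name - not checked against all the
    corner cases the real parser copes with."""
    handle.seek(pos)
    key = handle.readline()[marker_offset:].split(None, 1)[0] #LOCUS name
    while True:
        line = handle.readline()
        if not line or line.startswith("ORIGIN") or line.startswith("//"):
            break
        if line.startswith("VERSION") and len(line.split()) > 1:
            key = line.split()[1] #e.g. "AB000001.1"
            break
    return key

Inside __init__ this could replace the SeqIO.parse() call in the final else
branch (followed by the same handle.seek(pos) rewind), which should bring
the GenBank timing closer to the SwissProt case.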


