[Biopython-dev] Flat files indices
Yves Bastide
Yves.Bastide at irisa.fr
Tue Aug 12 15:18:41 EDT 2003
Brad Chapman wrote:
> Hi Yves;
>
>
>>Are the "Open-bio flat-file indexing systems" implemented somewhere?
>
>
> Yes, in Bio.Mindy. All of the components are there to be used,
> although the documentation and "ease of use" of them is lagging
> behind what has actually been finished. But, you can do useful work
> with them.
>
> I'm attaching a file of example code ripped from some work I've been
> doing which hopefully demonstrates using it. This current code
> indexes a standard set of FASTA files downloaded from GenBank based
> on GI numbers. It also has some support for version numbers and
> other things tossed in which aren't used here, but which I have used
> in other things (aaah, the ugliness of cut-n-paste code).
>
> This uses exclusively the Martel based parsers, which allows it to
> work on pretty darn huge FASTA files. A previous version used the
> standard Fasta RecordParser which doesn't do well on huge entries (I
> was doing rice work for someone and had entries like all of
> chromosome 10 tossed in).
>
> So, yeah -- it's all there but needs some work to make it more
> user-friendly and documented. Volunteers are always welcome :-).
>
> Hope this helps!
> Brad
Thanks!
I think there are a few bugs in Mindy (e.g., the use of fileid_info in
WriteDB); here's a patch with gratuitous cosmetic changes, perhaps
useful ones, and certainly new bugs :) (I also started to add
docstrings, then saw the hour here)
I also patched sprot38.py to read a current snapshot of SwissProt:
2A5D_HUMAN has a multi-line RP. I dunno if there are users of the
parser to change, though...
yves
-------------- next part --------------
Index: Bio/Mindy/BaseDB.py
===================================================================
RCS file: /home/repository/biopython/biopython/Bio/Mindy/BaseDB.py,v
retrieving revision 1.5
diff -u -p -r1.5 BaseDB.py
--- Bio/Mindy/BaseDB.py 2002/12/10 20:56:05 1.5
+++ Bio/Mindy/BaseDB.py 2003/08/12 19:12:21
@@ -3,6 +3,7 @@ import Bio
import compression
def _int_str(i):
+ # XXX doesn't seem useful
s = str(i)
if s[-1:] == "l":
return s[:-1]
@@ -12,22 +13,22 @@ class WriteDB:
# Must define 'self.filename_map' mapping from filename -> fileid
# Must define 'self.fileid_info' mapping from fileid -> (filename,size)
- def add_filename(self, filename, size, fileid_info):
+ def add_filename(self, filename, size):
fileid = self.filename_map.get(filename, None)
if fileid is not None:
return fileid
s = str(len(self.filename_map))
self.filename_map[filename] = s # map from filename -> id
- assert s not in fileid_info.keys(), "Duplicate entry! %s" % (s,)
+ assert s not in self.fileid_info.keys(), "Duplicate entry! %s" % (s,)
self.fileid_info[s] = (filename, size)
return s
- def load(self, filename, builder, fileid_info, record_tag = "record"):
+ def load(self, filename, builder, record_tag="record"):
formatname = self.formatname
size = os.path.getsize(filename)
- filetag = self.add_filename(filename, size, fileid_info)
+ filetag = self.add_filename(filename, size)
- source = compression.open_file(filename, "rb")
+ source = compression.open(filename, "rb")
if formatname == "unknown":
formatname = "sequence"
@@ -66,7 +67,7 @@ class DictLookup:
def items(self):
return [(key, self[key]) for key in self.keys()]
- def get(self, key, default = None):
+ def get(self, key, default=None):
try:
return self[key]
except KeyError:
@@ -97,7 +98,7 @@ class OpenDB(DictLookup):
if os.path.getsize(filename) != size:
raise TypeError(
"File %s has changed size from %d to %d bytes!" %
- (size, os.path.getsize(filename)))
+ (filename, size, os.path.getsize(filename)))
self.filename_map = filename_map
self.fileid_info = fileid_info
Index: Bio/Mindy/BerkeleyDB.py
===================================================================
RCS file: /home/repository/biopython/biopython/Bio/Mindy/BerkeleyDB.py,v
retrieving revision 1.8
diff -u -p -r1.8 BerkeleyDB.py
--- Bio/Mindy/BerkeleyDB.py 2002/12/10 21:55:40 1.8
+++ Bio/Mindy/BerkeleyDB.py 2003/08/12 19:12:21
@@ -1,15 +1,19 @@
+"""Open-Bio BerkeleyDB indexing system for flat-files databanks."""
+
import os
-from bsddb3 import db
+try:
+ from bsddb import db
+except ImportError:
+ from bsddb3 import db
import Location
import BaseDB
import Bio
-_open = open # rename for internal use -- gets redefined below
-
INDEX_TYPE = "BerkeleyDB/1"
def create(dbname, primary_namespace, secondary_namespaces,
- formatname = "unknown"):
+ formatname="unknown"):
+ """BerkeleyDB creator factory"""
os.mkdir(dbname)
config_filename = os.path.join(dbname, "config.dat")
BaseDB.write_config(config_filename = config_filename,
@@ -39,7 +43,7 @@ def create(dbname, primary_namespace, se
primary_table.close()
dbenv.close()
- return open(dbname, "rw")
+ return BerkeleyDB(dbname, "rw")
class PrimaryNamespace(BaseDB.DictLookup):
@@ -92,7 +96,7 @@ class SecondaryNamespace(BaseDB.DictLook
return table.keys()
class BerkeleyDB(BaseDB.OpenDB, BaseDB.WriteDB):
- def __init__(self, dbname, mode = "r"):
+ def __init__(self, dbname, mode="r"):
if mode not in ("r", "rw"):
raise TypeError("Unknown mode: %r" % (mode,))
self.__need_flush = 0
@@ -173,7 +177,7 @@ class BerkeleyDB(BaseDB.OpenDB, BaseDB.W
[x.close() for x in self.secondary_tables.values()]
self.dbenv.close()
self.dbenv = self.primary_table = self.fileid_info = \
- self.secondary_tables = self.fileid_info = None
+ self.secondary_tables = None
def __del__(self):
if self.dbenv is not None:
@@ -188,4 +192,3 @@ class BerkeleyDB(BaseDB.OpenDB, BaseDB.W
return SecondaryNamespace(self, key)
open = BerkeleyDB
-
Index: Bio/Mindy/FlatDB.py
===================================================================
RCS file: /home/repository/biopython/biopython/Bio/Mindy/FlatDB.py,v
retrieving revision 1.6
diff -u -p -r1.6 FlatDB.py
--- Bio/Mindy/FlatDB.py 2002/03/01 15:07:21 1.6
+++ Bio/Mindy/FlatDB.py 2003/08/12 19:12:21
@@ -1,9 +1,11 @@
+"""Open-Bio flat indexing system for flat-files databanks."""
-import os, bisect
-import BaseDB, Location
+import os
+import bisect
+import BaseDB
+import Location
import Bio
-_open = open
INDEX_TYPE = "flat/1"
def _parse_primary_table_entry(s):
@@ -11,7 +13,7 @@ def _parse_primary_table_entry(s):
return name, filetag, long(startpos), long(length)
def _read_primary_table(filename):
- infile = _open(filename, "rb")
+ infile = file(filename, "rb")
size = int(infile.read(4))
table = {}
while 1:
@@ -36,7 +38,7 @@ def _write_primary_table(filename, prima
raise AssertionError(
"Primary index record too large for format spec! " +
" %s bytes in %r" % (n, s))
- outfile = _open(filename, "wb")
+ outfile = file(filename, "wb")
outfile.write("%04d" % n)
for k, v in info:
s = "%s\t%s" % (k, v)
@@ -47,7 +49,7 @@ def _parse_secondary_table_entry(s):
return s.rstrip().split("\t")
def _read_secondary_table(filename):
- infile = _open(filename, "rb")
+ infile = file(filename, "rb")
size = int(infile.read(4))
table = {}
while 1:
@@ -75,7 +77,7 @@ def _write_secondary_table(filename, tab
"Secondary index record too large for format spec! " +
" %s bytes in %r" % (n, s))
# And write the output
- outfile = _open(filename, "wb")
+ outfile = file(filename, "wb")
outfile.write("%04d" % n)
for k, v in items:
for x in v:
@@ -127,7 +129,7 @@ class MemoryFlatDB(BaseDB.WriteDB, BaseF
def __init__(self, dbname):
self.__in_constructor = 1
self._need_flush = 0
- BaseFlatDB.__init__(self, dbname, INDEX_TYPE)
+ BaseFlatDB.__init__(self, dbname)
primary_filename = os.path.join(self.dbname,
"key_%s.key" % (self.primary_namespace,) )
@@ -145,7 +147,8 @@ class MemoryFlatDB(BaseDB.WriteDB, BaseF
if len(key_list) != 1:
raise TypeError(
"Field %s has %d entries but must have only one "
- "(must be unique)" % (repr(unique), len(key_list)))
+ "(must be unique)" % (repr(self.primary_namespace),
+ len(key_list)))
key = key_list[0]
if self.primary_table.has_key(key):
raise TypeError("Field %r = %r already exists; must be unique" %
@@ -227,7 +230,7 @@ class BisectFile:
def _find_entry(filename, wantword):
size = os.path.getsize(filename)
- infile = _open(filename, "rb")
+ infile = file(filename, "rb")
bf = BisectFile(infile, size)
left = bisect.bisect_left(bf, wantword)
@@ -238,7 +241,7 @@ def _find_entry(filename, wantword):
def _find_range(filename, wantword):
size = os.path.getsize(filename)
- infile = _open(filename, "rb")
+ infile = file(filename, "rb")
bf = BisectFile(infile, size)
left = bisect.bisect_left(bf, wantword)
@@ -272,7 +275,7 @@ def _lookup_alias(id_filename, word):
return primary_keys
def create(dbname, primary_namespace, secondary_namespaces,
- formatname = "unknown"):
+ formatname="unknown"):
os.mkdir(dbname)
config_filename = os.path.join(dbname, "config.dat")
BaseDB.write_config(config_filename = config_filename,
@@ -297,7 +300,7 @@ def create(dbname, primary_namespace, se
return open(dbname, "rw")
-def open(dbname, mode = "r"):
+def open(dbname, mode="r"):
if mode == "r":
return DiskFlatDB(dbname)
elif mode == "rw":
@@ -308,7 +311,7 @@ def open(dbname, mode = "r"):
raise TypeError("Unknown mode: %r" % (mode,))
def _get_first_words(filename):
- infile = _open(filename, "rb")
+ infile = file(filename, "rb")
size = int(infile.read(4))
data = []
while 1:
Index: Bio/Mindy/Location.py
===================================================================
RCS file: /home/repository/biopython/biopython/Bio/Mindy/Location.py,v
retrieving revision 1.2
diff -u -p -r1.2 Location.py
--- Bio/Mindy/Location.py 2002/02/26 11:32:26 1.2
+++ Bio/Mindy/Location.py 2003/08/12 19:12:21
@@ -1,6 +1,6 @@
import compression
-class Location:
+class Location(object):
"""Handle for a record (use 'text' to get the record's text)"""
def __init__(self, namespace, name, filename, startpos, length):
self.namespace = namespace
@@ -9,26 +9,26 @@ class Location:
self.startpos = startpos
self.length = length
def __repr__(self):
- return "Location(namespace = %r, name = %r, filename = %r, startpos = %r, length = %r)" % (self.namespace, self.name, self.filename, self.startpos, self.length)
+ return "Location(namespace = %r, name = %r, filename = %r," \
+ " startpos = %r, length = %r)" % \
+ (self.namespace, self.name, self.filename,
+ self.startpos, self.length)
def __str__(self):
return "Location(%s:%s at %s: %s, %s)" % \
(self.namespace, self.name,
self.filename,self.startpos, self.length)
- def __getattr__(self, key):
- if key == "text":
- infile = compression.open_file(self.filename)
- if hasattr(infile, "seek"):
- infile.seek(self.startpos)
- return infile.read(self.length)
- # read 1MB chunks at a time
- CHUNKSIZE = 1000000
- count = 0
- while count + CHUNKSIZE < self.startpos:
- infile.read(CHUNKSIZE)
- count += CHUNKSIZE
- infile.read(self.startpos - count)
+ def get_text(self):
+ infile = compression.open(self.filename)
+ if hasattr(infile, "seek"):
+ infile.seek(self.startpos)
return infile.read(self.length)
- elif key == "__members__":
- return ["text"]
- raise AttributeError(key)
+ # read 1MiB chunks at a time
+ CHUNKSIZE = 1048576
+ count = 0
+ while count + CHUNKSIZE < self.startpos:
+ infile.read(CHUNKSIZE)
+ count += CHUNKSIZE
+ infile.read(self.startpos - count)
+ return infile.read(self.length)
+ text = property(get_text)
Index: Bio/Mindy/SimpleSeqRecord.py
===================================================================
RCS file: /home/repository/biopython/biopython/Bio/Mindy/SimpleSeqRecord.py,v
retrieving revision 1.2
diff -u -p -r1.2 SimpleSeqRecord.py
--- Bio/Mindy/SimpleSeqRecord.py 2002/12/10 20:56:05 1.2
+++ Bio/Mindy/SimpleSeqRecord.py 2003/08/12 19:12:22
@@ -94,8 +94,10 @@ class FixDocumentBuilder(BuildSeqRecord)
# --- convenience functions for indexing
# you should just use these unless you are doing something fancy
-def create_berkeleydb(files, db_name, indexer = SimpleIndexer()):
+def create_berkeleydb(files, db_name, indexer=None):
from Bio.Mindy import BerkeleyDB
+ if indexer is None:
+ indexer = SimpleIndexer()
unique_name = indexer.primary_key_name()
alias_names = indexer.secondary_key_names()
creator = BerkeleyDB.create(db_name, unique_name, alias_names)
@@ -104,8 +106,10 @@ def create_berkeleydb(files, db_name, in
creator.load(filename, builder = builder, fileid_info = {})
creator.close()
-def create_flatdb(files, db_name, indexer = SimpleIndexer()):
+def create_flatdb(files, db_name, indexer=None):
from Bio.Mindy import FlatDB
+ if indexer is None:
+ indexer = SimpleIndexer()
unique_name = indexer.primary_key_name()
alias_names = indexer.secondary_key_names()
creator = FlatDB.create(db_name, unique_name, alias_names)
Index: Bio/Mindy/XPath.py
===================================================================
RCS file: /home/repository/biopython/biopython/Bio/Mindy/XPath.py,v
retrieving revision 1.3
diff -u -p -r1.3 XPath.py
--- Bio/Mindy/XPath.py 2002/03/01 15:07:21 1.3
+++ Bio/Mindy/XPath.py 2003/08/12 19:12:22
@@ -1,4 +1,5 @@
-import xml.sax, re
+import xml.sax
+import re
from Bio import Std
@@ -10,7 +11,7 @@ _pat_tag_re = re.compile(r"""^//(%s)(\[@
#') # emacs cruft
-def parse_simple_xpath(s):
+def _parse_simple_xpath(s):
# Only supports two formats
# //tag
# //tag[@attr="value"]
@@ -32,11 +33,23 @@ def parse_simple_xpath(s):
def xpath_index(dbname,
filenames,
primary_namespace,
- extract_info, # pair of (data_value, xpath)
- format = "sequence",
- record_tag = Std.record.tag,
- creator_factory = None,
+ extract_info,
+ format="sequence",
+ record_tag=Std.record.tag,
+ creator_factory=None,
):
+ """Index a flat-file databank.
+
+ Arguments:
+ dbname -- databank name
+ filenames -- list of file names; full paths should be used
+ primary_namespace -- primary identifier namespace
+ extract_info -- list of pairs (data_value, xpath)
+ format -- name of the file format (default: "sequence")
+ record_tag -- record tag (default: `Bio.Std.record.tag`)
+ creator_factory -- creator factory (default: BerkeleyDB.create)
+
+ """
if creator_factory is None:
import BerkeleyDB
creator_factory = BerkeleyDB.create
@@ -55,28 +68,32 @@ def xpath_index(dbname,
raise TypeError("Property %r has no xpath definition" %
(primary_namespace,))
- creator = creator_factory(dbname, primary_namespace, data_names)
- builder = GrabXPathNodes(extract_info)
+ creator = creator_factory(dbname, primary_namespace, data_names,
+ formatname = format)
+ builder = _GrabXPathNodes(extract_info)
+ fileid_info = {}
for filename in filenames:
- creator.load(filename, builder = builder, record_tag = record_tag,
- formatname = format)
+ creator.load(filename, builder = builder, fileid_info = fileid_info,
+ record_tag = record_tag)
creator.close()
-class GrabXPathNodes(xml.sax.ContentHandler):
+class _GrabXPathNodes(xml.sax.ContentHandler):
def __init__(self, extractinfo):
+ xml.sax.ContentHandler.__init__(self)
self._fast_tags = _fast_tags = {}
for property, xpath in extractinfo:
- tag, attrs = parse_simple_xpath(xpath)
+ tag, attrs = _parse_simple_xpath(xpath)
_fast_tags.setdefault(tag, []).append( (attrs, property) )
# for doing the endElement in the correct order,
# which is opposite to the input order
- self._rev_tags = _rev_tags = {}
+ _rev_tags = {}
for k, v in self._fast_tags.items():
v = v[:]
v.reverse()
- self._rev_tags[k] = v
+ _rev_tags[k] = v
+ self._rev_tags = _rev_tags
def uses_tags(self):
return self._fast_tags.keys()
Index: Bio/Mindy/__init__.py
===================================================================
RCS file: /home/repository/biopython/biopython/Bio/Mindy/__init__.py,v
retrieving revision 1.6
diff -u -p -r1.6 __init__.py
--- Bio/Mindy/__init__.py 2002/03/01 15:07:21 1.6
+++ Bio/Mindy/__init__.py 2003/08/12 19:12:22
@@ -1,9 +1,13 @@
-import os, sys
+import os
-_open = open # rename for internal use -- gets redefined below
+# For python 2.1 compatibility, one can add
+##try:
+## file
+##except NameError:
+## file = open
-def open(dbname, mode = "r"):
- text = _open(os.path.join(dbname, "config.dat"), "rb").read()
+def open(dbname, mode="r"):
+ text = file(os.path.join(dbname, "config.dat"), "rb").read()
line = text.split("\n")[0]
if line == "index\tBerkeleyDB/1":
import BerkeleyDB
@@ -18,19 +22,19 @@ def open(dbname, mode = "r"):
def main():
from Bio import Std
import XPath
- import FlatDB
+ ##import FlatDB
XPath.xpath_index(
- #dbname = "sprot_flat",
+ ##dbname = "sprot_flat",
dbname = "sprot_small",
filenames = ["/home/dalke/ftps/swissprot/smaller_sprot38.dat",
- #filenames = ["/home/dalke/ftps/swissprot/sprot38.dat",
+ ##filenames = ["/home/dalke/ftps/swissprot/sprot38.dat",
],
primary_namespace = "entry",
extract_info = [
("entry", "//entry_name"),
("accession", "//%s[@type='accession']" % (Std.dbid.tag,)),
],
- #creator_factory = FlatDB.CreateFlatDB,
+ ##creator_factory = FlatDB.CreateFlatDB,
)
Index: Bio/Mindy/compression.py
===================================================================
RCS file: /home/repository/biopython/biopython/Bio/Mindy/compression.py,v
retrieving revision 1.1
diff -u -p -r1.1 compression.py
--- Bio/Mindy/compression.py 2002/01/28 20:55:30 1.1
+++ Bio/Mindy/compression.py 2003/08/12 19:12:22
@@ -1,4 +1,5 @@
-import commands, os
+import commands
+import os
_uncompress_table = {
".bz": "bzip2",
@@ -8,21 +9,23 @@ _uncompress_table = {
".Z": "compress",
}
-def open_file(filename, mode = "rb"):
+def open(filename, mode="rb"):
ext = os.path.splitext(filename)[1]
type = _uncompress_table.get(ext)
if type is None:
- return open(filename, mode)
+ return file(filename, mode)
if type == "gzip":
import gzip
- gzip.open(filename, mode)
+ return gzip.open(filename, mode)
if type == "bzip2":
- cmd = "bzcat --decompress"
- cmd += commands.mkarg(filename)
- return os.popen(cmd, mode)
+ try:
+ import bz2
+ except ImportError:
+ cmd = "bzcat --decompress %s" % commands.mkarg(filename)
+ return os.popen(cmd, mode)
+ return bz2.BZ2File(filename, mode)
if type == "compress":
- cmd = "zcat -d"
- cmd += commands.mkarg(filename)
+ cmd = "zcat -d %s" % commands.mkarg(filename)
return os.popen(cmd, mode)
raise AssertionError("What's a %r?" % type)
-------------- next part --------------
Index: Bio/expressions//swissprot/sprot38.py
===================================================================
RCS file: /home/repository/biopython/biopython/Bio/expressions/swissprot/sprot38.py,v
retrieving revision 1.4
diff -u -p -r1.4 sprot38.py
--- Bio/expressions//swissprot/sprot38.py 2002/02/27 07:31:32 1.4
+++ Bio/expressions//swissprot/sprot38.py 2003/08/12 19:13:33
@@ -111,9 +111,9 @@ RN = Martel.Group("RN", Martel.Re("RN
#--- RP
-# occurs once
+# 1 or more
RP = Simple("RP", "reference_position")
-
+RP_block = Martel.Group("RP_block", Martel.Rep1(RP))
#--- RC
@@ -151,7 +151,7 @@ RL_block = Martel.Group("RL_block", Mart
reference = Martel.Group("reference",
RN + \
- RP + \
+ RP_block + \
Martel.Opt(RC_block) + \
Martel.Opt(RX) + \
RA_block + \
More information about the Biopython-dev
mailing list