[Biopython-dev] BioSQL and PostgreSQL
Yves Bastide
ybastide at irisa.fr
Mon Nov 18 17:15:37 EST 2002
Hi,
I've adapted BioSQL/*.py to work with PostgreSQL. Well, at least the
tests pass... Patch attached, comments are welcome.
A few points:
* I've made utility classes for nonstandard SQL/DBAPI operations, e.g.
retrieving the id of the last inserted record. (The implementation is
so-so, I know :). There are Generic_dbutils, Mysql_dbutils and
Pg_dbutils. The choice is made according to the DBAPI module's name;
only MySQLdb and psycopg are currently recognised.
* Replaced count = self.cursor.execute by self.cursor.execute and fetch
* Likewise, self.adaptor.execute_one("INSERT ...") -->
self.adaptor.execute()
* SUBSTRING(biosequence_str, %s, %s) --> SUBSTRING(biosequence_str FROM
%s FOR %s). SQL is beautiful...
* I've modified test_BioSQL.py so that the first doesn't require a
schema (it will DROP DATABASE/CREATE DATABASE like the other ones). I've
also added tearDown()'s to close the database connections; otherwise,
psycopg cannot drop the db.
* Modified open_database: psycopg uses a dsn, not keyword arguments.
(Done but not included in the patch: adaptations to the current
biosql-schema CVS.)
Regards,
Yves
-------------- next part --------------
Index: BioSQL/BioSeqDatabase.py
===================================================================
RCS file: /home/repository/biopython/biopython/BioSQL/BioSeqDatabase.py,v
retrieving revision 1.10
diff -u -p -r1.10 BioSeqDatabase.py
--- BioSQL/BioSeqDatabase.py 2002/03/01 08:39:24 1.10
+++ BioSQL/BioSeqDatabase.py 2002/11/18 21:18:43
@@ -5,6 +5,7 @@ database, and is compatible with the Bio
"""
import BioSeq
import Loader
+import DBUtils
def open_database(driver = "MySQLdb", *args, **kwargs):
"""Main interface for loading a existing BioSQL-style database.
@@ -26,16 +27,49 @@ def open_database(driver = "MySQLdb", *a
"""
module = __import__(driver)
connect = getattr(module, "connect")
- conn = connect(*args, **kwargs)
+ try:
+ conn = connect(*args, **kwargs)
+ except TypeError:
+ # Perhaps a version of psycopg with different parameter style
+ # (i.e., dsn="dbname= host= user=..."
+ # FIXME: doesn't use the args array
+ kw = kwargs.copy()
+
+ for k in kw.keys():
+ if kw[k] == '': del kw[k]
+
+ if kw.has_key('db'):
+ kw['dbname'] = kw['db']
+ del kw['db']
+ if kw.has_key('passwd'):
+ kw['password'] = kw['passwd']
+ del kw['passwd']
+
+ # PostgreSQL needs a database to connect to, so use a
+ # system one if needed
+ # FIXME: might be a security risk?
+ # So currently, we keep dbname=user
+## if not kw.has_key('dbname'):
+## kw['dbname'] = 'template1'
+
+ dsn = ' '.join(['='.join(i) for i in kw.items()])
+ conn = connect(dsn)
+
return DBServer(conn, module)
class DBServer:
def __init__(self, conn, module):
- self.conn = conn
self.module = module
- self.adaptor = Adaptor(self.conn)
+ if module.__name__ == 'psycopg':
+ create_dbutils = DBUtils.create_Pg_dbutils
+ elif module.__name__ == 'MySQLdb':
+ create_dbutils = DBUtils.create_Mysql_dbutils
+ else:
+ create_dbutils = DBUtils.create_Generic_dbutils
+ self.adaptor = Adaptor(conn, create_dbutils)
+
def __repr__(self):
- return self.__class__.__name__ + "(%r)" % self.conn
+ return self.__class__.__name__ + "(%r)" % self.adaptor.conn
def __getitem__(self, name):
return BioSeqDatabase(self.adaptor, name)
def keys(self):
@@ -58,7 +92,7 @@ class DBServer:
# make the database
sql = r"INSERT INTO biodatabase (name) VALUES" \
r" (%s)"
- self.adaptor.execute_one(sql, (db_name))
+ self.adaptor.execute(sql, (db_name,))
return BioSeqDatabase(self.adaptor, db_name)
def load_database_sql(self, sql_file):
@@ -74,6 +108,8 @@ class DBServer:
for line in sql_handle.xreadlines():
if line.find("#") == 0: # don't include comment lines
pass
+ elif line.find("--") == 0: # ditto, SQL std. comments
+ pass
elif line.strip(): # only include non-blank lines
sql += line.strip()
sql += ' '
@@ -84,40 +120,48 @@ class DBServer:
self.adaptor.cursor.execute(sql_line, ())
class Adaptor:
- def __init__(self, conn):
+ def __init__(self, conn, create_dbutils):
self.conn = conn
self.cursor = conn.cursor()
+ self.dbutils = create_dbutils()##self.conn, self.cursor)
+
+ def last_id(self, table):
+ return self.dbutils.last_id(self.cursor, table)
+ def autocommit(self, y = 1):
+ return self.dbutils.autocommit(self.conn, y)
+
def fetch_dbid_by_dbname(self, dbname):
- count = self.cursor.execute(
+ self.cursor.execute(
r"select biodatabase_id from biodatabase where name = %s",
(dbname,))
- if count == 0:
+ rv = self.cursor.fetchall()
+ if not rv:
raise KeyError("Cannot find biodatabase with name %r" % dbname)
- assert count == 1, "More than one biodatabase with name %r" % dbname
- return self.cursor.fetchone()[0]
+ assert len(rv) == 1, "More than one biodatabase with name %r" % dbname
+ return rv[0][0]
def fetch_seqid_by_display_id(self, dbid, name):
- count = self.cursor.execute(
+ self.cursor.execute(
r"select bioentry_id from bioentry where "
r" biodatabase_id = %s and display_id = %s",
(dbid, name))
- if count == 0:
+ rv = self.cursor.fetchall()
+ if not rv:
raise IndexError("Cannot find display id %r" % name)
- assert count == 1, "More than one entry with display id of %r" % name
- seqid, = self.cursor.fetchone()
- return seqid
+ assert len(rv) == 1, "More than one entry with display id of %r" % name
+ return rv[0][0]
def fetch_seqid_by_accession(self, dbid, name):
- count = self.cursor.execute(
+ self.cursor.execute(
r"select bioentry_id from bioentry where "
r" biodatabase_id = %s and accession = %s",
(dbid, name))
- if count == 0:
+ rv = self.cursor.fetchall()
+ if not rv:
raise IndexError("Cannot find accession %r" % name)
- assert count == 1, "More than one entry with accession of %r" % name
- seqid, = self.cursor.fetchone()
- return seqid
+ assert len(rv) == 1, "More than one entry with accession of %r" % name
+ return rv[0][0]
def fetch_seqid_by_seqid(self, dbid, seqid):
# XXX can't implement this right since it doesn't seem like the
@@ -151,9 +196,10 @@ class Adaptor:
return [field[0] for field in self.cursor.fetchall()]
def execute_one(self, sql, args):
- count = self.cursor.execute(sql, args)
- assert count == 1, "Expected 1 response, got %s" % count
- return self.cursor.fetchone()
+ self.cursor.execute(sql, args)
+ rv = self.cursor.fetchall()
+ assert len(rv) == 1, "Expected 1 response, got %s" % count
+ return rv[0]
def execute(self, sql, args):
"""Just execute an sql command.
@@ -163,7 +209,7 @@ class Adaptor:
def get_subseq_as_string(self, seqid, start, end):
length = end - start
return self.execute_one(
- """select SUBSTRING(biosequence_str, %s, %s)
+ """select SUBSTRING(biosequence_str FROM %s FOR %s)
from biosequence where bioentry_id = %s""",
(start+1, length, seqid))[0]
Index: BioSQL/Loader.py
===================================================================
RCS file: /home/repository/biopython/biopython/BioSQL/Loader.py,v
retrieving revision 1.8
diff -u -p -r1.8 Loader.py
--- BioSQL/Loader.py 2002/05/28 13:05:42 1.8
+++ BioSQL/Loader.py 2002/11/18 21:18:43
@@ -57,7 +57,7 @@ class DatabaseLoader:
else:
sql = r"INSERT INTO ontology_term (term_name, term_definition)" \
r"VALUES (%s, %s)"
- self.adaptor.execute_one(sql, (term_name, term_description))
+ self.adaptor.execute(sql, (term_name, term_description))
# recursively call this to give back the id
return self._get_ontology_id(term_name, term_description)
@@ -78,12 +78,10 @@ class DatabaseLoader:
sql = r"INSERT INTO bioentry (biodatabase_id, display_id, " \
r"accession, entry_version, division) VALUES" \
r" (%s, %s, %s, %s, %s)"
- self.adaptor.execute_one(sql, (self.dbid, record.name,
- accession, version, division))
+ self.adaptor.execute(sql, (self.dbid, record.name,
+ accession, version, division))
# now retrieve the id for the bioentry
- sql = r"SELECT max(bioentry_id) FROM bioentry"
- results = self.adaptor.execute_one(sql, ())
- bioentry_id = results[0]
+ bioentry_id = self.adaptor.last_id('bioentry')
return bioentry_id
@@ -100,7 +98,7 @@ class DatabaseLoader:
date_id = self._get_ontology_id("date", "Sequence date")
sql = r"INSERT INTO bioentry_qualifier_value VALUES" \
r" (%s, %s, %s)"
- self.adaptor.execute_one(sql, (bioentry_id, date_id, date))
+ self.adaptor.execute(sql, (bioentry_id, date_id, date))
def _load_bioentry_taxa(self, record, bioentry_id):
"""Add taxa information to the database.
@@ -109,9 +107,9 @@ class DatabaseLoader:
try:
# XXX this isn't right, we need taxa ids and other junk
taxa = record.annotations["taxa"]
- sql = r"INSERT INTO bioentry_taxa VALUES" \
+ sql = r"INSERT INTO bioentry_taxa(bioentry_id, taxa_id) VALUES" \
r" (%s, %s)"
- self.adapter.execute_one(sql, (bioentry_id, taxa))
+ self.adapter.execute(sql, (bioentry_id, taxa))
except KeyError:
pass
@@ -121,26 +119,26 @@ class DatabaseLoader:
accession, version = record.id.split(".")
# determine the string representation of the alphabet
if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet):
- mol_type = "DNA"
+ alphabet = "DNA"
elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet):
- mol_type = "RNA"
+ alphabet = "RNA"
elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet):
- mol_type = "PROTEIN"
+ alphabet = "PROTEIN"
else:
- mol_type = "UNKNOWN"
+ alphabet = "UNKNOWN"
sql = r"INSERT INTO biosequence (bioentry_id, seq_version, " \
r"biosequence_str, molecule) VALUES (%s, %s, %s, %s)"
- self.adaptor.execute_one(sql, (bioentry_id, version, record.seq.data,
- mol_type))
+ self.adaptor.execute(sql, (bioentry_id, version, record.seq.data,
+ alphabet))
def _load_bioentry_description(self, record, bioentry_id):
"""Load the description table.
"""
descr_id = self._get_ontology_id("description", "Sequence description")
sql = r"INSERT INTO bioentry_qualifier_value VALUES (%s, %s, %s)"
- self.adaptor.execute_one(sql, (bioentry_id, descr_id,
- record.description))
+ self.adaptor.execute(sql, (bioentry_id, descr_id,
+ record.description))
def _load_seqfeature(self, feature, feature_rank, bioentry_id):
"""Load a biopython SeqFeature into the database.
@@ -161,11 +159,9 @@ class DatabaseLoader:
# XXX This doesn't do source yet, since I'm not sure I understand it.
sql = r"INSERT INTO seqfeature (bioentry_id, seqfeature_key_id, " \
r"seqfeature_rank) VALUES (%s, %s, %s)"
- self.adaptor.execute_one(sql, (bioentry_id, seqfeature_key_id,
- feature_rank))
- sql = r"SELECT max(seqfeature_id) FROM seqfeature"
- results = self.adaptor.execute_one(sql, ())
- seqfeature_id = results[0]
+ self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id,
+ feature_rank))
+ seqfeature_id = self.adaptor.last_id('seqfeature')
return seqfeature_id
@@ -212,7 +208,7 @@ class DatabaseLoader:
start = feature.location.nofuzzy_start + 1
end = feature.location.nofuzzy_end
- self.adaptor.execute_one(sql, (seqfeature_id, start, end, strand, rank))
+ self.adaptor.execute(sql, (seqfeature_id, start, end, strand, rank))
def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
"""Insert the (key, value) pair qualifiers relating to a feature.
@@ -228,11 +224,11 @@ class DatabaseLoader:
qualifier_value = qualifiers[qualifier_key][qual_value_rank]
sql = r"INSERT INTO seqfeature_qualifier_value VALUES" \
r" (%s, %s, %s, %s)"
- self.adaptor.execute_one(sql, (seqfeature_id,
+ self.adaptor.execute(sql, (seqfeature_id,
qualifier_key_id, qual_value_rank, qualifier_value))
class DatabaseRemover:
- """Compliment the Loader functionality by fully removing a database.
+ """Complement the Loader functionality by fully removing a database.
This probably isn't really useful for normal purposes, since you
can just do a:
@@ -242,6 +238,7 @@ class DatabaseRemover:
XXX I think this might be the worst optimized SQL in the history
of the world. There is probably a much better way to do it.
+ [The "right" way is of course to have FKs--YB]
"""
def __init__(self, adaptor, dbid):
"""Initialize with a database id and adaptor connection.
Index: Doc/Tutorial.tex
===================================================================
RCS file: /home/repository/biopython/biopython/Doc/Tutorial.tex,v
retrieving revision 1.28
diff -u -p -r1.28 Tutorial.tex
--- Doc/Tutorial.tex 2002/10/17 03:21:32 1.28
+++ Doc/Tutorial.tex 2002/11/18 21:18:45
@@ -3323,7 +3323,7 @@ detect likely problems. I will give seve
\ref{problem structures}.
\begin{figure}
-{\par\centering \resizebox*{0.75\textwidth}{!}{\includegraphics{smcra.eps}} \par}
+{\par\centering \resizebox*{0.75\textwidth}{!}{\includegraphics{smcra}} \par}
\caption{\label{SMCRA UML diagram}UML diagram of the SMCRA data structure used to represent
a macromolecular structure.}
\end{figure}
Index: Tests/test_BioSQL.py
===================================================================
RCS file: /home/repository/biopython/biopython/Tests/test_BioSQL.py,v
retrieving revision 1.9
diff -u -p -r1.9 test_BioSQL.py
--- Tests/test_BioSQL.py 2002/01/31 18:31:07 1.9
+++ Tests/test_BioSQL.py 2002/11/18 21:18:45
@@ -1,8 +1,5 @@
#!/usr/bin/env python
"""Tests for dealing with storage of biopython objects in a relational db.
-
-Currently these tests require a MySQL db loaded with the GenBank info
-in GenBank/cor6_6.gb. This loading can be done with bioperl-db.
"""
# standard library
import sys
@@ -20,14 +17,21 @@ from Bio import GenBank
from BioSQL import BioSeqDatabase
from BioSQL import BioSeq
-# Constants for the MySQL database
+##DBDRIVER = 'MySQLdb'
+##DBTYPE = 'mysql'
+DBDRIVER = 'psycopg'
+DBTYPE = 'pg'
+
+# Works for mysql and postgresql, not oracle
+DBSCHEMA = "biosqldb-" + DBTYPE + ".sql"
+# Constants for the database driver
DBHOST = 'localhost'
-DBUSER = 'chapmanb'
+DBUSER = ''
DBPASSWD = ''
TESTDB = 'biosql'
# XXX I need to put these SQL files somewhere in biopython
SQL_FILE = os.path.join(os.pardir, os.pardir, "biosql-schema", "sql",
- "biosqldb-mysql.sql")
+ DBSCHEMA)
def run_tests(argv):
test_suite = testing_suite()
@@ -49,31 +53,57 @@ def testing_suite():
return test_suite
-def load_database(gb_handle):
- """Load a GenBank file into a BioSQL database.
-
- This is useful for running tests against a newly created database.
- """
+def create_database():
+ """Create an empty BioSQL database."""
# first open a connection to create the database
- server = BioSeqDatabase.open_database(user = DBUSER, passwd = DBPASSWD,
+ server = BioSeqDatabase.open_database(driver = DBDRIVER,
+ user = DBUSER, passwd = DBPASSWD,
host = DBHOST)
-
+
+ # Auto-commit: postgresql cannot drop database in a transaction
+ try:
+ server.adaptor.autocommit()
+ except AttributeError:
+ pass
+
# drop anything in the database
try:
sql = r"DROP DATABASE " + TESTDB
server.adaptor.cursor.execute(sql, ())
except server.module.OperationalError: # the database doesn't exist
pass
+ except server.module.IntegrityError, e: # ditto--perhaps
+ if str(e).find('database "%s" does not exist' % TESTDB) > 0:
+ pass
+ else:
+ raise
# create a new database
sql = r"CREATE DATABASE " + TESTDB
- server.adaptor.execute_one(sql, ())
+ server.adaptor.execute(sql, ())
+
+ server.adaptor.conn.close()
+
+ # now open a connection to load the database
+ server = BioSeqDatabase.open_database(driver = DBDRIVER,
+ user = DBUSER, passwd = DBPASSWD,
+ host = DBHOST, db = TESTDB)
+ server.load_database_sql(SQL_FILE)
+ server.adaptor.conn.commit()
+ server.adaptor.conn.close()
+
+def load_database(gb_handle):
+ """Load a GenBank file into a BioSQL database.
+ This is useful for running tests against a newly created database.
+ """
+
+ create_database()
# now open a connection to load the database
db_name = "biosql-test"
- server = BioSeqDatabase.open_database(user = DBUSER, passwd = DBPASSWD,
+ server = BioSeqDatabase.open_database(driver = DBDRIVER,
+ user = DBUSER, passwd = DBPASSWD,
host = DBHOST, db = TESTDB)
- server.load_database_sql(SQL_FILE)
db = server.new_database(db_name)
# get the GenBank file we are going to put into it
@@ -81,6 +111,8 @@ def load_database(gb_handle):
iterator = GenBank.Iterator(gb_handle, parser)
# finally put it in the database
db.load(iterator)
+ server.adaptor.conn.commit()
+ server.adaptor.conn.close()
class ReadTest(unittest.TestCase):
"""Test reading a database from an already built database.
@@ -95,11 +127,17 @@ class ReadTest(unittest.TestCase):
load_database(gb_handle)
gb_handle.close()
- server = BioSeqDatabase.open_database(user = DBUSER,
- passwd = DBPASSWD, host = DBHOST, db = TESTDB)
+ server = BioSeqDatabase.open_database(driver = DBDRIVER,
+ user = DBUSER,
+ passwd = DBPASSWD,
+ host = DBHOST, db = TESTDB)
self.db = server["biosql-test"]
+ def tearDown(self):
+ self.db.adaptor.conn.close()
+ del self.db
+
def t_get_db_items(self):
"""Get a list of all items in the database.
"""
@@ -139,10 +177,16 @@ class SeqInterfaceTest(unittest.TestCase
load_database(gb_handle)
gb_handle.close()
- server = BioSeqDatabase.open_database(user = DBUSER, passwd = DBPASSWD,
+ server = BioSeqDatabase.open_database(driver = DBDRIVER,
+ user = DBUSER, passwd = DBPASSWD,
host = DBHOST, db = TESTDB)
- db = server["biosql-test"]
- self.item = db.lookup(accession = "X62281")
+ self.db = server["biosql-test"]
+ self.item = self.db.lookup(accession = "X62281")
+
+ def tearDown(self):
+ self.db.adaptor.conn.close()
+ del self.db
+ del self.item
def t_seq_record(self):
"""Make sure SeqRecords from BioSQL implement the right interface.
@@ -217,9 +261,14 @@ class LoaderTest(unittest.TestCase):
"""Load a database from a GenBank file.
"""
def setUp(self):
+
+ # create TESTDB
+ create_database()
+
# load the database
db_name = "biosql-test"
- server = BioSeqDatabase.open_database(user = DBUSER, passwd = DBPASSWD,
+ server = BioSeqDatabase.open_database(driver = DBDRIVER,
+ user = DBUSER, passwd = DBPASSWD,
host = DBHOST, db = TESTDB)
# remove the database if it already exists
@@ -237,6 +286,10 @@ class LoaderTest(unittest.TestCase):
parser = GenBank.FeatureParser()
self.iterator = GenBank.Iterator(handle, parser)
+ def tearDown(self):
+ self.db.adaptor.conn.close()
+ del self.db
+
def t_load_database(self):
"""Load SeqRecord objects into a BioSQL database.
"""
@@ -267,9 +320,14 @@ class InDepthLoadTest(unittest.TestCase)
load_database(gb_handle)
gb_handle.close()
- server = BioSeqDatabase.open_database(user = DBUSER, passwd = DBPASSWD,
+ server = BioSeqDatabase.open_database(driver = DBDRIVER,
+ user = DBUSER, passwd = DBPASSWD,
host = DBHOST, db = TESTDB)
self.db = server["biosql-test"]
+
+ def tearDown(self):
+ self.db.adaptor.conn.close()
+ del self.db
def t_record_loading(self):
"""Make sure all records are correctly loaded.
--- /dev/null 1970-01-01 01:00:00.000000000 +0100
+++ BioSQL/DBUtils.py 2002-11-18 22:19:07.000000000 +0100
@@ -0,0 +1,59 @@
+class Generic_dbutils:
+ def __init__(self):
+ pass
+
+ def tname(self, table):
+ if table != 'biosequence': return table
+ else: return 'bioentry'
+
+# Disabled: better safe than sorry
+## def next_id(self, cursor, table):
+## # XXX brain-dead! Hopefully, the database will enforce PK unicity..
+## table = self.tname(table)
+## sql = r"select 1+max(%s_id) from %s" % (table, table)
+## cursor.execute(sql)
+## rv = cursor.fetchone()
+## return rv[0]
+
+ def last_id(self, cursor, table):
+ # XXX: Unsafe without transactions isolation
+ table = self.tname(table)
+ sql = r"select max(%s_id) from %s" % table
+ cursor.execute(sql)
+ rv = cursor.fetchone()
+ return rv[0]
+
+ def autocommit(self, conn, y = 1):
+ # Let's hope it was not really needed
+ pass
+
+class Mysql_dbutils(Generic_dbutils):
+ def last_id(self, cursor, table):
+ return cursor.insert_id()
+
+class Pg_dbutils(Generic_dbutils):
+ def next_id(self, cursor, table):
+ table = self.tname(table)
+ sql = r"select nextval('%s_pk_seq')" % table
+ cursor.execute(sql)
+ rv = cursor.fetchone()
+ return rv[0]
+
+ def last_id(self, cursor, table):
+ table = self.tname(table)
+ sql = r"select currval('%s_pk_seq')" % table
+ cursor.execute(sql)
+ rv = cursor.fetchone()
+ return rv[0]
+
+ def autocommit(self, conn, y = 1):
+ conn.autocommit(y)
+
+def create_Generic_dbutils():
+ return Generic_dbutils()
+
+def create_Mysql_dbutils():
+ return Mysql_dbutils()
+
+def create_Pg_dbutils():
+ return Pg_dbutils()
More information about the Biopython-dev
mailing list