[Biopython-dev] BioSQL and PostgreSQL

Mon Nov 18 17:15:37 EST 2002

Hi,

I've adapted BioSQL/*.py to work with PostgreSQL.  Well, at least the 
tests pass... Patch attached, comments are welcome.

A few points:

* I've made utility classes for nonstandard SQL/DBAPI operations, e.g. 
retrieving the id of the last inserted record.  (The implementation is 
so-so, I know :).  There are Generic_dbutils, Mysql_dbutils and 
Pg_dbutils.  The choice is made according to the DBAPI module's name; 
only MySQLdb and psycopg are currently recognised.

* Replaced count = self.cursor.execute by self.cursor.execute and fetch

* Likewise, self.adaptor.execute_one("INSERT ...") --> 
self.adaptor.execute()

* SUBSTRING(biosequence_str, %s, %s) --> SUBSTRING(biosequence_str FROM 
%s FOR %s).  SQL is beautiful...

* I've modified test_BioSQL.py so that the first doesn't require a 
schema (it will DROP DATABASE/CREATE DATABASE like the other ones). I've 
also added tearDown()'s to close the database connections; otherwise, 
psycopg cannot drop the db.

* Modified open_database: psycopg uses a dsn, not keyword arguments.


(Done but not included in the patch: adaptations to the current 
biosql-schema CVS.)


Regards,

Yves
-------------- next part --------------
Index: BioSQL/BioSeqDatabase.py
===================================================================
RCS file: /home/repository/biopython/biopython/BioSQL/BioSeqDatabase.py,v
retrieving revision 1.10
diff -u -p -r1.10 BioSeqDatabase.py

--- BioSQL/BioSeqDatabase.py	2002/03/01 08:39:24	1.10
+++ BioSQL/BioSeqDatabase.py	2002/11/18 21:18:43
@@ -5,6 +5,7 @@ database, and is compatible with the Bio
 """
 import BioSeq
 import Loader
+import DBUtils
 
 def open_database(driver = "MySQLdb", *args, **kwargs):
     """Main interface for loading a existing BioSQL-style database.
@@ -26,16 +27,49 @@ def open_database(driver = "MySQLdb", *a
     """
     module = __import__(driver)
     connect = getattr(module, "connect")
-    conn = connect(*args, **kwargs)
+    try:
+        conn = connect(*args, **kwargs)
+    except TypeError:
+        # Perhaps a version of psycopg with different parameter style
+        # (i.e., dsn="dbname= host= user=..."
+        # FIXME: doesn't use the args array
+        kw = kwargs.copy()
+
+        for k in kw.keys():
+            if kw[k] == '': del kw[k]
+
+        if kw.has_key('db'):
+            kw['dbname'] = kw['db']
+            del kw['db']
+        if kw.has_key('passwd'):
+            kw['password'] = kw['passwd']
+            del kw['passwd']
+
+        # PostgreSQL needs a database to connect to, so use a
+        # system one if needed
+        # FIXME: might be a security risk?
+        #        So currently, we keep dbname=user
+##        if not kw.has_key('dbname'):
+##            kw['dbname'] = 'template1'
+        
+        dsn = ' '.join(['='.join(i) for i in kw.items()])
+        conn = connect(dsn)
+    
     return DBServer(conn, module)
 
 class DBServer:
     def __init__(self, conn, module):
-        self.conn = conn
         self.module = module
-        self.adaptor = Adaptor(self.conn)
+        if module.__name__ == 'psycopg':
+            create_dbutils = DBUtils.create_Pg_dbutils
+        elif module.__name__ == 'MySQLdb':
+            create_dbutils = DBUtils.create_Mysql_dbutils
+        else:
+            create_dbutils = DBUtils.create_Generic_dbutils
+        self.adaptor = Adaptor(conn, create_dbutils)
+        
     def __repr__(self):
-        return self.__class__.__name__ + "(%r)" % self.conn
+        return self.__class__.__name__ + "(%r)" % self.adaptor.conn
     def __getitem__(self, name):
         return BioSeqDatabase(self.adaptor, name)
     def keys(self):
@@ -58,7 +92,7 @@ class DBServer:
         # make the database
         sql = r"INSERT INTO biodatabase (name) VALUES" \
               r" (%s)" 
-        self.adaptor.execute_one(sql, (db_name))
+        self.adaptor.execute(sql, (db_name,))
         return BioSeqDatabase(self.adaptor, db_name)
 
     def load_database_sql(self, sql_file):
@@ -74,6 +108,8 @@ class DBServer:
         for line in sql_handle.xreadlines():
             if line.find("#") == 0: # don't include comment lines
                 pass
+            elif line.find("--") == 0: # ditto, SQL std. comments
+                pass
             elif line.strip(): # only include non-blank lines
                 sql += line.strip()
                 sql += ' '
@@ -84,40 +120,48 @@ class DBServer:
             self.adaptor.cursor.execute(sql_line, ())
 
 class Adaptor:
-    def __init__(self, conn):
+    def __init__(self, conn, create_dbutils):
         self.conn = conn
         self.cursor = conn.cursor()
+        self.dbutils = create_dbutils()##self.conn, self.cursor)
+
+    def last_id(self, table):
+        return self.dbutils.last_id(self.cursor, table)
 
+    def autocommit(self, y = 1):
+        return self.dbutils.autocommit(self.conn, y)
+
     def fetch_dbid_by_dbname(self, dbname):
-        count = self.cursor.execute(
+        self.cursor.execute(
             r"select biodatabase_id from biodatabase where name = %s",
             (dbname,))
-        if count == 0:
+        rv = self.cursor.fetchall()
+        if not rv:
             raise KeyError("Cannot find biodatabase with name %r" % dbname)
-        assert count == 1, "More than one biodatabase with name %r" % dbname
-        return self.cursor.fetchone()[0]
+        assert len(rv) == 1, "More than one biodatabase with name %r" % dbname
+        return rv[0][0]
 
     def fetch_seqid_by_display_id(self, dbid, name):
-        count = self.cursor.execute(
+        self.cursor.execute(
             r"select bioentry_id from bioentry where "
             r"    biodatabase_id = %s and display_id = %s",
             (dbid, name))
-        if count == 0:
+        rv = self.cursor.fetchall()
+        if not rv:
             raise IndexError("Cannot find display id %r" % name)
-        assert count == 1, "More than one entry with display id of %r" % name
-        seqid, = self.cursor.fetchone()
-        return seqid
+        assert len(rv) == 1, "More than one entry with display id of %r" % name
+        return rv[0][0]
 
     def fetch_seqid_by_accession(self, dbid, name):
-        count = self.cursor.execute(
+        self.cursor.execute(
             r"select bioentry_id from bioentry where "
             r"    biodatabase_id = %s and accession = %s",
             (dbid, name))
-        if count == 0:
+        rv = self.cursor.fetchall()
+        if not rv:
             raise IndexError("Cannot find accession %r" % name)
-        assert count == 1, "More than one entry with accession of %r" % name
-        seqid, = self.cursor.fetchone()
-        return seqid
+        assert len(rv) == 1, "More than one entry with accession of %r" % name
+        return rv[0][0]
 
     def fetch_seqid_by_seqid(self, dbid, seqid):
         # XXX can't implement this right since it doesn't seem like the 
@@ -151,9 +196,10 @@ class Adaptor:
         return [field[0] for field in self.cursor.fetchall()]
 
     def execute_one(self, sql, args):
-        count = self.cursor.execute(sql, args)
-        assert count == 1, "Expected 1 response, got %s" % count
-        return self.cursor.fetchone()
+        self.cursor.execute(sql, args)
+        rv = self.cursor.fetchall()
+        assert len(rv) == 1, "Expected 1 response, got %s" % count
+        return rv[0]
 
     def execute(self, sql, args):
         """Just execute an sql command.
@@ -163,7 +209,7 @@ class Adaptor:
     def get_subseq_as_string(self, seqid, start, end):
         length = end - start
         return self.execute_one(
-            """select SUBSTRING(biosequence_str, %s, %s)
+            """select SUBSTRING(biosequence_str FROM %s FOR %s)
                      from biosequence where bioentry_id = %s""",
             (start+1, length, seqid))[0]
 
Index: BioSQL/Loader.py
===================================================================
RCS file: /home/repository/biopython/biopython/BioSQL/Loader.py,v
retrieving revision 1.8
diff -u -p -r1.8 Loader.py
--- BioSQL/Loader.py	2002/05/28 13:05:42	1.8
+++ BioSQL/Loader.py	2002/11/18 21:18:43
@@ -57,7 +57,7 @@ class DatabaseLoader:
         else:
             sql = r"INSERT INTO ontology_term (term_name, term_definition)" \
                   r"VALUES (%s, %s)"
-            self.adaptor.execute_one(sql, (term_name, term_description))
+            self.adaptor.execute(sql, (term_name, term_description))
             # recursively call this to give back the id
             return self._get_ontology_id(term_name, term_description)
    
@@ -78,12 +78,10 @@ class DatabaseLoader:
         sql = r"INSERT INTO bioentry (biodatabase_id, display_id, " \
               r"accession, entry_version, division) VALUES" \
               r" (%s, %s, %s, %s, %s)"
-        self.adaptor.execute_one(sql, (self.dbid, record.name, 
-                                       accession, version, division))
+        self.adaptor.execute(sql, (self.dbid, record.name, 
+                                   accession, version, division))
         # now retrieve the id for the bioentry
-        sql = r"SELECT max(bioentry_id) FROM bioentry"
-        results = self.adaptor.execute_one(sql, ())
-        bioentry_id = results[0]
+        bioentry_id = self.adaptor.last_id('bioentry')
 
         return bioentry_id
 
@@ -100,7 +98,7 @@ class DatabaseLoader:
         date_id = self._get_ontology_id("date", "Sequence date")
         sql = r"INSERT INTO bioentry_qualifier_value VALUES" \
               r" (%s, %s, %s)" 
-        self.adaptor.execute_one(sql, (bioentry_id, date_id, date))
+        self.adaptor.execute(sql, (bioentry_id, date_id, date))
 
     def _load_bioentry_taxa(self, record, bioentry_id):
         """Add taxa information to the database.
@@ -109,9 +107,9 @@ class DatabaseLoader:
         try:
             # XXX this isn't right, we need taxa ids and other junk
             taxa = record.annotations["taxa"]
-            sql = r"INSERT INTO bioentry_taxa VALUES" \
+            sql = r"INSERT INTO bioentry_taxa(bioentry_id, taxa_id) VALUES" \
                   r" (%s, %s)" 
-            self.adapter.execute_one(sql, (bioentry_id, taxa))
+            self.adapter.execute(sql, (bioentry_id, taxa))
         except KeyError:
             pass
 
@@ -121,26 +119,26 @@ class DatabaseLoader:
         accession, version = record.id.split(".")
         # determine the string representation of the alphabet
         if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet):
-            mol_type = "DNA"
+            alphabet = "DNA"
         elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet):
-            mol_type = "RNA"
+            alphabet = "RNA"
         elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet):
-            mol_type = "PROTEIN"
+            alphabet = "PROTEIN"
         else:
-            mol_type = "UNKNOWN"
+            alphabet = "UNKNOWN"
         
         sql = r"INSERT INTO biosequence (bioentry_id, seq_version, " \
               r"biosequence_str, molecule) VALUES (%s, %s, %s, %s)"
-        self.adaptor.execute_one(sql, (bioentry_id, version, record.seq.data,
-                                       mol_type))
+        self.adaptor.execute(sql, (bioentry_id, version, record.seq.data,
+                                   alphabet))
 
     def _load_bioentry_description(self, record, bioentry_id):
         """Load the description table.
         """
         descr_id = self._get_ontology_id("description", "Sequence description")
         sql = r"INSERT INTO bioentry_qualifier_value VALUES (%s, %s, %s)"
-        self.adaptor.execute_one(sql, (bioentry_id, descr_id, 
-                                       record.description))
+        self.adaptor.execute(sql, (bioentry_id, descr_id, 
+                                   record.description))
 
     def _load_seqfeature(self, feature, feature_rank, bioentry_id):
         """Load a biopython SeqFeature into the database.
@@ -161,11 +159,9 @@ class DatabaseLoader:
         # XXX This doesn't do source yet, since I'm not sure I understand it.
         sql = r"INSERT INTO seqfeature (bioentry_id, seqfeature_key_id, " \
               r"seqfeature_rank) VALUES (%s, %s, %s)"
-        self.adaptor.execute_one(sql, (bioentry_id, seqfeature_key_id,
-                                       feature_rank))
-        sql = r"SELECT max(seqfeature_id) FROM seqfeature"
-        results = self.adaptor.execute_one(sql, ())
-        seqfeature_id = results[0]
+        self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id,
+                                   feature_rank))
+        seqfeature_id = self.adaptor.last_id('seqfeature')
 
         return seqfeature_id
 
@@ -212,7 +208,7 @@ class DatabaseLoader:
         start = feature.location.nofuzzy_start + 1
         end = feature.location.nofuzzy_end 
             
-        self.adaptor.execute_one(sql, (seqfeature_id, start, end, strand, rank))
+        self.adaptor.execute(sql, (seqfeature_id, start, end, strand, rank))
 
     def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
         """Insert the (key, value) pair qualifiers relating to a feature.
@@ -228,11 +224,11 @@ class DatabaseLoader:
                 qualifier_value = qualifiers[qualifier_key][qual_value_rank]
                 sql = r"INSERT INTO seqfeature_qualifier_value VALUES" \
                       r" (%s, %s, %s, %s)"
-                self.adaptor.execute_one(sql, (seqfeature_id,
+                self.adaptor.execute(sql, (seqfeature_id,
                   qualifier_key_id, qual_value_rank, qualifier_value))
        
 class DatabaseRemover:
-    """Compliment the Loader functionality by fully removing a database.
+    """Complement the Loader functionality by fully removing a database.
 
     This probably isn't really useful for normal purposes, since you
     can just do a:
@@ -242,6 +238,7 @@ class DatabaseRemover:
 
     XXX I think this might be the worst optimized SQL in the history
     of the world. There is probably a much better way to do it.
+    [The "right" way is of course to have FKs--YB]
     """
     def __init__(self, adaptor, dbid):
         """Initialize with a database id and adaptor connection.
Index: Doc/Tutorial.tex
===================================================================
RCS file: /home/repository/biopython/biopython/Doc/Tutorial.tex,v
retrieving revision 1.28
diff -u -p -r1.28 Tutorial.tex
--- Doc/Tutorial.tex	2002/10/17 03:21:32	1.28
+++ Doc/Tutorial.tex	2002/11/18 21:18:45
@@ -3323,7 +3323,7 @@ detect likely problems. I will give seve
 \ref{problem structures}. 
 
 \begin{figure}
-{\par\centering \resizebox*{0.75\textwidth}{!}{\includegraphics{smcra.eps}} \par}
+{\par\centering \resizebox*{0.75\textwidth}{!}{\includegraphics{smcra}} \par}
 \caption{\label{SMCRA UML diagram}UML diagram of the SMCRA data structure used to represent
 a macromolecular structure.}
 \end{figure}
Index: Tests/test_BioSQL.py
===================================================================
RCS file: /home/repository/biopython/biopython/Tests/test_BioSQL.py,v
retrieving revision 1.9
diff -u -p -r1.9 test_BioSQL.py
--- Tests/test_BioSQL.py	2002/01/31 18:31:07	1.9
+++ Tests/test_BioSQL.py	2002/11/18 21:18:45
@@ -1,8 +1,5 @@
 #!/usr/bin/env python
 """Tests for dealing with storage of biopython objects in a relational db.
-
-Currently these tests require a MySQL db loaded with the GenBank info
-in GenBank/cor6_6.gb. This loading can be done with bioperl-db.
 """
 # standard library
 import sys
@@ -20,14 +17,21 @@ from Bio import GenBank
 from BioSQL import BioSeqDatabase
 from BioSQL import BioSeq
 
-# Constants for the MySQL database
+##DBDRIVER = 'MySQLdb'
+##DBTYPE = 'mysql'
+DBDRIVER = 'psycopg'
+DBTYPE = 'pg'
+
+# Works for mysql and postgresql, not oracle
+DBSCHEMA = "biosqldb-" + DBTYPE + ".sql"
+# Constants for the database driver
 DBHOST = 'localhost'
-DBUSER = 'chapmanb'
+DBUSER = ''
 DBPASSWD = ''
 TESTDB = 'biosql'
 # XXX I need to put these SQL files somewhere in biopython
 SQL_FILE = os.path.join(os.pardir, os.pardir, "biosql-schema", "sql",
-                        "biosqldb-mysql.sql")
+                        DBSCHEMA)
 
 def run_tests(argv):
     test_suite = testing_suite()
@@ -49,31 +53,57 @@ def testing_suite():
 
     return test_suite
 
-def load_database(gb_handle):
-    """Load a GenBank file into a BioSQL database.
-    
-    This is useful for running tests against a newly created database.
-    """
+def create_database():
+    """Create an empty BioSQL database."""
     # first open a connection to create the database
-    server = BioSeqDatabase.open_database(user = DBUSER, passwd = DBPASSWD,
+    server = BioSeqDatabase.open_database(driver = DBDRIVER,
+                                          user = DBUSER, passwd = DBPASSWD,
                                           host = DBHOST)
-    
+
+    # Auto-commit: postgresql cannot drop database in a transaction
+    try:
+        server.adaptor.autocommit()
+    except AttributeError:
+        pass
+
     # drop anything in the database
     try:
         sql = r"DROP DATABASE " + TESTDB
         server.adaptor.cursor.execute(sql, ())
     except server.module.OperationalError: # the database doesn't exist
         pass
+    except server.module.IntegrityError, e: # ditto--perhaps
+        if str(e).find('database "%s" does not exist' % TESTDB) > 0:
+            pass
+        else:
+            raise
     
     # create a new database
     sql = r"CREATE DATABASE " + TESTDB
-    server.adaptor.execute_one(sql, ())
+    server.adaptor.execute(sql, ())
+
+    server.adaptor.conn.close()
+
+    # now open a connection to load the database
+    server = BioSeqDatabase.open_database(driver = DBDRIVER,
+                                          user = DBUSER, passwd = DBPASSWD,
+                                          host = DBHOST, db = TESTDB)
+    server.load_database_sql(SQL_FILE)
+    server.adaptor.conn.commit()
+    server.adaptor.conn.close()
+
+def load_database(gb_handle):
+    """Load a GenBank file into a BioSQL database.
     
+    This is useful for running tests against a newly created database.
+    """
+
+    create_database()
     # now open a connection to load the database
     db_name = "biosql-test"
-    server = BioSeqDatabase.open_database(user = DBUSER, passwd = DBPASSWD,
+    server = BioSeqDatabase.open_database(driver = DBDRIVER,
+                                          user = DBUSER, passwd = DBPASSWD,
                                           host = DBHOST, db = TESTDB)
-    server.load_database_sql(SQL_FILE)
     db = server.new_database(db_name)
     
     # get the GenBank file we are going to put into it
@@ -81,6 +111,8 @@ def load_database(gb_handle):
     iterator = GenBank.Iterator(gb_handle, parser)
     # finally put it in the database
     db.load(iterator)
+    server.adaptor.conn.commit()
+    server.adaptor.conn.close()
 
 class ReadTest(unittest.TestCase):
     """Test reading a database from an already built database.
@@ -95,11 +127,17 @@ class ReadTest(unittest.TestCase):
         load_database(gb_handle)
         gb_handle.close()
             
-        server = BioSeqDatabase.open_database(user = DBUSER, 
-                     passwd = DBPASSWD, host = DBHOST, db = TESTDB)
+        server = BioSeqDatabase.open_database(driver = DBDRIVER,
+                                              user = DBUSER, 
+                                              passwd = DBPASSWD,
+                                              host = DBHOST, db = TESTDB)
             
         self.db = server["biosql-test"]
 
+    def tearDown(self):
+        self.db.adaptor.conn.close()
+        del self.db
+
     def t_get_db_items(self):
         """Get a list of all items in the database.
         """
@@ -139,10 +177,16 @@ class SeqInterfaceTest(unittest.TestCase
         load_database(gb_handle)
         gb_handle.close()
 
-        server = BioSeqDatabase.open_database(user = DBUSER, passwd = DBPASSWD,
+        server = BioSeqDatabase.open_database(driver = DBDRIVER,
+                                              user = DBUSER, passwd = DBPASSWD,
                                               host = DBHOST, db = TESTDB)
-        db = server["biosql-test"]
-        self.item = db.lookup(accession = "X62281")
+        self.db = server["biosql-test"]
+        self.item = self.db.lookup(accession = "X62281")
+
+    def tearDown(self):
+        self.db.adaptor.conn.close()
+        del self.db
+        del self.item
     
     def t_seq_record(self):
         """Make sure SeqRecords from BioSQL implement the right interface.
@@ -217,9 +261,14 @@ class LoaderTest(unittest.TestCase):
     """Load a database from a GenBank file.
     """
     def setUp(self):
+
+        # create TESTDB
+        create_database()
+        
         # load the database
         db_name = "biosql-test"
-        server = BioSeqDatabase.open_database(user = DBUSER, passwd = DBPASSWD,
+        server = BioSeqDatabase.open_database(driver = DBDRIVER,
+                                              user = DBUSER, passwd = DBPASSWD,
                                               host = DBHOST, db = TESTDB)
         
         # remove the database if it already exists
@@ -237,6 +286,10 @@ class LoaderTest(unittest.TestCase):
         parser = GenBank.FeatureParser()
         self.iterator = GenBank.Iterator(handle, parser)
 
+    def tearDown(self):
+        self.db.adaptor.conn.close()
+        del self.db
+
     def t_load_database(self):
         """Load SeqRecord objects into a BioSQL database.
         """
@@ -267,9 +320,14 @@ class InDepthLoadTest(unittest.TestCase)
         load_database(gb_handle)
         gb_handle.close()
 
-        server = BioSeqDatabase.open_database(user = DBUSER, passwd = DBPASSWD,
+        server = BioSeqDatabase.open_database(driver = DBDRIVER,
+                                              user = DBUSER, passwd = DBPASSWD,
                                               host = DBHOST, db = TESTDB)
         self.db = server["biosql-test"]
+
+    def tearDown(self):
+        self.db.adaptor.conn.close()
+        del self.db
 
     def t_record_loading(self):
         """Make sure all records are correctly loaded.
--- /dev/null	1970-01-01 01:00:00.000000000 +0100
+++ BioSQL/DBUtils.py	2002-11-18 22:19:07.000000000 +0100
@@ -0,0 +1,59 @@
+class Generic_dbutils:
+    def __init__(self):
+        pass
+
+    def tname(self, table):
+        if table != 'biosequence': return table
+        else: return 'bioentry'
+
+# Disabled: better safe than sorry
+##    def next_id(self, cursor, table):
+##        # XXX brain-dead! Hopefully, the database will enforce PK unicity..
+##        table = self.tname(table)
+##        sql = r"select 1+max(%s_id) from %s" % (table, table)
+##        cursor.execute(sql)
+##        rv = cursor.fetchone()
+##        return rv[0]
+        
+    def last_id(self, cursor, table):
+        # XXX: Unsafe without transactions isolation
+        table = self.tname(table)
+        sql = r"select max(%s_id) from %s" % table
+        cursor.execute(sql)
+        rv = cursor.fetchone()
+        return rv[0]
+
+    def autocommit(self, conn, y = 1):
+        # Let's hope it was not really needed
+        pass
+
+class Mysql_dbutils(Generic_dbutils):
+    def last_id(self, cursor, table):
+        return cursor.insert_id()
+
+class Pg_dbutils(Generic_dbutils):
+    def next_id(self, cursor, table):
+        table = self.tname(table)
+        sql = r"select nextval('%s_pk_seq')" % table
+        cursor.execute(sql)
+        rv = cursor.fetchone()
+        return rv[0]
+        
+    def last_id(self, cursor, table):
+        table = self.tname(table)
+        sql = r"select currval('%s_pk_seq')" % table
+        cursor.execute(sql)
+        rv = cursor.fetchone()
+        return rv[0]
+
+    def autocommit(self, conn, y = 1):
+        conn.autocommit(y)
+
+def create_Generic_dbutils():
+    return Generic_dbutils()
+
+def create_Mysql_dbutils():
+    return Mysql_dbutils()
+
+def create_Pg_dbutils():
+    return Pg_dbutils()