[Biopython-dev] Re: BioSQL upgrade [3 of 3]

Fri Nov 22 17:48:53 EST 2002

The big one.
-------------- next part --------------
Index: BioSQL/BioSeq.py
===================================================================
RCS file: /home/repository/biopython/biopython/BioSQL/BioSeq.py,v
retrieving revision 1.8
diff -u -p -r1.8 BioSeq.py

--- BioSQL/BioSeq.py	2002/03/01 12:22:05	1.8
+++ BioSQL/BioSeq.py	2002/11/22 22:13:21
@@ -14,7 +14,7 @@ class DBSeq:  # This implements the biop
     def __getattr__(self, name):
         if name == "data":
             return self.tostring()
-        raise AttributeError(name)
+        raise AttributeError, name
     
     def __len__(self):
         return self._length
@@ -55,18 +55,20 @@ class DBInternalSeq:
         self.primary_id = primary_id
         self.adaptor = adaptor
 
-        self.name, self.id, _length, self.moltype = \
+        self.name, self.id, _length, self.description, self.moltype = \
                          self.adaptor.execute_one(
             """select en.display_id, en.accession, length(bs.biosequence_str),
-                                      bs.molecule
-                                from bioentry en, biosequence bs
-                                where bs.bioentry_id = en.bioentry_id and
-                                      bs.bioentry_id = %s""",
+                      en.description, bs.alphabet
+               from bioentry en, biosequence bs
+               where bs.bioentry_id = en.bioentry_id and
+                     bs.bioentry_id = %d""",
             (self.primary_id,))
 
         self._length = int(_length)
-
+        
     def __getattr__(self, name):
+        if name[:1] == '_':
+            raise AttributeError, name
         if name == "seq":
             moltype = self.moltype.upper()
             from Bio.Alphabet import IUPAC
@@ -84,23 +86,9 @@ class DBInternalSeq:
             return seq
         f = getattr(self, "_get_" + name, None)
         if f is None:
-            raise AttributeError(name)
+            raise AttributeError, name
         return f()
 
-    def _get_description(self):
-        descr_results = _get_ontology_terms("description", self.primary_id,
-                                            self.adaptor)
-        if len(descr_results) == 0:
-            description = ""
-        elif len(descr_results) == 1:
-            description = descr_results[0]
-        else:
-            raise ValueError("Got multiple unexpected descriptions: %s" %
-                             descr_results)
-
-        self.description = description
-        return description
-
     def __len__(self):
         return self._length
 
@@ -110,10 +98,13 @@ def _get_ontology_terms(ontology_name, b
     sql = r"SELECT ontology_term_id FROM ontology_term " \
           r"WHERE term_name = %s" 
     id_info = adaptor.execute_and_fetchall(sql, (ontology_name,))
+    if id_info is None:
+        return None
+    
     ontology_id = id_info[0][0]
 
     sql = r"SELECT qualifier_value FROM bioentry_qualifier_value " \
-          r"WHERE bioentry_id = %s AND ontology_term_id = %s"
+          r"WHERE bioentry_id = %d AND ontology_term_id = %s"
     values = adaptor.execute_and_fetch_col0(sql, (bioentry_id,
                                                   ontology_id))
     return values
@@ -190,7 +181,7 @@ class Species:
         elif name == "genus":
             return self.classification[1]
 
-        raise AttributeError(name)
+        raise AttributeError, name
 
     def __setattr__(self, name, val):
         if name == "species":
@@ -212,9 +203,11 @@ class Annotation:
         self.primary_id = primary_id
 
     def __getattr__(self, name):
+        if name[:1] == '_':
+            raise AttributeError, name
         f = getattr(self, "_get_" + name, None)
         if f is None:
-            raise AttributeError(name)
+            raise AttributeError, name
         return f()
     
     # functions to make this more like a dictionary
@@ -222,7 +215,7 @@ class Annotation:
         if key in ["comments", "dblinks", "references"]:
             return getattr(self, key)
         else:
-            raise KeyError("Unexpected item: %s")
+            raise KeyError("Unexpected item: %s" % key)
 
     def has_key(self, key):
         if key in ["comments", "dblinks", "references"]:
@@ -236,7 +229,7 @@ class Annotation:
 
     def _get_comments(self):
         comments = self.adaptor.execute_and_fetch_col0(
-            """select comment_text from comment where bioentry_id = %s""",
+            """select comment_text from comment where bioentry_id = %d""",
             (self.primary_id,))
         self.comments = comments
         return comments
@@ -244,7 +237,7 @@ class Annotation:
     def _get_dblinks(self):
         dblink_info = self.adaptor.execute_and_fetchall(
             """select dbname, accession from bioentry_direct_links
-                    where source_bioentry_id = %s""",
+                    where source_bioentry_id = %d""",
             (self.primary_id,))
         dblinks =  [DBLink(database, primary_id) for (database, primary_id)
                                                      in dblink_info]
@@ -255,7 +248,7 @@ class Annotation:
         results = self.adaptor.execute_and_fetchall(
             """select reference_id, reference_start,reference_end
                        from bioentry_reference
-                       where bioentry_id = %s
+                       where bioentry_id = %d
                        order by reference_rank""",
             (self.primary_id,))
 
@@ -284,8 +277,8 @@ def load_seq_features(adaptor, primary_i
     from Bio import SeqFeature
     
     # Get the seqfeature id list
-    sql = r"SELECT seqfeature_id, seqfeature_rank, seqfeature_key_id " \
-          r"FROM seqfeature WHERE bioentry_id = %s"
+    sql = r"SELECT seqfeature_id, seqfeature_rank, ontology_term_id " \
+          r"FROM seqfeature WHERE bioentry_id = %d"
     results = adaptor.execute_and_fetchall(sql, (primary_id,))
 
     seq_feature_list = []
@@ -324,9 +317,9 @@ def load_seq_features(adaptor, primary_i
 
         # Get any remote reference information
         remote_results = adaptor.execute_and_fetchall("""
-          SELECT rem.seqfeature_location_id, rem.accession, rem.version
-            FROM remote_seqfeature_name rem, seqfeature_location sfl
-            WHERE rem.seqfeature_location_id = sfl.seqfeature_location_id AND
+          SELECT seqfeature_location_id, accession, version
+            FROM seqfeature_location sfl, dbxref drf
+            WHERE drf.dbxref_id = sfl.dbxref_id AND
                   sfl.seqfeature_id = %s""",
                                                       (seqfeature_id,))
         # Do the merge locally
@@ -387,10 +380,10 @@ class DBSeqRecord:
 
         self.version, _length, self.division = \
                       self.adaptor.execute_one(
-            """select en.entry_version, length(bs.biosequence_str), en.division
+            """select en.entry_version, length(bs.biosequence_str), bs.division
                     from bioentry en, biosequence bs
                     where bs.bioentry_id = en.bioentry_id and
-                          bs.bioentry_id = %s""",
+                          bs.bioentry_id = %d""",
             (self.primary_id,))
         self._length = int(_length)
         
@@ -399,12 +392,12 @@ class DBSeqRecord:
 
     def __getattr__(self, name):
         if name[:1] == "_":
-            raise AttributeError(name)
+            raise AttributeError, name
         if name in self._forward_getattr:
             return getattr(self.primary_seq, name)
         f = getattr(self, "_get_" + name, None)
         if f is None:
-            raise AttributeError(name)
+            raise AttributeError, name
         return f()
 
     def _get_primary_seq(self):
@@ -430,14 +423,14 @@ class DBSeqRecord:
    
     def _get_dates(self):
         self.dates = _get_ontology_terms("date", self.primary_id, self.adaptor)
-        return dates
+        return self.dates
 
     def _get_species(self):
         full_lineage, common_name = self.adaptor.execute_one(
             """select tx.full_lineage, tx.common_name
-                           from taxa tx, bioentry_taxa bt
-                           where tx.taxa_id = bt.taxa_id and
-                                 bt.bioentry_id = %s""",
+                           from taxon tx, bioentry be
+                           where tx.taxon_id = be.taxon_id and
+                                 be.bioentry_id = %d""",
             (self.primary_id,))
         terms = full_lineage.split(":")
         species = Species(terms, common_name)
@@ -448,9 +441,7 @@ class DBSeqRecord:
         return version
 
     def _get_keywords(self):
-        keywords = self.adaptor.execute_and_fetch_col0(
-            """select keywords from bioentry_keywords
-                               where bioentry_id = %s""",
-            (self.primary_id,))
+        keywords = _get_ontology_term('Keywords', self.primary_id,
+                                      self.adaptor)
         self.keywords = keywords
         return keywords
Index: BioSQL/BioSeqDatabase.py
===================================================================
RCS file: /home/repository/biopython/biopython/BioSQL/BioSeqDatabase.py,v
retrieving revision 1.11
diff -u -p -r1.11 BioSeqDatabase.py
--- BioSQL/BioSeqDatabase.py	2002/11/20 15:38:11	1.11
+++ BioSQL/BioSeqDatabase.py	2002/11/22 22:13:21
@@ -58,11 +58,12 @@ def open_database(driver = "MySQLdb", *a
     return DBServer(conn, module)
 
 class DBServer:
-    def __init__(self, conn, module):
+    def __init__(self, conn, module, module_name = None):
         self.module = module
-        if module.__name__ == 'psycopg':
+        if module_name is None: module_name = module.__name__
+        if module_name == 'psycopg':
             create_dbutils = DBUtils.create_Pg_dbutils
-        elif module.__name__ == 'MySQLdb':
+        elif module_name == 'MySQLdb':
             create_dbutils = DBUtils.create_Mysql_dbutils
         else:
             create_dbutils = DBUtils.create_Generic_dbutils
@@ -123,7 +124,7 @@ class Adaptor:
     def __init__(self, conn, create_dbutils):
         self.conn = conn
         self.cursor = conn.cursor()
-        self.dbutils = create_dbutils()##self.conn, self.cursor)
+        self.dbutils = create_dbutils()
 
     def last_id(self, table):
         return self.dbutils.last_id(self.cursor, table)
@@ -131,6 +132,12 @@ class Adaptor:
     def autocommit(self, y = 1):
         return self.dbutils.autocommit(self.conn, y)
 
+    def commit(self):
+        return self.conn.commit()
+
+    def rollback(self):
+        return self.conn.rollback()
+
     def fetch_dbid_by_dbname(self, dbname):
         self.cursor.execute(
             r"select biodatabase_id from biodatabase where name = %s",
@@ -138,14 +145,17 @@ class Adaptor:
         rv = self.cursor.fetchall()
         if not rv:
             raise KeyError("Cannot find biodatabase with name %r" % dbname)
-        assert len(rv) == 1, "More than one biodatabase with name %r" % dbname
+        # Cannot happen (UK)
+##        assert len(rv) == 1, "More than one biodatabase with name %r" % dbname
         return rv[0][0]
 
     def fetch_seqid_by_display_id(self, dbid, name):
-        self.cursor.execute(
-            r"select bioentry_id from bioentry where "
-            r"    biodatabase_id = %s and display_id = %s",
-            (dbid, name))
+        sql = r"select bioentry_id from bioentry where display_id = %s"
+        fields = [name]
+        if dbid:
+            sql += " and biodatabase_id = %d"
+            fields.append(dbid)
+        self.cursor.execute(sql, fields)
         rv = self.cursor.fetchall()
         if not rv:
             raise IndexError("Cannot find display id %r" % name)
@@ -153,13 +163,17 @@ class Adaptor:
         return rv[0][0]
 
     def fetch_seqid_by_accession(self, dbid, name):
-        self.cursor.execute(
-            r"select bioentry_id from bioentry where "
-            r"    biodatabase_id = %s and accession = %s",
-            (dbid, name))
+        sql = r"select bioentry_id from bioentry where accession = %s"
+        fields = [name]
+        if dbid:
+            sql += " and biodatabase_id = %d"
+            fields.append(dbid)
+
+        self.cursor.execute(sql, fields)
         rv = self.cursor.fetchall()
         if not rv:
             raise IndexError("Cannot find accession %r" % name)
+        # Can happen: several versions (or biodatabases)
         assert len(rv) == 1, "More than one entry with accession of %r" % name
         return rv[0][0]
 
@@ -203,6 +217,8 @@ class Adaptor:
     def execute(self, sql, args):
         """Just execute an sql command.
         """
+##        print "sql:", `sql`
+##        print "args:", `args`
         self.cursor.execute(sql, args)
 
     def get_subseq_as_string(self, seqid, start, end):
Index: BioSQL/Loader.py
===================================================================
RCS file: /home/repository/biopython/biopython/BioSQL/Loader.py,v
retrieving revision 1.9
diff -u -p -r1.9 Loader.py
--- BioSQL/Loader.py	2002/11/20 15:38:11	1.9
+++ BioSQL/Loader.py	2002/11/22 22:13:21
@@ -11,6 +11,8 @@ from time import gmtime, strftime
 # biopython
 from Bio import Alphabet
 
+from Bio.crc import crc64
+
 class DatabaseLoader:
     """Load a database with biopython objects.
     """
@@ -27,40 +29,135 @@ class DatabaseLoader:
         """
         bioentry_id = self._load_bioentry_table(record)
         self._load_bioentry_date(record, bioentry_id)
-        # self._load_bioentry_taxa(record, bioentry_id)
         self._load_biosequence(record, bioentry_id)
-        self._load_bioentry_description(record, bioentry_id)
+        self._load_comment(record, bioentry_id)
+        references = record.annotations.get('references', ())
+        for reference, rank in zip(references, range(len(references))):
+            self._load_reference(reference, rank, bioentry_id)
         for seq_feature_num in range(len(record.features)):
             seq_feature = record.features[seq_feature_num]
             self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
 
-    def _get_ontology_id(self, term_name, term_description = ""):
+    def _get_ontology_id(self,
+                         term_name,
+                         term_description = None,
+                         term_identifier = None,
+                         category_id = 0):
         """Get the id that corresponds to any term in an ontology.
 
         This looks through the ontology table for a the given term. If it
         is not found, a new id corresponding to this ontology is created.
         In either case, the id corresponding to that term is returned, so
         that you can reference it in another table.
+
+        The category_id can be needed to disambiguate the term:
+        it will be used if != 0.
         """
+
         # try to get the ontology term
         sql = r"SELECT ontology_term_id FROM ontology_term " \
               r"WHERE term_name = %s"
-        id_results = self.adaptor.execute_and_fetchall(sql, (term_name,))
+        fields = [term_name]
+        if category_id != 0:            # 'None' is legitimate
+            sql += ' AND category_id '
+            if category_id is None:
+                sql += 'IS NULL'
+            else:
+                sql += '= %d'
+                fields.append(category_id)
+        id_results = self.adaptor.execute_and_fetchall(sql, fields)
         # something is wrong
         if len(id_results) > 1:
             raise ValueError("Multiple ontology ids for %s: %s" % 
-                             term_name, id_results)
+                             (term_name, id_results))
         # we already have the ontology term inserted
         elif len(id_results) == 1:
             return id_results[0][0]
         # we need to create it
         else:
-            sql = r"INSERT INTO ontology_term (term_name, term_definition)" \
-                  r"VALUES (%s, %s)"
-            self.adaptor.execute(sql, (term_name, term_description))
-            # recursively call this to give back the id
-            return self._get_ontology_id(term_name, term_description)
+            # If no category_id specified, set it to null, as 0 isn't possible
+            if category_id == 0: category_id = None
+            
+            sql = r"INSERT INTO ontology_term (term_name, term_definition," \
+                  r" term_identifier, category_id)" \
+                  r" VALUES (%s, %s, %s, %d)"
+            self.adaptor.execute(sql, (term_name, term_description,
+                                       term_identifier, category_id))
+            return self.adaptor.last_id('ontology_term')
    
+    def _get_taxon_id(self, record):
+        """Get the id corresponding to a taxon.
+
+        If the species isn't in the taxon table, it is created.
+        
+        The code to find the species in the record is brittle.
+        """
+        # Binomial and full lineage
+        try:
+            binomial = record.annotations["organism"]
+        except KeyError:
+            binomial = None
+
+        # XXX no variant
+        variant = '-'
+
+        if binomial and variant:
+            sql = "SELECT taxon_id FROM taxon WHERE binomial = %s" \
+                  " AND variant = %s"
+            taxa = self.adaptor.execute_and_fetchall(sql, (binomial, variant))
+            if taxa:
+                return taxa[0][0]
+
+        # Didn't find the binomial/variant... Let's try with the taxon id
+        ncbi_taxon_id = None
+        for f in record.features:
+            if (f.type == 'source' and getattr(f, 'qualifiers', None)
+                and f.qualifiers.has_key('db_xref')):
+                for db_xref in f.qualifiers['db_xref']:
+                    if db_xref[:6] == 'taxon:':
+                        ncbi_taxon_id = int(db_xref[6:])
+                        break
+            if ncbi_taxon_id: break
+
+        if ncbi_taxon_id:
+            sql = "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %u"
+            taxa = self.adaptor.execute_and_fetchall(sql, (ncbi_taxon_id,))
+            if taxa:
+                return taxa[0][0]
+
+        # OK, so we're gonna try to insert the taxon
+        
+        # Common name
+        try:
+            common_name = record.annotations["source"]
+        except KeyError:
+            common_name = None
+
+        # Full lineage
+        try:
+            full_lineage = record.annotations["taxonomy"]
+            ante, last = binomial.split()
+            if full_lineage[-1] == ante:
+                full_lineage.append(last)
+            full_lineage.reverse()
+            full_lineage = ':'.join(full_lineage)
+        except KeyError:
+            full_lineage = None
+
+        # Check for the NON NULLs
+        if binomial == None or variant == None or full_lineage == None:
+            return
+        
+        # Insert into the taxon table
+        sql = "INSERT INTO taxon (binomial, variant, common_name," \
+              " ncbi_taxon_id, full_lineage)" \
+              " VALUES (%s, %s, %s, %d, %s)"
+        self.adaptor.execute(sql, (binomial, variant, common_name,
+                                   ncbi_taxon_id, full_lineage))
+        taxon_id = self.adaptor.last_id('taxon')
+
+        return taxon_id
+
     def _load_bioentry_table(self, record):
         """Fill the bioentry table with sequence information.
         """
@@ -68,18 +165,21 @@ class DatabaseLoader:
         
         if record.id.find('.') >= 0: # try to get a version from the id
             accession, version = record.id.split('.')
-        else: # otherwise just use a null version
+            version = int(version)
+        else: # otherwise just use a version of 0
             accession = record.id
             version = 0
-        try:
-            division = record.annotations["data_file_divison"]
-        except KeyError:
-            division = "No"
-        sql = r"INSERT INTO bioentry (biodatabase_id, display_id, " \
-              r"accession, entry_version, division) VALUES" \
-              r" (%s, %s, %s, %s, %s)"
-        self.adaptor.execute(sql, (self.dbid, record.name, 
-                                   accession, version, division))
+            
+        taxon_id = self._get_taxon_id(record)
+        identifier = record.annotations.get('gi')
+        description = getattr(record, 'description', None)
+        
+        sql = r"INSERT INTO bioentry (biodatabase_id, taxon_id, display_id, " \
+              r"accession, identifier, description, entry_version) VALUES" \
+              r" (%d, %d, %s, %s, %s, %s, %d)"
+        self.adaptor.execute(sql, (self.dbid, taxon_id, record.name, 
+                                   accession, identifier, description,
+                                   version))
         # now retrieve the id for the bioentry
         bioentry_id = self.adaptor.last_id('bioentry')
 
@@ -100,46 +200,83 @@ class DatabaseLoader:
               r" (%s, %s, %s)" 
         self.adaptor.execute(sql, (bioentry_id, date_id, date))
 
-    def _load_bioentry_taxa(self, record, bioentry_id):
-        """Add taxa information to the database.
-        """
-        return None # XXX don't do anything right now
-        try:
-            # XXX this isn't right, we need taxa ids and other junk
-            taxa = record.annotations["taxa"]
-            sql = r"INSERT INTO bioentry_taxa(bioentry_id, taxa_id) VALUES" \
-                  r" (%s, %s)" 
-            self.adapter.execute(sql, (bioentry_id, taxa))
-        except KeyError:
-            pass
-
     def _load_biosequence(self, record, bioentry_id):
         """Load the biosequence table in the database.
         """
         accession, version = record.id.split(".")
+        version = int(version)
         # determine the string representation of the alphabet
         if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet):
-            alphabet = "DNA"
+            alphabet = "dna"
         elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet):
-            alphabet = "RNA"
+            alphabet = "rna"
         elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet):
-            alphabet = "PROTEIN"
+            alphabet = "protein"
         else:
-            alphabet = "UNKNOWN"
+            alphabet = "unknown"
         
-        sql = r"INSERT INTO biosequence (bioentry_id, seq_version, " \
-              r"biosequence_str, molecule) VALUES (%s, %s, %s, %s)"
-        self.adaptor.execute(sql, (bioentry_id, version, record.seq.data,
-                                   alphabet))
-
-    def _load_bioentry_description(self, record, bioentry_id):
-        """Load the description table.
-        """
-        descr_id = self._get_ontology_id("description", "Sequence description")
-        sql = r"INSERT INTO bioentry_qualifier_value VALUES (%s, %s, %s)"
-        self.adaptor.execute(sql, (bioentry_id, descr_id, 
-                                   record.description))
+        try:
+            division = record.annotations["data_file_division"]
+        except KeyError:
+            division = "UNK"
 
+        sql = r"INSERT INTO biosequence (bioentry_id, seq_version, " \
+              r"seq_length, biosequence_str, alphabet, division) " \
+              r"VALUES (%d, %d, %d, %s, %s, %s)"
+        self.adaptor.execute(sql, (bioentry_id, version,
+                                   len(record.seq.data),
+                                   record.seq.data,
+                                   alphabet, division))
+
+    def _load_comment(self, record, bioentry_id):
+        # Assume annotations['comment'] is not a list
+        comment = record.annotations.get('comment')
+        if not comment:
+            return
+        comment = comment.replace('\n', ' ')
+        
+        sql = "INSERT INTO comment (bioentry_id, comment_text, comment_rank)" \
+              " VALUES (%d, %s, %d)"
+        self.adaptor.execute(sql, (bioentry_id, comment, 1))
+        
+    def _load_reference(self, reference, rank, bioentry_id):
+        # Currently, the unique key (UK) is either the medline_id or a CRC64
+        if reference.medline_id:
+            uk = reference.medline_id
+        else:
+            s = ''
+            for f in reference.authors, reference.title, reference.journal:
+                if f: s += f
+                else: s += "<undef>"
+            uk = crc64(s)
+
+        sql = "SELECT reference_id FROM reference WHERE reference_medline = %s"
+        refs = self.adaptor.execute_and_fetch_col0(sql, (uk,))
+        if not len(refs):
+            authors = reference.authors or None
+            title =  reference.title or None
+            journal = reference.journal or None
+            sql = "INSERT INTO reference (reference_location," \
+                  " reference_title, reference_authors, reference_medline)" \
+                  " VALUES (%s, %s, %s, %s)"
+            self.adaptor.execute(sql, (journal, title,
+                                   authors, uk))
+            reference_id = self.adaptor.last_id('reference')
+        else:
+            reference_id = refs[0]
+        if len(reference.location):
+            start = 1 + int(str(reference.location[0].start))
+            end = int(str(reference.location[0].end))
+        else:
+            start = None
+            end = None
+        
+        sql = "INSERT INTO bioentry_reference (bioentry_id, reference_id," \
+              " reference_start, reference_end, reference_rank)" \
+              " VALUES (%d, %d, %d, %d, %d)"
+        self.adaptor.execute(sql, (bioentry_id, reference_id,
+                                   start, end, rank + 1))
+        
     def _load_seqfeature(self, feature, feature_rank, bioentry_id):
         """Load a biopython SeqFeature into the database.
         """
@@ -154,13 +291,20 @@ class DatabaseLoader:
         This loads the "key" of the seqfeature (ie. CDS, gene) and
         the basic seqfeature table itself.
         """
-        seqfeature_key_id = self._get_ontology_id(feature_type)
+        category_id = self._get_ontology_id('SeqFeature Keys')
+        seqfeature_key_id = self._get_ontology_id(feature_type,
+                                                  category_id = category_id)
+        
+        # XXX source is always EMBL/GenBank/SwissProt here; it should depend on
+        # the record
+        source_cat_id = self._get_ontology_id('SeqFeature Sources')
+        source_id = self._get_ontology_id('EMBL/GenBank/SwissProt',
+                                          category_id = source_cat_id)
         
-        # XXX This doesn't do source yet, since I'm not sure I understand it.
-        sql = r"INSERT INTO seqfeature (bioentry_id, seqfeature_key_id, " \
-              r"seqfeature_rank) VALUES (%s, %s, %s)"
+        sql = r"INSERT INTO seqfeature (bioentry_id, ontology_term_id, " \
+              r"seqfeature_source_id, seqfeature_rank) VALUES (%d, %d, %d, %d)"
         self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id,
-                                   feature_rank))
+                                   source_id, feature_rank + 1))
         seqfeature_id = self.adaptor.last_id('seqfeature')
 
         return seqfeature_id
@@ -193,7 +337,7 @@ class DatabaseLoader:
         """
         sql = r"INSERT INTO seqfeature_location (seqfeature_id, " \
               r"seq_start, seq_end, seq_strand, location_rank) " \
-               r"VALUES (%s, %s, %s, %s, %s)"
+               r"VALUES (%d, %d, %d, %d, %d)"
 
         # hack for NOT NULL in strand -- we have None be the same as 0
         # for strand information
@@ -208,7 +352,7 @@ class DatabaseLoader:
         start = feature.location.nofuzzy_start + 1
         end = feature.location.nofuzzy_end 
             
-        self.adaptor.execute(sql, (seqfeature_id, start, end, strand, rank))
+        self.adaptor.execute(sql, (seqfeature_id, start, end, strand, rank+1))
 
     def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
         """Insert the (key, value) pair qualifiers relating to a feature.
@@ -216,16 +360,18 @@ class DatabaseLoader:
         Qualifiers should be a dictionary of the form:
             {key : [value1, value2]}
         """
+        tag_category_id = self._get_ontology_id('Annotation Tags')
         for qualifier_key in qualifiers.keys():
-            qualifier_key_id = self._get_ontology_id(qualifier_key)
+            qualifier_key_id = self._get_ontology_id(qualifier_key,
+                                                     category_id = tag_category_id)
 
             # now add all of the values to their table
             for qual_value_rank in range(len(qualifiers[qualifier_key])):
                 qualifier_value = qualifiers[qualifier_key][qual_value_rank]
                 sql = r"INSERT INTO seqfeature_qualifier_value VALUES" \
-                      r" (%s, %s, %s, %s)"
+                      r" (%d, %d, %d, %s)"
                 self.adaptor.execute(sql, (seqfeature_id,
-                  qualifier_key_id, qual_value_rank, qualifier_value))
+                  qualifier_key_id, qual_value_rank + 1, qualifier_value))
        
 class DatabaseRemover:
     """Complement the Loader functionality by fully removing a database.