[BioSQL-l] Re: Postgres version support

Jason E. Stewart jason@openinformatics.com
22 Oct 2002 10:55:17 -0600


--=-=-=

"Hilmar Lapp" <hlapp@gnf.org> writes:

> The MAGEstk folks in fact came up with a XML-schema definition for
> describing a schema, and a corresponding converter to DDL of
> particular flavors (which I believe so far is Pg only, not
> sure). Jason, correct me if this description is too sloppy.

Hey all,

The definition was designed for Genex, and is currently DTD-based
(although the DTD is so simple, it could be turned into a schema with
no effort). 

MAGEstk uses that format when it converts a UML model into relational
DB representation.

There is a utility method, xml2sql(), that takes a set of table files
and converts them into SQL DDL for the DB backend currently in use -
to support multiple backends we would simply pass in an active DB
handle and switch on the DB driver type.

Yes, currently Pg is the only supported backend - primarily because we
needed sequences and views for Genex, and MySQL didn't have them, but
now we also use inheritance as well. 

The code in xml2sql is pretty simple, but at the moment it isn't very
generic. I'm happy to have other projects use it, and I'd be thrilled
if it became a supported part of BioPerl. I'd be happy to help make it
more generic and maintain it.

I'll include the DTD, and example table file, and the xml2sql()
subroutine from the Genex project.

Cheers,
jas.
--


--=-=-=
Content-Disposition: attachment; filename=xml2sql
Content-Description: xml2sql

=item ($sql,\@all_tables,\@audit_tables,\@all_views = xml2sql(@file_name_list);

Description: A method for converting an XML description of a relation
DB table (using table.dtd) into SQL

Return Value: a list of values:
   $sql = SQL string for creating the table
   \@all_tables   = array ref of table names to be created
   \@audit_tables = array ref of table names with audit_fks
   \@all_views    = array ref of view  names to be created

Errors: Will call die() on error

=cut

sub xml2sql {
  my (@files,$MASTER_SEQ) = @_;
  $MASTER_SEQ = 'GENEX_ID_SEQ' unless defined $MASTER_SEQ;

  my @defer_fkeys;
  my @defer_views;
  my @all_views;
  my @all_tables;
  my %audit_tables;
  my $output;
  my $parser = XML::Xerces::DOMParser->new();
  my %docs;
  my %inherit_tables;

  # we pre-parse all the files and save their docs
  foreach my $file (@files) {
    error(caller=>join(':',(caller())[1,2]),
	  message=>"couldn't open $file for input")
      unless -f $file;

    my $ERROR_HANDLER = XML::Xerces::PerlErrorHandler->new();
    $parser->setErrorHandler($ERROR_HANDLER);
    $parser->parse($file);
    my $doc = $parser->getDocument();
    $file =~ s|.*/||;
    $file =~ s|\.xml||;
    $docs{$file} = $doc;
  }


  # Since we use Postgres' inheritance system, we have to order
  # the tables so that child tables get created after parent tables.

  # Step one is to find all the tables which do not inherit and add
  # them to a tmp list, and file away those that do inherit in a
  # dependancy list hashed on who they inherit from

  my @inheritance_tmp;
  my %depends;
  foreach my $file (keys %docs) {
    my $doc = $docs{$file};
    my $table = $doc->getDocumentElement();
    my $inherits_from = $table->getAttribute('inherits_from')
      or die "Couldn't get inheritance for table: $file";

    if ($inherits_from eq 'none') {
      # we don't inherit from anything, so order is irrelevant
      push (@inheritance_tmp,$file);
    } else {
      # store the table as a dependancy of the table it inherits from
      push(@{$depends{uc($inherits_from)}},$file);

      # keep track of all tables that are inherited from
      $inherit_tables{$inherits_from}++;
    }
  }

  # now we go through the tmp list, and add the table, and
  # all tables that inherited from it
  my @inheritance_set;
  foreach my $file (@inheritance_tmp) {
    push(@inheritance_set,$docs{$file});
    foreach my $dep_file (@{$depends{uc($file)}}) {
      push(@inheritance_set,$docs{$dep_file});
    }
  }

    # Create a master sequence for all records in the db. The sequence
    # generator itself is an 8-byte integer, with a max value of
    # roughly 9 quintillion, but in PostgreSQL, v.7.1.* use datatype
    # integer for storing the value, which is 4-byte, with a limit of
    # roughly 2 billion.  PostgreSQL v.7.2.* uses the datatype
    # biginteger, which is 8-byte, and thus meets the same 9
    # quintillion limit.

    $output .= <<SQL;
CREATE SEQUENCE "$MASTER_SEQ";
SQL

  foreach my $doc (@inheritance_set) {
    my $table = $doc->getDocumentElement();
    my $table_name = $table->getAttribute('name')
      or die "Couldn't get name for table: " . $table->serialize;

    push(@all_tables,$table_name);
    my @column_sql;
    my $has_ro_groupname = 0;
    my @columns;
    my $inherits_from = $table->getAttribute('inherits_from')
      or die "Couldn't get inheritance for table: " . $table->serialize;

    if ($table->getAttribute('type') eq 'VIEW') {
      @columns = $table->getElementsByTagName('column');
      my %from;
      foreach my $column (@columns) {
	my %attrs = $column->getAttributes();
	$has_ro_groupname = 1
	  if $attrs{name} eq 'ro_groupname';
	die "No name for column in $table_name" 
	  unless exists $attrs{name};
	die "No source_table for column $attrs{name} in VIEW $table_name" 
	  unless exists $attrs{source_table};
	push(@column_sql,qq[\t$attrs{source_table}."$attrs{name}"]);
	$from{$attrs{source_table}}++;
      }
      my $column_sql = join(",\n", @column_sql);
      my $from = join(", ", keys %from);
      my $where = $table->getAttribute('where');
      my $view .= <<SQL;
CREATE VIEW $table_name AS
SELECT $column_sql
FROM $from
WHERE $where;
SQL
      # save the view data to be added to the output after all tables
      # are created
      push(@defer_views,$view);

      # save the view name to return to the caller
      push(@all_views,$table_name);
    } else {
      @columns = $table->getElementsByTagName('column');
      foreach my $column (@columns) {
	my %attrs = $column->getAttributes();
	die "No name for column in $table_name" 
	  unless exists $attrs{name};
	die "No type for column $attrs{name} in $table_name"
	  unless exists $attrs{type};
	my $null = '';
	$null = 'NOT NULL' if $attrs{not_null} eq 'true';
	push(@column_sql,qq[\t"$attrs{name}"\t$attrs{type} $null]);
	$has_ro_groupname = 1
	  if $attrs{name} eq 'ro_groupname';

	# we add the table to the Audit hash if it has an audit fkey
	$audit_tables{$table_name}++
	  if $attrs{name} eq 'audit_fk' and $table_name ne 'Audit';

	# we add the table to the Audit hash if it
	# inherits from a table with an audit fkey
	$audit_tables{$table_name}++
	  if exists $audit_tables{$inherits_from};

      }
      # do the primary key
      my ($pkey) = $table->getElementsByTagName('primary_key');
      if (defined $pkey) {
	my %attrs = $pkey->getAttributes();
	die "No column_id for pkey in $table_name"
	  unless exists $attrs{column_id};
	push(@column_sql,"\tPRIMARY KEY ($attrs{column_id})");
      }

      ###
      ### PostgreSQL inheritance cannot see child table IDs when linking,
      ### so we have to be cautious when using foreign key constraints.
      ### We only allow constraints to tables that aren't inherited from
      ###

      my @fkeys = $table->getElementsByTagName('foreign_key');
      foreach my $column (@fkeys) {
	my %attrs = $column->getAttributes();

	# don't include fkeys to tables which are inherited from
	next if $inherit_tables{$attrs{foreign_table}};

	# some fkeys refer to views which, not being tables, cannot
	# have FOREIGN KEY clauses which point to them
	next if $attrs{write_sql} eq 'false';

	die "No column_id for fkey in $table_name"
	  unless exists $attrs{column_id};
	die "No foreign_table for fkey $attrs{column_id} in $table_name"
	  unless exists $attrs{foreign_table};
	die "No foreign_table_pkey for fkey $attrs{column_id} in $table_name"
	  unless exists $attrs{foreign_table_pkey};
	# split on whitespace, join on commas
	my $constraint = "$ {table_name}_$attrs{column_id}";
	my $fkey = <<SQL;
ALTER TABLE $table_name ADD CONSTRAINT $constraint
   FOREIGN KEY ($attrs{column_id})
   REFERENCES $attrs{foreign_table} ($attrs{foreign_table_pkey});
SQL
	# we store the fkey for later, after all the tables are created
	# that way, we don't have to worry about creation order
	push(@defer_fkeys,$fkey);
      }

      my @uniques = $table->getElementsByTagName('unique');
      foreach my $column (@uniques) {
	my %attrs = $column->getAttributes();
	die "No column_ids for unique in $table_name"
	  unless exists $attrs{column_ids};
	# split on whitespace, join on commas
	my $string = join(', ',split(' ',$attrs{column_ids}));
	push(@column_sql,"\tUNIQUE ($string)");
      }
      my $column_sql = join(",\n", @column_sql);
      $output .= <<SQL;
CREATE TABLE $table_name (
$column_sql
) 
SQL

      $output .= qq[INHERITS ($inherits_from)]
	if ($inherits_from ne 'none');
      $output .= ";\n";
    } # if VIEW

    # if this table or view has security the selectable view we create
    # will enforce it in the where clause, otherwise it's public
    my $WHERE = '';
    my $from = qq[$table_name];
    if ($has_ro_groupname) {
      $WHERE = 'WHERE ro_groupname=GroupLink.groupname AND GroupLink.username=user()';
      $from .= ', GroupLink'
	unless $table_name eq 'GroupLink';
    }
    my @inherited_columns;
    if ($inherits_from ne 'none') {
      @inherited_columns = $docs{$inherits_from}->getElementsByTagName('column');
    }
    my $columns = join(",\n", map {$table_name . '."' . $_->getAttribute('name'). '"'}
		       @columns, @inherited_columns);
    my $view = "$ {table_name}_view";
    my $sql = <<SQL;
CREATE VIEW $view AS
SELECT $columns
FROM $from
$WHERE;

GRANT SELECT ON $view TO PUBLIC;
SQL
    # we defer creation of the view until after all tables are created
    # this way we don't need to know when GroupLink is created
    push(@defer_views,$sql);

    # save the view name to return to the caller
    push(@all_views,$view);


    # reset the list
    @column_sql = ();
    my @indices = $table->getElementsByTagName('index');
    foreach my $index (@indices) {
      my %attrs = $index->getAttributes();
      die "No name for index in $table_name"
	unless exists $attrs{name};
      die "No column_id for index $attrs{name} in $table_name"
	unless exists $attrs{column_id};
      push(@column_sql,qq[CREATE INDEX $attrs{name} on $table_name ( "$attrs{column_id}" );]);
    }

    # now we only grant priveleges to genex
    push(@column_sql,qq[GRANT ALL on $table_name to genex;])
      unless $table->getAttribute('type') eq 'VIEW';

    $output .= join("\n", @column_sql) . "\n";
  }

  # now we add on all the pieces that needed to wait until all tables
  # were created
  $output .= join("\n",@defer_views,@defer_fkeys);
  return ($output,\@all_tables,[keys %audit_tables],\@all_views);
}


--=-=-=
Content-Disposition: attachment; filename=table.dtd
Content-Description: table.dtd

<!-- ====================================== -->
<!-- RDBMS Table Definition DTD (table.dtd)   -->
<!-- ====================================== -->
<!-- Copyright 2001-2002 Jason E. Stewart
     All rights reserved                                           -->
<!--                    Table Type Entities                        -->
<!ENTITY data_table    "DATA"                                        >
<!ENTITY validation_table
                       "VALIDATION"                                  >
<!ENTITY subset_table  "SUBSET"                                      >
<!ENTITY linking_table "LINKING"                                     >
<!ENTITY system_table  "SYSTEM"                                      >
<!ENTITY view          "VIEW"                                        >

<!--                    Foreign Key Type Entities                  -->
<!ENTITY fkey_linking  "LINKING_TABLE"                               >
<!ENTITY fkey_lookup   "LOOKUP_TABLE"                                >
<!ENTITY fkey_oto      "ONE_TO_ONE"                                  >
<!ENTITY fkey_mto      "MANY_TO_ONE"                                 >

<!ELEMENT  table        (column|
                         unique|
                         index|
                         linking_keys|
                         foreign_key|
                         primary_key)*                               >
<!ATTLIST  table
             type       CDATA                              #REQUIRED
             name       CDATA                              #REQUIRED
             comment    CDATA                              #IMPLIED
             where      CDATA                              #IMPLIED
             inherits_from
                        CDATA                              "none"
             can_self_reference
                        (true|false)                       "false"   
             is_abstract
                        (true|false)                       "false"   >

<!ELEMENT  column       EMPTY                                        >
<!ATTLIST  column
             name       ID                                 #REQUIRED
             full_name  CDATA                              #REQUIRED
             type       CDATA                              #REQUIRED
             comment    CDATA                              #IMPLIED
             not_null   (true|false)                       "false"
             source_table
                        CDATA                              #IMPLIED  >

<!ELEMENT  unique       EMPTY                                        >
<!ATTLIST  unique
             column_ids IDREFS                             #REQUIRED >

<!ELEMENT  primary_key  EMPTY                                        >
<!ATTLIST  primary_key
             column_id  IDREF                              #REQUIRED
             serial     (true|false)                       "true"    >

<!-- 
     the write_sql attribute enables us to indicate where table 
     references exist, but we don't want them to be defined by an
     actual FOREIGN KEY constraint in the DB. This will enable the 
     API to have a getter method for this value, but it just wont'
     have a constraint in the DB.

     the can_self_reference attribute is for those cases when a table
     has a foreign key to itself, and that foreign key is permitted 
     to point to the same object. For example, this happens when
     groups are used for setting permissions, and the groups themselves
     have a group that defines the permissions for who can add or delete
     members from the group.

                                                                   -->
<!ELEMENT  foreign_key  EMPTY                                        >
<!ATTLIST  foreign_key
             column_id  IDREF                              #REQUIRED
             foreign_table
                        CDATA                              #REQUIRED
             foreign_table_pkey
                        CDATA                              #REQUIRED
             fkey_type  CDATA                              #REQUIRED 
             can_cascade
                        (true|false)                       "false"   
             write_sql  (true|false)                       "true"
             can_self_reference
                        (true|false)                       "false"   >

<!ELEMENT  linking_keys EMPTY                                        >
<!ATTLIST  linking_keys
             link1      IDREF                              #REQUIRED
             link2      IDREF                              #REQUIRED >

<!ELEMENT  index        EMPTY                                        >
<!ATTLIST  index
             name       CDATA                              #REQUIRED
             column_id  IDREF                              #REQUIRED >

<!--
     Local Variables:
     dtd-xml-flag: t
     End:
-->

--=-=-=
Content-Type: text/xml
Content-Disposition: attachment; filename=PhysicalBioAssay.xml
Content-Description: Physical BioAssay table

<?xml version="1.0" standalone="no"?>
<!DOCTYPE table SYSTEM "../../DTD/table.dtd">
<table name="PhysicalBioAssay"
      type="&data_table;"
      comment=" The PhysicalBioAssay table is used in two related, but
      distinct fashions: to represent the primary data for gene
      expression experiments and to represent processed data such as
      averaged values across replicates or ratio values for treatment
      vs. controls.   In representing the primary data, it describes
      an instance of an ArrayDesign  that has been used to assay gene
      expression levels in a preparation of the mRNA from a sample
      that has been subjected to a specific treatment in the context
      of an experiment.   It contains detailed information as to the
      treatment and protocols used on this ArrayDesign instance. The
      actual expression levels for each spot are stored in the
      PhysicalBioAssaySpots table. In general, the PhysicalBioAssay
      table has the same relationship to the PhysicalBioAssaySpots
      table as the ArrayDesign table has to the  ArrayDesignSpots
      table. However, there several cases in which the  relationship
      breaks down to some extent.   First, there may be
      PhysicalBioAssays for which no ArrayDesign is given. Second, for
      spots that represent controls on a given ArrayDesign, we may
      choose not to store expression levels for the control spots.
      Finally, in cases of ArrayDesigns with replicate spots (i.e. the
      same Reporter is used in different locations on the
      ArrayDesign), replicates will be broken out into separate
      PhysicalBioAssay rows. For example, an ArrayDesign with 1000
      distinct Reporters, each of which has been replicated 2
      times on the ArrayDesign will have 3000 corresponding
      ArrayDesignSpot rows, while a given instance of the ArrayDesign
      will be represented by three distinct PhysicalBioAssay rows,
      each of which has 1000 corresponding PhysicalBioAssaySpots
      rows. A related example is when mutiple channels are used to
      detect the signal of the expression level for each spot. In this
      case, an ArrayDesign that has 1000 distinct Reporters may
      be represented by two or more PhysicalBioAssay rows, each with
      1000 PhysicalBioAssaySpot rows representing the expression level
      reported by one channel.  PhysicalBioAssays that represent
      primary data will be grouped into  sets around the ReplicateDef
      table. This set of primary data may then be used to define
      derived PhysicalBioAssays representing the averaged values
      across members of the replicate set. Further, these
      ReplicateDefs can be used to describe TreatmentDefs which define
      the levels of the treatment and associated control
      ReplicateDefs. The TreatmentDef may reference a third type of
      PhysicalBioAssay that is derived from the ratio between the
      averaged values for the treatment replicates and the averaged
      values of the control replicates."> 
  <column name="pba_pk"
      full_name="Accession Number"
      type="serial"
      comment=""/>
  <column name="name"
      full_name="Array Name"
      type="varchar(128)"
      comment=""/>
  <column name="description"
      full_name="Description"
      type="text"
      comment=" a more verbose description of the purpose of this
      PhysicalBioAssay"/> 
  <column name="es_fk"
      full_name="Primary Experiment Set"
      type="int4"
      not_null="true"
      comment=" the primary experiment in which the PhysicalBioAssay
      was taken (the ExperimentSet in which it was submitted)"/> 
  <foreign_key column_id="es_fk"
        foreign_table      ="ExperimentSet"
        foreign_table_pkey ="es_pk"
        fkey_type          ="&fkey_mto;"/>
  <column name="array_fk"
      full_name="Array"
      not_null="true"
      type="int4"
      comment="The chip or array used with this assay"/> 
  <foreign_key column_id="array_fk"
        foreign_table      ="Array"
        foreign_table_pkey ="array_pk"
        fkey_type          ="&fkey_oto;"/>
  <column name="provider_con_fk"
      full_name="Data Provider"
      type="int4"
      comment=" the researcher that provided the array info it is
      important to destinquish between this and the rw_groupname
      because if data is imported from another DB then the owner of
      the data will not be the individual that submitted it."/>
  <foreign_key column_id="provider_con_fk"
        foreign_table      ="Contact"
        foreign_table_pkey ="con_pk"
        fkey_type          ="&fkey_oto;"/>
  <column name="ro_groupname"
      full_name="Read-Only Group Name"
      not_null="true"
      type="name"
      comment="The group with permission to view this data"/>
  <foreign_key column_id="ro_groupname"
        foreign_table      ="GroupSec"
        foreign_table_pkey ="name"
        fkey_type          ="&fkey_oto;"/>
  <column name="rw_groupname"
      full_name="Read/Write Group Name"
      not_null="true"
      type="name"
      comment="The group with permission to modify this data"/>
  <foreign_key column_id="rw_groupname"
        foreign_table      ="GroupSec"
        foreign_table_pkey ="name"
        fkey_type          ="&fkey_oto;"/>
  <column name="audit_fk"
      full_name="Audit"
      not_null="true"
      type="int4"
      comment="The audit entry when this data was last modified"/>
  <foreign_key column_id="audit_fk"
        foreign_table      ="Audit"
        foreign_table_pkey ="audit_pk"
        fkey_type          ="&fkey_oto;"/>
  <primary_key column_id="pba_pk"/>
</table>

--=-=-=--