<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
<title>BioJavaX</title><link rel="stylesheet" href="html.css" type="text/css"><meta name="generator" content="DocBook XSL Stylesheets V1.69.1"></head><body bgcolor="white" text="black" link="#0000FF" vlink="#840084" alink="#0000FF"><div class="book" lang="en"><div class="titlepage"><div><div><h1 class="title"><a name="d0e1"></a>BioJavaX</h1></div><div><div class="author"><h3 class="author"><span class="firstname">Richard</span> <span class="surname">Holland</span></h3></div></div><div><div class="author"><h3 class="author"><span class="firstname">Mark</span> <span class="surname">Schreiber</span></h3></div></div></div><hr></div><div class="toc"><p><b>Table of Contents</b></p><dl><dt><span class="chapter"><a href="#d0e19">1. BioJavaX is not BioJava 2 is not BioJavaX.</a></span></dt><dt><span class="chapter"><a href="#d0e44">2. What didn't change?</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e47">1. Existing interfaces.</a></span></dt><dt><span class="section"><a href="#d0e61">2. Change listeners.</a></span></dt><dt><span class="section"><a href="#d0e68">3. Event-based file parsing.</a></span></dt></dl></dd><dt><span class="chapter"><a href="#d0e78">3. What did change?</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e81">1. System requirements.</a></span></dt><dt><span class="section"><a href="#d0e86">2. Rich interfaces.</a></span></dt><dt><span class="section"><a href="#d0e293">3. BioSQL persistence.</a></span></dt><dt><span class="section"><a href="#d0e306">4. Better file parsers.</a></span></dt><dt><span class="section"><a href="#d0e317">5. NCBI Taxonomy loader.</a></span></dt><dt><span class="section"><a href="#d0e331">6. Namespaces.</a></span></dt><dt><span class="section"><a href="#d0e336">7. Singletons.</a></span></dt><dt><span class="section"><a href="#d0e353">8. Genetic algorithms.</a></span></dt></dl></dd><dt><span class="chapter"><a href="#d0e358">4. 
Future plans.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e361">1. BioPerl and BioPerl-DB compatibility.</a></span></dt><dt><span class="section"><a href="#d0e374">2. Efficient parsing.</a></span></dt><dt><span class="section"><a href="#d0e381">3. More file formats supported.</a></span></dt><dt><span class="section"><a href="#d0e388">4. Persistence to non-BioSQL databases.</a></span></dt><dt><span class="section"><a href="#d0e397">5. Java 1.5 and Generics.</a></span></dt></dl></dd><dt><span class="chapter"><a href="#d0e408">5. Singletons and the <code class="code">RichObjectFactory</code>.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e414">1. Using <code class="code">RichObjectFactory</code>.</a></span></dt><dt><span class="section"><a href="#d0e540">2. Where the singletons come from.</a></span></dt><dt><span class="section"><a href="#d0e575">3. Hibernate singletons.</a></span></dt><dt><span class="section"><a href="#d0e607">4. Managing the LRU cache.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e616">4.1. Global LRU cache size.</a></span></dt><dt><span class="section"><a href="#d0e623">4.2. Class-specific LRU cache size.</a></span></dt></dl></dd><dt><span class="section"><a href="#d0e630">5. Convenience methods.</a></span></dt><dt><span class="section"><a href="#d0e771">6. Default settings.</a></span></dt></dl></dd><dt><span class="chapter"><a href="#d0e801">6. Working with sequences.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e804">1. Creating sequences.</a></span></dt><dt><span class="section"><a href="#d0e900">2. Multiple accessions.</a></span></dt><dt><span class="section"><a href="#d0e909">3. Circular sequences.</a></span></dt></dl></dd><dt><span class="chapter"><a href="#d0e925">7. Relationships between sequences.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e928">1. Relating two sequences.</a></span></dt><dt><span class="section"><a href="#d0e942">2. 
Querying the relationship.</a></span></dt></dl></dd><dt><span class="chapter"><a href="#d0e951">8. Reading and writing files.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e954">1. Tools for reading/writing files.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e996">1.1. Reading using <code class="code">RichStreamReader</code>.</a></span></dt><dt><span class="section"><a href="#d0e1108">1.2. Writing using <code class="code">RichStreamWriter</code>.</a></span></dt><dt><span class="section"><a href="#d0e1161">1.3. Example.</a></span></dt><dt><span class="section"><a href="#d0e1168">1.4. Line widths and eliding information.</a></span></dt><dt><span class="section"><a href="#d0e1218">1.5. How parsed data becomes a sequence.</a></span></dt></dl></dd><dt><span class="section"><a href="#d0e1501">2. FASTA.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e1508">2.1. Reading.</a></span></dt><dt><span class="section"><a href="#d0e1584">2.2. Writing.</a></span></dt></dl></dd><dt><span class="section"><a href="#d0e1649">3. GenBank.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e1656">3.1. Reading.</a></span></dt><dt><span class="section"><a href="#d0e1906">3.2. Writing.</a></span></dt></dl></dd><dt><span class="section"><a href="#d0e2107">4. EMBL.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e2114">4.1. Reading.</a></span></dt><dt><span class="section"><a href="#d0e2348">4.2. Writing.</a></span></dt></dl></dd><dt><span class="section"><a href="#d0e2581">5. UniProt.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e2588">5.1. Reading.</a></span></dt><dt><span class="section"><a href="#d0e2883">5.2. Writing.</a></span></dt></dl></dd><dt><span class="section"><a href="#d0e3154">6. INSDSeq (XML).</a></span></dt><dt><span class="section"><a href="#d0e3166">7. EMBLxml (XML).</a></span></dt><dt><span class="section"><a href="#d0e3189">8. 
UniProtXML (XML).</a></span></dt><dt><span class="section"><a href="#d0e3369">9. New formats</a></span></dt><dt><span class="section"><a href="#d0e3388">10. NCBI Taxonomy data.</a></span></dt></dl></dd><dt><span class="chapter"><a href="#d0e3417">9. Creative file parsing with <code class="code">RichSeqIOListener</code>.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e3423">1. Using <code class="code">RichSeqIOListener</code>s directly.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e3431">1.1. Listening to events only.</a></span></dt><dt><span class="section"><a href="#d0e3452">1.2. Constructing sequences from events.</a></span></dt></dl></dd><dt><span class="section"><a href="#d0e3488">2. Parsing only specific fields.</a></span></dt></dl></dd><dt><span class="chapter"><a href="#d0e3509">10. Publication cross-references.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e3512">1. Everything is a 'journal article'.</a></span></dt><dt><span class="section"><a href="#d0e3551">2. Editors and consortiums as authors.</a></span></dt></dl></dd><dt><span class="chapter"><a href="#d0e3571">11. Database cross-references.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e3574">1. Database names.</a></span></dt><dt><span class="section"><a href="#d0e3600">2. Accessions and versions.</a></span></dt></dl></dd><dt><span class="chapter"><a href="#d0e3610">12. Working with <code class="code">RichLocation</code> objects.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e3616">1. Working with locations.</a></span></dt><dt><span class="section"><a href="#d0e3737">2. Strandedness.</a></span></dt><dt><span class="section"><a href="#d0e3759">3. Remote locations.</a></span></dt><dt><span class="section"><a href="#d0e3794">4. Resolving fuzziness.</a></span></dt><dt><span class="section"><a href="#d0e3813">5. Translation.</a></span></dt><dt><span class="section"><a href="#d0e3821">6. 
Empty locations.</a></span></dt><dt><span class="section"><a href="#d0e3832">7. Circular locations.</a></span></dt><dt><span class="section"><a href="#d0e3847">8. Union.</a></span></dt><dt><span class="section"><a href="#d0e3858">9. Intersection.</a></span></dt><dt><span class="section"><a href="#d0e3871">10. Overlaps.</a></span></dt><dt><span class="section"><a href="#d0e3884">11. Contains.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e3891">11.1. Point coordinates.</a></span></dt><dt><span class="section"><a href="#d0e3898">11.2. Other locations.</a></span></dt></dl></dd><dt><span class="section"><a href="#d0e3907">12. Obtaining the symbols for a location.</a></span></dt></dl></dd><dt><span class="chapter"><a href="#d0e3945">13. Features</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e3948">1. Adding features to a <code class="code">RichSequence</code>.</a></span></dt><dt><span class="section"><a href="#d0e3962">2. Qualifiers as annotations.</a></span></dt><dt><span class="section"><a href="#d0e3977">3. Obtaining the symbols for a feature.</a></span></dt></dl></dd><dt><span class="chapter"><a href="#d0e3988">14. Relationships between features.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e3991">1. Relating two features.</a></span></dt><dt><span class="section"><a href="#d0e4005">2. Querying the relationship.</a></span></dt></dl></dd><dt><span class="chapter"><a href="#d0e4014">15. Annotations and Comments.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e4017">1. Annotations.</a></span></dt><dt><span class="section"><a href="#d0e4056">2. Comments.</a></span></dt><dt><span class="section"><a href="#d0e4072">3. UniProt structured comments.</a></span></dt></dl></dd><dt><span class="chapter"><a href="#d0e4092">16. Namespaces.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e4095">1. 
Obtaining <code class="code">Namespace</code> instances.</a></span></dt></dl></dd><dt><span class="chapter"><a href="#d0e4113">17. NCBI Taxonomy.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e4116">1. Traversing from child to parent.</a></span></dt><dt><span class="section"><a href="#d0e4121">2. Traversing from parent to child.</a></span></dt><dt><span class="section"><a href="#d0e4130">3. Finding taxons by name.</a></span></dt></dl></dd><dt><span class="chapter"><a href="#d0e4139">18. BioEntry and RichSequence Databases</a></span></dt><dt><span class="chapter"><a href="#d0e4206">19. BioSQL and Hibernate.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e4209">1. Introduction to Hibernate.</a></span></dt><dt><span class="section"><a href="#d0e4234">2. Hibernate object-relational mappings.</a></span></dt><dt><span class="section"><a href="#d0e4470">3. Configuring your application to use Hibernate and BioSQL.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e4497">3.1. Installing Hibernate.</a></span></dt><dt><span class="section"><a href="#d0e4509">3.2. Copying and configuring the mapping files.</a></span></dt><dt><span class="section"><a href="#d0e4548">3.3. Opening and closing sessions.</a></span></dt><dt><span class="section"><a href="#d0e4579">3.4. Transactions.</a></span></dt><dt><span class="section"><a href="#d0e4590">3.5. Complete example.</a></span></dt></dl></dd><dt><span class="section"><a href="#d0e4605">4. Flattened locations.</a></span></dt><dt><span class="section"><a href="#d0e4617">5. Persisting objects.</a></span></dt><dt><span class="section"><a href="#d0e4636">6. Loading objects.</a></span></dt><dt><span class="section"><a href="#d0e4681">7. Loading individual values from objects.</a></span></dt><dt><span class="section"><a href="#d0e4698">8. Deleting objects.</a></span></dt><dt><span class="section"><a href="#d0e4707">9. 
Auto-generating the BioSQL schema.</a></span></dt><dt><span class="section"><a href="#d0e4718">10. Reading/writing objects as XML.</a></span></dt><dt><span class="section"><a href="#d0e4734">11. BioEntryDB and RichSequenceDB convenience wrappers</a></span></dt><dt><span class="section"><a href="#d0e4778">12. BioSQLFeatureFilter</a></span></dt><dt><span class="section"><a href="#d0e4843">13. <code class="code">ThinSequence</code>s and <code class="code">Feature</code>s</a></span></dt></dl></dd><dt><span class="chapter"><a href="#d0e4908">20. Genetic Algorithms.</a></span></dt><dd><dl><dt><span class="section"><a href="#d0e4911">1. Overview.</a></span></dt><dt><span class="section"><a href="#d0e4956">2. Example listing.</a></span></dt></dl></dd></dl></div><div class="chapter" lang="en"><div class="titlepage"><div><div><h2 class="title"><a name="d0e19"></a>Chapter 1. BioJavaX is not BioJava 2 is not BioJavaX.</h2></div></div></div><p>BioJavaX is an extension to the existing BioJava project. Anything written with BioJava will work with BioJavaX, and vice versa.</p><p><code class="code">org.biojavax</code> is to <code class="code">org.biojava</code> as <code class="code">javax</code> is to <code class="code">java</code>.</p><p>The BioJava2 project is a completely new project which intends to rewrite everything in BioJava from scratch, based around a new set of object designs and concepts. It is entirely incompatible with the existing BioJava project.</p><p>Therefore BioJavaX is <span class="emphasis"><em>not</em></span> BioJava 2, and has nothing to do with it. Please don't get them confused!</p></div><div class="chapter" lang="en"><div class="titlepage"><div><div><h2 class="title"><a name="d0e44"></a>Chapter 2. What didn't change?</h2></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e47"></a>1. 
Existing interfaces.</h2></div></div></div><p>Backwards-compatibility is always an issue when a major new version of a piece of software is released.</p><p>BioJavaX addresses this by keeping all the new classes and interfaces tucked away inside their own special package, <code class="code">org.biojavax</code>. None of the existing interfaces were modified in any way, so any code which depends on them will not see any difference.</p><p>Apart from ongoing bugfixes, the way in which the existing classes work also has not changed.</p><p>The new interfaces introduced in BioJavaX extend those present in the existing BioJava packages. This allows new BioJavaX-derived objects to be passed to legacy code and still be understood.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e61"></a>2. Change listeners.</h2></div></div></div><p>BioJava's change listener model is intact and unchanged. The new BioJavaX classes define a set of extra change types which they fire in addition to the ones generated by existing BioJava classes.</p><p>This means that existing change listeners can be attached to BioJavaX-derived objects and still receive all the information they would normally receive.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e68"></a>3. Event-based file parsing.</h2></div></div></div><p>BioJavaX still uses event-based file parsing to read and write files, in exactly the same way as the old BioJava classes did.</p><p>However, you cannot use existing event listeners with the new BioJavaX file parsers. You must alter the listeners to extend the new <code class="code">org.biojavax.bio.seq.io.RichSeqIOListener</code> interface instead.</p></div></div><div class="chapter" lang="en"><div class="titlepage"><div><div><h2 class="title"><a name="d0e78"></a>Chapter 3. 
What did change?</h2></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e81"></a>1. System requirements.</h2></div></div></div><p>Java 1.4 is required for all BioJavaX packages.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e86"></a>2. Rich interfaces.</h2></div></div></div><p>BioJavaX defines a new set of interfaces for working with sequence objects. These interfaces are closely modelled on the BioSQL 1.0 schema.</p><p>The new interfaces extend existing interfaces wherever possible, in order to allow backwards-compatibility with legacy code. These interfaces are known as rich interfaces, as they could be said to be 'enriched' versions of the interfaces that they extend.</p><p>Instances of implementing classes are known as rich objects, with legacy instances known as plain ones.</p><p>Here is a list of the new rich interfaces:</p><div class="itemizedlist"><ul type="disc"><li><p><code class="code">ComparableOntology</code> (extends <code class="code">Ontology</code>)</p></li><li><p><code class="code">ComparableTerm</code> (extends <code class="code">Term</code>)</p></li><li><p><code class="code">ComparableTriple</code> (extends <code class="code">Triple</code>)</p></li><li><p><code class="code">RichSequenceIterator</code> (extends <code class="code">SequenceIterator</code>)</p></li><li><p><code class="code">RichSequence</code> (extends <code class="code">Sequence</code>)</p></li><li><p><code class="code">RichLocation</code> (extends <code class="code">Location</code>)</p></li><li><p><code class="code">RichFeature</code> (extends <code class="code">StrandedFeature</code>)</p></li><li><p><code class="code">RichFeatureHolder</code> (extends <code class="code">FeatureHolder</code>)</p></li><li><p><code class="code">RichAnnotatable</code> (extends <code class="code">Annotatable</code>)</p></li><li><p><code 
class="code">RichAnnotation</code> (extends <code class="code">Annotation</code>)</p></li><li><p><code class="code">BioSQLFeatureFilter</code> (extends <code class="code">FeatureFilter</code>)</p></li><li><p><code class="code">RichSequenceDB</code> (extends <code class="code">SequenceDB</code>)</p></li></ul></div><p>Wherever possible in BioJavaX, conversions are attempted if a method expecting a rich object receives a plain one. You can perform these conversions yourself by using the <code class="code">Tools</code> sub-class of the appropriate rich interface, for example to convert an old <code class="code">Sequence</code> object into a new <code class="code">RichSequence</code> object, you can do this:</p><pre class="programlisting">Sequence s = ...; // get an old Sequence object from somewhere
RichSequence rs = RichSequence.Tools.enrich(s);</pre><p>The conversion process does its best, but it is not perfect. Much of the way information is stored in the new BioJavaX object model is fundamentally incompatible with the old object model. So it's always best to deal with <code class="code">RichSequence</code> objects from the word go and try to avoid instantiating older <code class="code">Sequence</code> objects as far as possible.</p><p>Other new interfaces define new concepts, or replace old interfaces entirely due to a fundamental clash in the way they see the world. Here is a list:</p><div class="itemizedlist"><ul type="disc"><li><p><code class="code">NCBITaxon</code></p></li><li><p><code class="code">BioEntry</code></p></li><li><p><code class="code">RichObjectBuilder</code></p></li><li><p><code class="code">RichSequenceHandler</code></p></li><li><p><code class="code">Comment</code></p></li><li><p><code class="code">CrossRef</code></p></li><li><p><code class="code">CrossReferenceResolver</code></p></li><li><p><code class="code">DocRef</code></p></li><li><p><code class="code">DocRefAuthor</code></p></li><li><p><code class="code">Namespace</code></p></li><li><p><code class="code">Note</code></p></li><li><p><code class="code">RankedCrossRef</code></p></li><li><p><code class="code">RankedCrossRefable</code></p></li><li><p><code class="code">RankedDocRef</code></p></li><li><p><code class="code">BioEntryRelationship</code></p></li><li><p><code class="code">Position</code></p></li><li><p><code class="code">PositionResolver</code></p></li><li><p><code class="code">RichFeatureRelationship</code></p></li><li><p><code class="code">BioEntryDB</code></p></li></ul></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e293"></a>3. 
BioSQL persistence.</h2></div></div></div><p>BioJavaX introduces a whole new way of working with BioSQL databases.</p><p>Instead of attempting to re-invent the wheel with yet another new object-relational mapping system, BioJavaX uses the services of Hibernate to do all the dirty work for it. In fact, there is not a single SQL statement anywhere in the BioJavaX code.</p><p>The use of Hibernate allows users to have as much or as little control as they like over transactions and query optimisation. The Hibernate query language, HQL, is simple to learn and easy to use.</p><p>You can find out more about the Hibernate project at their website: <a href="http://www.hibernate.org/" target="_top">http://www.hibernate.org/</a></p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e306"></a>4. Better file parsers.</h2></div></div></div><p>The old BioJava file parsers worked in that they loaded all information into memory, but they made little attempt to understand the contents of the files, and they often failed miserably when trying to convert between formats.</p><p>The new parsers supplied with BioJavaX put a lot of effort into trying to fit data from the myriad of file formats out there into a form representable by BioSQL, and hence by the new BioJavaX object model. Of course this isn't always possible, but they do a much better job than the old ones.</p><p>By parsing data into a fixed object model instead of storing everything as annotations (as was the case, for instance, with the old SwissProt parsers), conversion between file formats becomes much easier.</p><p>The new file parsers also allow you to skip uninteresting parts of the file altogether, greatly speeding up simple tasks such as counting the number of sequences in a file.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e317"></a>5. 
NCBI Taxonomy loader.</h2></div></div></div><p>A parser is provided for loading the NCBI Taxonomy database into a set of BioJavaX <code class="code">NCBITaxon</code> objects. This parser reads the <code class="filename">nodes.dmp</code> and <code class="filename">names.dmp</code> files supplied by NCBI and constructs the appropriate hierarchy of objects. If you are using BioSQL, it can persist this hierarchy to the database as it goes.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e331"></a>6. Namespaces.</h2></div></div></div><p>All sequences in BioJavaX must belong to a namespace.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e336"></a>7. Singletons.</h2></div></div></div><p>BioJavaX tries to use singletons as far as possible. This is:</p><div class="itemizedlist"><ul type="disc"><li><p>to reduce memory usage.</p></li><li><p>to prevent problems with duplicate keys when persisting to BioSQL.</p></li></ul></div><p>The singletons are kept in an LRU cache managed by a <code class="code">RichObjectFactory</code>. See the chapter on this subject later in this book.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e353"></a>8. Genetic algorithms.</h2></div></div></div><p>BioJavaX introduces a new package for working with genetic algorithms.</p></div></div><div class="chapter" lang="en"><div class="titlepage"><div><div><h2 class="title"><a name="d0e358"></a>Chapter 4. Future plans.</h2></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e361"></a>1. BioPerl and BioPerl-DB compatibility.</h2></div></div></div><p>We tried our best to store sequence data into BioSQL in the same way as BioPerl-DB does. 
We also tried to parse files in such a way that data from files would end up in the same place in BioSQL as if it had been parsed using the BioPerl file parsers then persisted using BioPerl-DB.</p><p>However, we may not have been entirely successful, particularly with regard to the naming conventions of annotations and feature qualifiers, and the use of the document and publication cross-reference tables. Likewise, our definition of fuzzy locations may differ.</p><p>So, we intend in the future to try and consolidate our efforts with those of the BioPerl and BioPerl-DB projects, along with any of the other Bio* projects who provide BioSQL persistence functionality, so that we can all read and write data to and from BioSQL in the same way.</p><p>The goal is to be able to read a file with any of the Bio* projects, persist it to the database, then read it back from the database using any of the other Bio* projects and write it out to file. The input and output files should be logically identical (give or take some minor layout or formatting issues).</p><p>Help is needed!</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e374"></a>2. Efficient parsing.</h2></div></div></div><p>The event-based parser model works great, but our implementations of actual file parsing code may leave a lot to be desired in terms of efficient use of memory or minimising the number of uses of markers in the input stream.</p><p>If you are an IO, parsing, or code optimisation guru, you would be most welcome to come have a look and speed things up a bit.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e381"></a>3. More file formats supported.</h2></div></div></div><p>We've provided parsers (and writers) for all the major formats we thought would be necessary. 
But there are only two of us, and it takes a while to trawl through the documentation for each format and try to shoehorn it all into the BioSQL model, even before the actual coding begins.</p><p>If there's a format you like and use daily and you think would be of use to others, but you can't find it in BioJavaX, then please do write a parser for it and contribute it to the project.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e388"></a>4. Persistence to non-BioSQL databases.</h2></div></div></div><p>Basically, right now, you can't. We have only provided Hibernate mappings for BioSQL.</p><p>There is no reason though why you can't write a new set of Hibernate XML mapping files that map the BioJavaX objects into tables in some other database format. Because of the way Hibernate works, you wouldn't have to change any of the BioJavaX code at all, only the mapping files that tell Hibernate how to translate between objects and tables.</p><p>If you do, and you think someone else could benefit from your work, please consider contributing them to the BioJava project for everyone to enjoy.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e397"></a>5. Java 1.5 and Generics.</h2></div></div></div><p>Much discussion has occurred recently about upgrading BioJava to use features only available since version 1.5 of Java (also known as Java 5). Mostly we are considering the use of generics.</p><p>A lot of this started after some Java 1.5 features accidentally slipped into the biojava-live CVS branch one day and suddenly nobody using older JVMs could compile it any more. 
These were quickly removed, and it was agreed to wait a while before a decision was made about the ultimate use of such features.</p><p>Java 1.5 offers a lot of features that would be very useful in BioJava, and has the potential to greatly reduce the size of the project's codebase. However, 1.5 compilers and runtime environments are not available for some platforms yet, and in other situations companies are reluctant to upgrade when they have already settled on 1.4 as their tested and accepted Java environment.</p><p>So, we won't do it yet, but we would definitely like to change in future.</p></div></div><div class="chapter" lang="en"><div class="titlepage"><div><div><h2 class="title"><a name="d0e408"></a>Chapter 5. Singletons and the <code class="code">RichObjectFactory</code>.</h2></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e414"></a>1. Using <code class="code">RichObjectFactory</code>.</h2></div></div></div><p>BioJavaX revolves around the use of singleton instances. This is important to keep memory usage down, and becomes even more important when working with BioSQL databases via Hibernate to prevent duplicate records in tables. Singletons are generated in a singleton factory.</p><p><code class="code">RichObjectFactory</code> is a caching singleton factory. If you request lots of instances of the same class, the oldest ones are forgotten about and you will get a new instance next time you ask for it. This is to prevent memory blowouts. 
The default size of this LRU cache is 20 instances of each class.</p><p>Singletons are only important when dealing with certain classes:</p><div class="itemizedlist"><ul type="disc"><li><p><code class="code">SimpleNamespace</code></p></li><li><p><code class="code">SimpleComparableOntology</code></p></li><li><p><code class="code">SimpleNCBITaxon</code></p></li><li><p><code class="code">SimpleCrossRef</code></p></li><li><p><code class="code">SimpleDocRef</code></p></li></ul></div><p>In all other cases, you don't need to worry about singletons. In fact, the singleton factory may complain if you try to ask it to make a singleton of any class not listed above.</p><p>To generate a new instance of any of the above, you must use the <code class="code">RichObjectFactory</code>. This tool checks an LRU cache to see if you have requested an identical instance recently. If you have, it returns that instance (a singleton). If you haven't, then it creates the instance, adds it to the LRU cache, then returns it.</p><p>The parameters you supply to the <code class="code">RichObjectFactory</code> are a class name, and an array of parameters which you would normally have passed directly to that class' constructor. Here is a list of the parameters required, and an example, for each of the classes accepted by the current factory:</p><div class="table"><a name="d0e461"></a><p class="title"><b>Table 5.1. <code class="code">RichObjectFactory</code> singleton examples.</b></p><table summary="RichObjectFactory singleton examples." border="1"><colgroup><col width="33%"><col width="33%"><col width="34%"></colgroup><tbody><tr><td><code class="code">SimpleNamespace</code></td><td>[name (<code class="code">String</code>)]</td><td><pre class="programlisting">Namespace ns = (Namespace)RichObjectFactory.getObject(
SimpleNamespace.class,
new Object[]{"myNamespace"}
);</pre></td></tr><tr><td><code class="code">SimpleComparableOntology</code></td><td>[name (<code class="code">String</code>)]</td><td><pre class="programlisting">ComparableOntology ont = (ComparableOntology)RichObjectFactory.getObject(
SimpleComparableOntology.class,
new Object[]{"myOntology"}
);</pre></td></tr><tr><td><code class="code">SimpleNCBITaxon</code></td><td>[taxID (<code class="code">Integer</code>)]</td><td><pre class="programlisting">Integer taxID = new Integer(12345);
NCBITaxon tax = (NCBITaxon)RichObjectFactory.getObject(
SimpleNCBITaxon.class,
new Object[]{taxID}
);</pre></td></tr><tr><td><code class="code">SimpleCrossRef</code></td><td>[databaseName (<code class="code">String</code>), accession (<code class="code">String</code>), version (<code class="code">Integer</code>)]</td><td><pre class="programlisting">Integer version = new Integer(0);
CrossRef cr = (CrossRef)RichObjectFactory.getObject(
SimpleCrossRef.class,
new Object[]{"PUBMED","56789",version}
);</pre></td></tr><tr><td><code class="code">SimpleDocRef</code></td><td>[authors (<code class="code">List</code> of <code class="code">DocRefAuthor</code>), location (<code class="code">String</code>)]</td><td><pre class="programlisting">DocRefAuthor author = new SimpleDocRefAuthor("Bloggs,J.");
List authors = new ArrayList();
authors.add(author);
DocRef dr = (DocRef)RichObjectFactory.getObject(
SimpleDocRef.class,
new Object[]{authors,"Journal of Voodoo Virology, 2005, 23:55-57"}
);</pre></td></tr></tbody></table></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e540"></a>2. Where the singletons come from.</h2></div></div></div><p>The actual instances of the classes requested are generated using a <code class="code">RichObjectBuilder</code>. The default <code class="code">RichObjectBuilder</code>, <code class="code">SimpleRichObjectBuilder</code>, uses introspection to call the constructors on the classes and create new instances. You do not need to do anything to set this up.</p><p>If you do decide to write your own <code class="code">RichObjectBuilder</code> for whatever reason, you can set it to be used by <code class="code">RichObjectFactory</code> like this:</p><pre class="programlisting">RichObjectBuilder builder = ...; // create your own one here
RichObjectFactory.setRichObjectBuilder(builder); // make the factory use it from now on</pre><p>If you change the default <code class="code">RichObjectBuilder</code> to a different one, you <span class="emphasis"><em>must</em></span> do so at the very beginning of your program before any call to the <code class="code">RichObjectFactory</code> has been made. This is because when the builder is changed, existing singletons or default instances are not removed. If you do not follow this guideline, you will end up with a mix of objects in the cache created by two different builders, which could lead to interesting situations.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e575"></a>3. Hibernate singletons.</h2></div></div></div><p>When working with Hibernate, you <span class="emphasis"><em>must</em></span> connect BioJavaX to Hibernate by calling <code class="code">RichObjectFactory.connectToBioSQL(session)</code> and passing it your session object. When using this, instances are looked up in the underlying BioSQL database first to see if they exist. If they do, they are loaded and returned. 
If not, they are created, then returned.</p><p>The instances returned by <code class="code">RichObjectFactory</code> when connected to Hibernate are guaranteed true singletons and will never be duplicated even if you fill up the LRU cache several times between requests.</p><p>You can replicate the behaviour of <code class="code">RichObjectFactory.connectToBioSQL(session)</code> by instantiating <code class="code">BioSQLRichObjectBuilder</code>, <code class="code">BioSQLCrossReferenceResolver</code> and <code class="code">BioSQLRichSequenceHandler</code> objects and passing these to the appropriate methods in <code class="code">RichObjectFactory</code>.</p><p>See the section on BioSQL and Hibernate later in this document for more details.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e607"></a>4. Managing the LRU cache.</h2></div></div></div><p>By default, the LRU cache keeps the 20 most recently requested instances of any given class in memory. If more than 20 objects are requested, the oldest ones are removed from the cache before the new ones are added. This keeps memory usage at a minimum.</p><p>If you are experiencing problems with duplicate instances when you expected singletons, or believe that a larger or smaller cache may help the performance of your application, then you can change the size of the LRU cache. There are two ways of doing this.</p><p>Changes to the LRU cache size are not instantaneous. The size of the cache only changes physically next time an instance is requested from it. Even then, only the cache of instances of the class requested will actually change.</p><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e616"></a>4.1. Global LRU cache size.</h3></div></div></div><p>Changing the global LRU cache size will change the cache size for all classes. It applies the new cache size to every single class. 
Next time any of those classes are accessed via the RichObjectFactory, the LRU cache for that class will adjust to the new size.</p><pre class="programlisting">RichObjectFactory.setLRUCacheSize(50); // increases the global LRU cache size to 50 instances per class</pre></div><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e623"></a>4.2. Class-specific LRU cache size.</h3></div></div></div><p>Changing the LRU cache size for a specific class will only affect that class. Your class-specific settings will be lost if you later change the global LRU cache size.</p><pre class="programlisting">RichObjectFactory.setLRUCacheSize(SimpleNamespace.class, 50); // increases the LRU cache for SimpleNamespace instances to 50</pre></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e630"></a>5. Convenience methods.</h2></div></div></div><p>A number of convenience methods are provided by the RichObjectFactory to allow easy access to some useful default singletons:</p><div class="table"><a name="d0e635"></a><p class="title"><b>Table 5.2. <code class="code">RichObjectFactory</code> convenience methods.</b></p><table summary="RichObjectFactory convenience methods." border="1"><colgroup><col width="50%"><col width="50%"></colgroup><tbody><tr><td><code class="code">void setDefaultNamespaceName(String name);</code></td><td>Sets the name of the default namespace. This namespace is used when loading files which have no namespace information of their own, and when no namespace has been passed to the file loading routines. 
It can also be used when creating temporary <code class="code">RichSequence</code> or <code class="code">BioEntry</code> objects, as the namespace parameter is compulsory on these objects.</td></tr><tr><td><code class="code">Namespace getDefaultNamespace();</code></td><td>Returns the default namespace singleton instance (delegates to <code class="code">getObject()</code>).</td></tr><tr><td><code class="code">void setDefaultOntologyName(String name);</code></td><td>Sets the name of the default ontology. When parsing files, new terms are often created. If the file format does not have an ontology of its own, then it will use the default ontology to store these terms. Terms commonly used throughout BioJavaX, including those common to all file formats, are also stored in the default ontology.</td></tr><tr><td><code class="code">ComparableOntology getDefaultOntology();</code></td><td>Returns the default ontology singleton instance (delegates to <code class="code">getObject()</code>).</td></tr><tr><td><code class="code">void setDefaultPositionResolver(PositionResolver pr);</code></td><td>When converting fuzzy locations into actual physical locations, a <code class="code">PositionResolver</code> instance is used. The default one is <code class="code">AveragePositionResolver</code>, which averages out the range of fuzziness to provide a value somewhere in the middle. You can override this setting using this function. All locations that are resolved without explicitly specifying a <code class="code">PositionResolver</code> to use will then use this resolver to do the work.</td></tr><tr><td><code class="code">PositionResolver getDefaultPositionResolver();</code></td><td>Returns the default position resolver.</td></tr><tr><td><code class="code">void setDefaultCrossReferenceResolver(CrossReferenceResolver cr);</code></td><td><code class="code">CrossRef</code> instances are links to other databases. 
When a <code class="code">CrossRef</code> is used in a <code class="code">RichLocation</code> instance, it means that to obtain the symbols (sequence) for that location, it must first retrieve the remote sequence object. The <code class="code">CrossReferenceResolver</code> object specified using this method is used to carry this out. The default implementation of this interface is <code class="code">DummyCrossReferenceResolver</code>, which always returns infinitely ambiguous symbol lists and cannot look up any remote sequence objects. Use <code class="code">BioSQLCrossReferenceResolver</code> instead (or use <code class="code">RichObjectFactory.connectToBioSQL(session)</code>) if you are using Hibernate, which is able to actually look up the sequences (if they exist in your database).</td></tr><tr><td><code class="code">CrossReferenceResolver getDefaultCrossReferenceResolver();</code></td><td>Returns the default cross reference resolver.</td></tr><tr><td><code class="code">void setDefaultRichSequenceHandler(RichSequenceHandler rh);</code></td><td>Calls to <code class="code">RichSequence</code> methods which reference sequence data will delegate to this handler to carry the requests out. The default implementation is a <code class="code">DummyRichSequenceHandler</code>, which just uses the internal <code class="code">SymbolList</code> of the <code class="code">RichSequence</code> to look up the data. 
When this is set to a <code class="code">BioSQLRichSequenceHandler</code>, the handler will go to the database to look up the information instead of keeping an in-memory copy of it.</td></tr><tr><td><code class="code">RichSequenceHandler getDefaultRichSequenceHandler();</code></td><td>Returns the default rich sequence handler.</td></tr><tr><td><code class="code">void connectToBioSQL(Object session);</code></td><td>Instantiates <code class="code">BioSQLCrossReferenceResolver</code>, <code class="code">BioSQLRichObjectBuilder</code> and <code class="code">BioSQLRichSequenceHandler</code> using the Hibernate session object provided, and sets these objects as the default instances. After this call, the factory will try to look up all object requests in the underlying database first.</td></tr></tbody></table></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e771"></a>6. Default settings.</h2></div></div></div><p>The default namespace name is <code class="code">lcl</code>.</p><p>The default ontology name is <code class="code">biojavax</code>.</p><p>The default LRU cache size is 20.</p><p>The default position resolver is <code class="code">AveragePositionResolver</code>.</p><p>The default cross reference resolver is <code class="code">DummyCrossReferenceResolver</code>.</p><p>The default rich sequence handler is <code class="code">DummyRichSequenceHandler</code>.</p></div></div><div class="chapter" lang="en"><div class="titlepage"><div><div><h2 class="title"><a name="d0e801"></a>Chapter 6. Working with sequences.</h2></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e804"></a>1. Creating sequences.</h2></div></div></div><p>BioJavaX has a two-tier definition of sequence data.</p><p><code class="code">BioEntry</code> objects correspond to the <code class="code">bioentry</code> table in BioSQL. 
They do not have any sequence information, and neither do they have any features. They can, however, be annotated, commented, and put into relationships with each other. They can also have cross-references to publications and other databases associated with them.</p><p><code class="code">RichSequence</code> objects extend <code class="code">BioEntry</code> objects by adding in sequence data and a feature table.</p><p>So, when to use them?</p><div class="itemizedlist"><ul type="disc"><li><p><code class="code">BioEntry</code> objects are most useful when performing simple operations such as counting sequences, checking taxonomy data, looking up accessions, or finding out things like which objects refer to a particular PUBMED entry.</p></li><li><p><code class="code">RichSequence</code> objects are useful only when you need access to the sequence data itself, or to the sequence feature table.</p></li><li><p><code class="code">RichSequence</code> objects must be used whenever you wish to pass objects to legacy code that is expecting <code class="code">Sequence</code> objects, as only <code class="code">RichSequence</code> objects implement the <code class="code">Sequence</code> interface. 
<code class="code">BioEntry</code> objects do not.</p></li></ul></div><p>Throughout the rest of this document, both <code class="code">BioEntry</code> and <code class="code">RichSequence</code> objects will be referred to interchangeably as sequence objects.</p><p>To create a <code class="code">BioEntry</code> object, you need to have at least the following information:</p><div class="itemizedlist"><ul type="disc"><li><p>a <code class="code">Namespace</code> instance to associate the sequence with (use <code class="code">RichObjectFactory.getDefaultNamespace()</code> for an easy way out)</p></li><li><p>a name for the sequence</p></li><li><p>an accession for the sequence</p></li><li><p>a version for the sequence (use 0 if you don't want to bother with versions)</p></li></ul></div><p>To create a <code class="code">RichSequence</code> object, you need to have all the above plus:</p><div class="itemizedlist"><ul type="disc"><li><p>a <code class="code">SymbolList</code> containing the sequence data</p></li><li><p>a version for the sequence data (this is separate from the version of the sequence object)</p></li></ul></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e900"></a>2. Multiple accessions.</h2></div></div></div><p>If you wish to assign multiple accessions to a sequence, you must do so using the special term provided, like this:</p><pre class="programlisting">ComparableTerm accTerm = RichSequence.Terms.getAdditionalAccessionTerm();
Note accession1 = new SimpleNote(accTerm,"A12345",1); // this note has an arbitrary rank of 1
Note accession2 = new SimpleNote(accTerm,"Z56789",2); // this note has an arbitrary rank of 2
...
RichSequence rs = ...; // get a rich sequence from somewhere
rs.getNoteSet().add(accession1); // annotate the rich sequence with the first additional accession
rs.getNoteSet().add(accession2); // annotate the rich sequence with the second additional accession
...
// you can annotate bioentry objects in exactly the same way
BioEntry be = ...; // get a bioentry from somewhere
be.getNoteSet().add(accession1);
be.getNoteSet().add(accession2); </pre><p>See later in this document for more information on how to annotate and comment on sequences.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e909"></a>3. Circular sequences.</h2></div></div></div><p>BioJavaX can flag sequences as being circular, using the <code class="code">setCircular()</code> and <code class="code">getCircular()</code> methods on <code class="code">RichSequence</code> instances. However, as this information is not part of BioSQL, it will be lost when the sequence is persisted to a BioSQL database. Use with care.</p><p>Note that only circular sequences can have features with circular locations associated with them.</p></div></div><div class="chapter" lang="en"><div class="titlepage"><div><div><h2 class="title"><a name="d0e925"></a>Chapter 7. Relationships between sequences.</h2></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e928"></a>1. Relating two sequences.</h2></div></div></div><p>Two sequences can be related to each other by using a <code class="code">BioEntryRelationship</code> object to construct the link.</p><p>Relationships are optionally ranked. If you don't want to rank the relationship, use null in the constructor.</p><p>The following code snippet defines a new term "contains" in the default ontology, then creates a relationship that states that sequence A (the parent) contains sequence B (the child):</p><pre class="programlisting">ComparableTerm contains = RichObjectFactory.getDefaultOntology().getOrCreateTerm("contains");
...
RichSequence parent = ...; // get sequence A from somewhere
RichSequence child = ...; // get sequence B from somewhere
BioEntryRelationship relationship = new SimpleBioEntryRelationship(parent,child,contains,null);
parent.addRelationship(relationship); // add the relationship to the parent
...
parent.removeRelationship(relationship); // you can always take it away again later</pre></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e942"></a>2. Querying the relationship.</h2></div></div></div><p>Sequences are only aware of relationships in which they are the parent sequence. A child sequence cannot find out which parent sequences it is related to.</p><p>The following code snippet prints out all the relationships a sequence has with child sequences:</p><pre class="programlisting">RichSequence rs = ...; // get a rich sequence from somewhere
for (Iterator i = rs.getRelationships().iterator(); i.hasNext(); ) {
BioEntryRelationship br = (BioEntryRelationship)i.next();
BioEntry parent = br.getObject(); // parent == rs
BioEntry child = br.getSubject();
ComparableTerm relationship = br.getTerm();
// print out the relationship (eg. "A contains B");
System.out.println(parent.getName()+" "+relationship.getName()+" "+child.getName());
}</pre></div></div><div class="chapter" lang="en"><div class="titlepage"><div><div><h2 class="title"><a name="d0e951"></a>Chapter 8. Reading and writing files.</h2></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e954"></a>1. Tools for reading/writing files.</h2></div></div></div><p>BioJavaX provides a replacement set of tools for working with files. This is necessary because the new file parsers must work with the new <code class="code">RichSeqIOListener</code> in order to preserve all the information from the file correctly.</p><p>The tools can all be found in <code class="code">RichSequence.IOTools</code>, a nested class of the <code class="code">RichSequence</code> interface. For each file format there are a number of utility methods in this class for reading a variety of sequence types, and writing them out again. See later sections of this chapter for details on individual formats.</p><p>Here is an example of using the <code class="code">RichSequence.IOTools</code> methods. The example reads a file in Genbank format containing some DNA sequences, then prints them out to standard out (the screen) in EMBL format:</p><pre class="programlisting">BufferedReader br = new BufferedReader(new FileReader("myGenbank.gbk")); // an input GenBank file
Namespace ns = RichObjectFactory.getDefaultNamespace(); // a namespace to override that in the file
RichSequenceIterator seqs = RichSequence.IOTools.readGenbankDNA(br,ns); // we are reading DNA sequences
while (seqs.hasNext()) {
RichSequence rs = seqs.nextRichSequence();
RichSequence.IOTools.writeEMBL(System.out, rs, ns); // write it in EMBL format to standard out
}</pre><p>If you wish to output a number of sequences in one of the XML formats, you have to pass a <code class="code">RichSequenceIterator</code> over your collection of sequences in order for the XML format to group them together into a single file with the correct headers:</p><pre class="programlisting">BufferedReader br = new BufferedReader(new FileReader("myGenbank.gbk")); // an input GenBank file
Namespace ns = RichObjectFactory.getDefaultNamespace(); // a namespace to override that in the file
RichSequenceIterator seqs = RichSequence.IOTools.readGenbankDNA(br,ns); // we are reading DNA sequences
RichSequence.IOTools.writeEMBLxml(System.out, seqs, ns); // write the whole lot in EMBLxml format to standard out</pre><p>If you don't know what format your input file is in, but know it could be one of a fixed set of acceptable formats, then you can use BioJavaX's format-guessing routine to attempt to read it:</p><pre class="programlisting">// Not sure if your input is EMBL or Genbank? Load them both here.
Class.forName("org.biojavax.bio.seq.io.EMBLFormat");
Class.forName("org.biojavax.bio.seq.io.GenbankFormat");
// Now let BioJavaX guess which format you actually should use (using the default namespace)
Namespace ns = RichObjectFactory.getDefaultNamespace();
RichSequenceIterator seqs = RichSequence.IOTools.readFile(new File("myfile.seq"),ns);</pre><p>For those who like to do things the hard way, reading and writing by directly using the <code class="code">RichStreamReader</code> and <code class="code">RichStreamWriter</code> interfaces is described below.</p><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e996"></a>1.1. Reading using <code class="code">RichStreamReader</code>.</h3></div></div></div><p>File reading is based around the concept of a <code class="code">RichStreamReader</code>. This object returns a <code class="code">RichSequenceIterator</code> which iterates over every sequence in the file on demand.</p><p>To construct a <code class="code">RichStreamReader</code>, you will need five things.</p><div class="orderedlist"><ol type="1"><li><p>a <code class="code">BufferedReader</code> instance which is connected to the file you wish to parse.</p></li><li><p>a <code class="code">RichSequenceFormat</code> instance which understands the format of the file (eg. <code class="code">FastaFormat</code>, <code class="code">GenbankFormat</code>, etc.)</p></li><li><p>a <code class="code">SymbolTokenization</code> which understands how to translate the sequence data in the file into a BioJava <code class="code">SymbolList</code>.</p></li><li><p>a <code class="code">RichSequenceBuilderFactory</code> instance which generates instances of <code class="code">RichSequenceBuilder</code>.</p></li><li><p>a <code class="code">Namespace</code> instance to associate the sequences with.</p></li></ol></div><p>The <code class="code">RichSequenceBuilderFactory</code> is best set to one of the predefined constants in the <code class="code">RichSequenceBuilderFactory</code> interface. These constants are defined as:</p><div class="table"><a name="d0e1066"></a><p class="title"><b>Table 8.1. 
<code class="code">RichSequenceBuilderFactory</code> predefined constants.</b></p><table summary="RichSequenceBuilderFactory predefined constants." border="1"><colgroup><col width="50%"><col width="50%"></colgroup><tbody><tr><td><code class="code">RichSequenceBuilderFactory.FACTORY</code></td><td>Does not attempt any compression on sequence data.</td></tr><tr><td><code class="code">RichSequenceBuilderFactory.PACKED</code></td><td>Will compress all sequence data using PackedSymbolLists.</td></tr><tr><td><code class="code">RichSequenceBuilderFactory.THRESHOLD</code></td><td>Will compress sequence data using a PackedSymbolList only when the sequence exceeds 5000 bases in length. Otherwise, data is not compressed.</td></tr></tbody></table></div><p>If you set the namespace to null, then the namespace used will depend on the format you are reading. For formats which specify namespaces, the namespace from the file will be used. For formats which do not specify namespaces, the default namespace provided by <code class="code">RichObjectFactory.getDefaultNamespace()</code> will be used.</p><p>The <code class="code">SymbolTokenization</code> should be obtained from the Alphabet that represents the sequence data you are expecting from the file. If you are reading DNA sequences, you should use <code class="code">DNATools.getDNA().getTokenization("token")</code>. Other alphabets with tools classes will have similar methods.</p><p>For an alphabet which does not have a tools class, you can do this:</p><pre class="programlisting">Alphabet a = ...; // get an alphabet instance from somewhere
SymbolTokenization st = a.getTokenization("token");</pre></div><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e1108"></a>1.2. Writing using <code class="code">RichStreamWriter</code>.</h3></div></div></div><p>File output is done using <code class="code">RichStreamWriter</code>. This requires:</p><div class="orderedlist"><ol type="1"><li><p>an <code class="code">OutputStream</code> to write sequences to.</p></li><li><p>a <code class="code">Namespace</code> to use for the sequences.</p></li><li><p>a <code class="code">RichSequenceIterator</code> that provides the sequences to write.</p></li></ol></div><p>The namespace should only be specified when the file format includes namespace information and you wish to override the information associated with the actual sequences. If you do not wish to do this, just set it to null, and the namespace from each individual sequence will be used instead.</p><p>The <code class="code">RichSequenceIterator</code> is an iterator over a set of sequences, exactly the same as the one returned by the <code class="code">RichStreamReader</code>. It is therefore possible to plug a <code class="code">RichStreamReader</code> directly into a <code class="code">RichStreamWriter</code> and convert data from one file format to another with no intermediate steps.</p><p>If you only have one sequence to write, you can wrap it in a temporary <code class="code">RichSequenceIterator</code> by using a call like this:</p><pre class="programlisting">RichSequence rs = ...; // get sequence from somewhere
RichSequenceIterator it = new SingleRichSeqIterator(rs); // wrap it in an iterator</pre></div><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e1161"></a>1.3. Example.</h3></div></div></div><p>The following is an example that will read some DNA sequences from a GenBank file and write them out to standard output (screen) as FASTA using the methods outlined above:</p><pre class="programlisting">SymbolTokenization dna = DNATools.getDNA().getTokenization("token"); // sequences will be DNA sequences
RichSequenceFormat genbank = new GenbankFormat(); // read Genbank
RichSequenceFormat fasta = new FastaFormat(); // write FASTA
RichSequenceBuilderFactory factory = RichSequenceBuilderFactory.THRESHOLD; // compress only longer sequences
Namespace bloggsNS = (Namespace)RichObjectFactory.getObject(
SimpleNamespace.class,
new Object[]{"bloggs"}
); // read/write everything using the 'bloggs' namespace
BufferedReader input = new BufferedReader(new FileReader("mygenbank.file"));// read seqs from "mygenbank.file"
OutputStream output = System.out; // write seqs to STDOUT
RichStreamReader seqsIn = new RichStreamReader(input,genbank,dna,factory,bloggsNS);
RichStreamWriter seqsOut = new RichStreamWriter(output,fasta);
seqsOut.writeStream(seqsIn,bloggsNS); // one-step Genbank to Fasta conversion!</pre></div><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e1168"></a>1.4. Line widths and eliding information.</h3></div></div></div><p>When working at this level, extra methods can be used when direct access to the <code class="code">RichSequenceFormat</code> object is available. These methods are:</p><div class="table"><a name="d0e1176"></a><p class="title"><b>Table 8.2. <code class="code">RichSequenceFormat</code> extra options.</b></p><table summary="RichSequenceFormat extra options." border="1"><colgroup><col width="50%"><col width="50%"></colgroup><tbody><tr><td><code class="code">get/setLineWidth()</code></td><td>Sets the line width for output. Any lines longer than this will be wrapped. The default for most formats is 80.</td></tr><tr><td><code class="code">get/setElideSymbols()</code></td><td>When set to true, this will skip the sequence data (ie. the addSymbols() method of the RichSeqIOListener will never be called).</td></tr><tr><td><code class="code">get/setElideFeatures()</code></td><td>When set to true, this will skip the feature tables in the file.</td></tr><tr><td><code class="code">get/setElideComments()</code></td><td>When set to true, this will skip all comments in the file.</td></tr><tr><td><code class="code">get/setElideReferences()</code></td><td>When set to true, this will skip all publication cross-references in the file.</td></tr></tbody></table></div><p>Finer control is available when you go even deeper and write your own <code class="code">RichSeqIOListener</code> objects. See later in this document for information on that subject.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e1218"></a>1.5. 
How parsed data becomes a sequence.</h3></div></div></div><p>All fields read from a file, regardless of the format, are passed to an instance of <code class="code">RichSequenceBuilder</code>. In the case of the tools provided in <code class="code">RichSequence.IOTools</code>, or any <code class="code">RichStreamReader</code> using one of the <code class="code">RichSequenceBuilderFactory</code> constants or <code class="code">SimpleRichSequenceBuilderFactory</code>, this is an instance of <code class="code">SimpleRichSequenceBuilder</code>.</p><p><code class="code">SimpleRichSequenceBuilder</code> constructs sequences as follows:</p><div class="table"><a name="d0e1245"></a><p class="title"><b>Table 8.3. <code class="code">SimpleRichSequenceBuilder</code> sequence construction.</b></p><table summary="SimpleRichSequenceBuilder sequence construction." border="1"><colgroup><col width="50%"><col width="50%"></colgroup><tbody><tr><td><code class="code">startSequence</code></td><td>Resets all the values in the builder to their defaults, ready to parse a whole new sequence.</td></tr><tr><td><code class="code">addSequenceProperty</code></td><td>Assumes that both the key and the value of the property are strings. It uses the key to look up a term with the same name (case-sensitive) in the ontology provided by <code class="code">RichObjectFactory.getDefaultOntology()</code>. If it finds no such term, it creates one. It then adds an annotation to the sequence with that term as the key, using the value provided. The first annotation receives the rank of 0, the second 1, and so on. The annotations are attached to the sequence using <code class="code">setNoteSet()</code> and the accumulated set of notes.</td></tr><tr><td><code class="code">setVersion</code></td><td>Only accepts a single call per sequence. 
Value is passed directly to the resulting sequence's <code class="code">setVersion</code> method.</td></tr><tr><td><code class="code">setURI</code></td><td><span class="emphasis"><em>Not implemented, throws an exception.</em></span></td></tr><tr><td><code class="code">setSeqVersion</code></td><td>Only accepts a single call per sequence. Value is parsed into a double and passed to the resulting sequence's <code class="code">setSeqVersion</code> method. If the value is null, then 0.0 is used.</td></tr><tr><td><code class="code">setAccession</code></td><td>Value is passed directly to the sequence's <code class="code">setAccession</code> method. Multiple calls will replace the accession, <span class="emphasis"><em>not</em></span> add extra ones. The accession cannot be null.</td></tr><tr><td><code class="code">setDescription</code></td><td>Only accepts a single call per sequence. Value is passed directly to the resulting sequence's <code class="code">setDescription</code> method.</td></tr><tr><td><code class="code">setDivision</code></td><td>Only accepts a single call per sequence. Value is passed directly to the resulting sequence's <code class="code">setDivision</code> method. The division cannot be null.</td></tr><tr><td><code class="code">setIdentifier</code></td><td>Only accepts a single call per sequence. Value is passed directly to the resulting sequence's <code class="code">setIdentifier</code> method.</td></tr><tr><td><code class="code">setName</code></td><td>Only accepts a single call per sequence. Value is passed directly to the resulting sequence's <code class="code">setName</code> method.</td></tr><tr><td><code class="code">setNamespace</code></td><td>Only accepts a single call per sequence. Value is passed directly to the resulting sequence's <code class="code">setNamespace</code> method. 
The namespace cannot be null.</td></tr><tr><td><code class="code">setComment</code></td><td>Adds the text supplied (which must not be null) as a comment to the sequence using <code class="code">addComment()</code>. Multiple calls will result in multiple comments being added. The first comment is ranked 1, the second comment ranked 2, and so on.</td></tr><tr><td><code class="code">setTaxon</code></td><td>Value is passed to the sequence's <code class="code">setTaxon</code> method. It must not be null. If this method is called repeatedly, only the first call will be accepted. Subsequent calls will result in warnings being printed to standard error. These extra calls will not cause the builder to fail. The value from the initial call will be the one that is used.</td></tr><tr><td><code class="code">startFeature</code></td><td>Tells the builder to start a new feature on this sequence. If the current feature has not yet been ended, then this feature will be a sub-feature of the current feature and associated with it via a <code class="code">RichFeatureRelationship</code>, where the current feature is the parent and this new feature is the child. The relationship will be defined with the term "contains" from <code class="code">RichObjectFactory.getDefaultOntology()</code>. Each feature will be attached to the resulting sequence by calling <code class="code">setParent()</code> on the feature once the sequence has been created.</td></tr><tr><td><code class="code">getCurrentFeature</code></td><td>Returns the current feature, if one has been started. If there is no current feature (e.g. it has already ended, or one was never started) then an exception is thrown.</td></tr><tr><td><code class="code">addFeatureProperty</code></td><td>Assumes that both the key and the value of the property are strings. It uses the key to look up a term with the same name (case-sensitive) in the ontology provided by <code class="code">RichObjectFactory.getDefaultOntology()</code>. 
If it finds no such term, it creates one. It then adds an annotation to the current feature with that term as the key, using the value provided. The first annotation receives the rank of 0, the second 1, and so on. The annotations are attached to the feature using <code class="code">getAnnotation().addNote()</code>.</td></tr><tr><td><code class="code">endFeature</code></td><td>Ends the current feature. If there is no current feature, an exception is thrown.</td></tr><tr><td><code class="code">setRankedDocRef</code></td><td>Adds the given <code class="code">RankedDocRef</code> to the set of publication cross-references which the sequence being built refers to. The value cannot be null. If the same value is provided multiple times, it will only be saved once. Each value is stored by calling <code class="code">addRankedDocRef()</code> on the resulting sequence.</td></tr><tr><td><code class="code">setRankedCrossRef</code></td><td>Adds the given <code class="code">RankedCrossRef</code> to the set of database cross-references which the sequence being built refers to. The value cannot be null. If the same value is provided multiple times, it will only be saved once. Each value is stored by calling <code class="code">addRankedCrossRef()</code> on the resulting sequence.</td></tr><tr><td><code class="code">setRelationship</code></td><td>Adds the given <code class="code">BioEntryRelationship</code> to the set of relationships in which the sequence being built is the parent. The relationship cannot be null. If the same relationship is provided multiple times, it will only be saved once. Each relationship is stored by calling <code class="code">addRelationship()</code> on the resulting sequence.</td></tr><tr><td><code class="code">setCircular</code></td><td>You can call this as many times as you like. Each call will override the value provided by the previous call. 
The value is passed to the sequence's <code class="code">setCircular</code> method.</td></tr><tr><td><code class="code">addSymbols</code></td><td>Adds symbols to this sequence. You can call it multiple times to set symbols at different locations in the sequence. If any of the symbols found are not in the alphabet accepted by this builder, or if the locations provided to place the symbols at are unacceptable, an exception is thrown. The resulting <code class="code">SymbolList</code> will be the basis upon which the final <code class="code">RichSequence</code> object is built.</td></tr><tr><td><code class="code">endSequence</code></td><td>Tells the builder that we have provided all the information we know. If at this point the name, namespace, or accession have not been provided, or if any of them are null, an exception is thrown.</td></tr><tr><td><code class="code">makeSequence</code></td><td>Constructs a <code class="code">RichSequence</code> object from the information provided, following the rules laid out in this table, and returns it. The <code class="code">RichSequence</code> object does not actually exist until this method has been called.</td></tr><tr><td><code class="code">makeRichSequence</code></td><td>Wrapper for <code class="code">makeSequence</code>.</td></tr></tbody></table></div><p>If you want fine-grained control over every aspect of a file whilst it is being parsed, you must write your own implementation of the <code class="code">RichSeqIOListener</code> interface (which <code class="code">RichSequenceBuilder</code> extends). This is detailed later in this document.</p></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e1501"></a>2. 
FASTA.</h2></div></div></div><p><code class="code">FastaFormat</code> reads and writes FASTA files, and is able to parse the description line in detail.</p><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e1508"></a>2.1. Reading.</h3></div></div></div><p>The description line formats understood are as follows:</p><pre class="programlisting">&gt;gi|&lt;identifier&gt;|&lt;namespace&gt;|&lt;accession&gt;.&lt;version&gt;|&lt;name&gt; &lt;description&gt;
&gt;gi|&lt;identifier&gt;|&lt;namespace&gt;|&lt;accession&gt;|&lt;name&gt; &lt;description&gt;
&gt;&lt;namespace&gt;|&lt;accession&gt;.&lt;version&gt;|&lt;name&gt; &lt;description&gt;
&gt;&lt;namespace&gt;|&lt;accession&gt;|&lt;name&gt; &lt;description&gt;
&gt;&lt;name&gt; &lt;description&gt;</pre><p>The description is optional in all cases. The version defaults to 0 if not provided.</p><p>If a non-null <code class="code">Namespace</code> is provided, then the namespace in the file is ignored.</p><p>If a null <code class="code">Namespace</code> is provided, then the namespace from the file is used. If no namespace is specified in the file, then <code class="code">RichObjectFactory.getDefaultNamespace()</code> is used.</p><p>The fields are passed into the <code class="code">RichSeqIOListener</code> as follows:</p><div class="table"><a name="d0e1535"></a><p class="title"><b>Table 8.4. <code class="code">FastaFormat</code> input field destinations.</b></p><table summary="FastaFormat input field destinations." border="1"><colgroup><col width="50%"><col width="50%"></colgroup><tbody><tr><td>identifier</td><td><code class="code">setIdentifier()</code></td></tr><tr><td>namespace</td><td><code class="code">setNamespace()</code></td></tr><tr><td>accession</td><td><code class="code">setAccession()</code></td></tr><tr><td>version</td><td><code class="code">setVersion()</code></td></tr><tr><td>name</td><td><code class="code">setName()</code></td></tr><tr><td>description</td><td><code class="code">setDescription()</code></td></tr><tr><td>&lt;sequence data&gt;</td><td><code class="code">addSymbols()</code></td></tr></tbody></table></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e1584"></a>2.2. Writing.</h3></div></div></div><p>Description lines are always output in one of two forms:</p><pre class="programlisting">&gt;gi|&lt;identifier&gt;|&lt;namespace&gt;|&lt;accession&gt;.&lt;version&gt;|&lt;name&gt; &lt;description&gt;
&gt;&lt;namespace&gt;|&lt;accession&gt;.&lt;version&gt;|&lt;name&gt; &lt;description&gt;</pre><p>The first form is used if the identifier of the sequence object is not null, otherwise the second form is used. In both cases, the description is only output if it is not null.</p><p>The fields are read from the <code class="code">RichSequence</code> object as follows:</p><div class="table"><a name="d0e1598"></a><p class="title"><b>Table 8.5. <code class="code">FastaFormat</code> output field sources.</b></p><table summary="FastaFormat output field sources." border="1"><colgroup><col width="50%"><col width="50%"></colgroup><tbody><tr><td>identifier</td><td><code class="code">getIdentifier()</code></td></tr><tr><td>namespace</td><td><code class="code">getNamespace()</code></td></tr><tr><td>accession</td><td><code class="code">getAccession()</code></td></tr><tr><td>version</td><td><code class="code">getVersion()</code></td></tr><tr><td>name</td><td><code class="code">getName()</code></td></tr><tr><td>description</td><td><code class="code">getDescription()</code></td></tr><tr><td>&lt;sequence data&gt;</td><td>Sequence is read directly as it is a <code class="code">SymbolList</code>.</td></tr></tbody></table></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e1649"></a>3. GenBank.</h2></div></div></div><p><code class="code">GenbankFormat</code> reads and writes GenBank files, and understands almost all permutations of the location descriptors found in the feature tables.</p><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e1656"></a>3.1. Reading.</h3></div></div></div><p>The fields are passed into the <code class="code">RichSeqIOListener</code> as follows:</p><div class="table"><a name="d0e1664"></a><p class="title"><b>Table 8.6. <code class="code">GenBankFormat</code> input field destinations.</b></p><table summary="GenBankFormat input field destinations." 
border="1"><colgroup><col width="50%"><col width="50%"></colgroup><tbody><tr><td>LOCUS</td><td><code class="code">setName()</code>, <code class="code">addSequenceProperty(Terms.getStrandedTerm())</code>, <code class="code">setCircular()</code>, <code class="code">addSequenceProperty(Terms.getMolTypeTerm())</code>, <code class="code">addSequenceProperty(Terms.getDateUpdatedTerm())</code>, and <code class="code">setDivision()</code>.</td></tr><tr><td>DEFINITION</td><td><code class="code">setDescription()</code></td></tr><tr><td>ACCESSION</td><td>The first one is passed to <code class="code">setAccession()</code>. Subsequent entries are passed to <code class="code">addSequenceProperty(Terms.getAdditionalAccessionTerm())</code>.</td></tr><tr><td>VERSION</td><td>The section before the full stop "." is passed to <code class="code">setAccession()</code>. If it differs from the first accession on the ACCESSION line, then the first accession on the ACCESSION line becomes an additional accession, whilst the accession from the VERSION line becomes the primary accession. The section after the full stop is passed to <code class="code">setVersion()</code>. The GI number is passed to <code class="code">setIdentifier()</code>.</td></tr><tr><td>KEYWORDS</td><td>The line is split up into individual keywords, each of which is passed to <code class="code">addSequenceProperty(Terms.getKeywordTerm())</code>.</td></tr><tr><td>SOURCE</td><td>Ignored.</td></tr><tr><td>ORGANISM</td><td>Ignored.</td></tr><tr><td>REFERENCE</td><td>The coordinates of the reference end up as start and end coordinates of a <code class="code">SimpleRankedDocRef</code> object which is attached to the sequence by calling <code class="code">setRankedDocRef()</code>.</td></tr><tr><td>AUTHORS</td><td>The value is parsed into a set of <code class="code">DocRefAuthor</code> objects using <code class="code">DocRefAuthor.Tools</code>. 
The resulting set becomes part of the <code class="code">DocRef</code> object which is wrapped using a <code class="code">SimpleRankedDocRef</code> and attached to the sequence.</td></tr><tr><td>TITLE</td><td>The title is passed to the current <code class="code">DocRef</code> object using <code class="code">setTitle()</code>.</td></tr><tr><td>JOURNAL</td><td>The journal is passed to the current <code class="code">DocRef </code>object using <code class="code">setLocation()</code>.</td></tr><tr><td>PUBMED</td><td>A <code class="code">RankedCrossRef</code> object is created pointing to <code class="code">Terms.PUBMED_KEY</code> as the database, and using this value as the accession with a version of 0. It is attached to the sequence using <code class="code">setRankedCrossRef()</code>. If no MEDLINE line is found, this is also associated with the current reference by using <code class="code">setCrossRef()</code> on the <code class="code">DocRef</code> object.</td></tr><tr><td>MEDLINE</td><td>Behaves similarly to PUBMED, but with a database name of <code class="code">Terms.MEDLINE_KEY</code>. It takes precedence over PUBMED and will always be used for the <code class="code">DocRef</code> cross-reference.</td></tr><tr><td>REMARK</td><td>Added to the current reference by calling <code class="code">setRemark()</code> on the <code class="code">DocRef</code> object.</td></tr><tr><td>COMMENT</td><td><code class="code">setComment()</code></td></tr><tr><td>FEATURES</td><td>Each feature is started by calling <code class="code">startFeature()</code>. The source is <code class="code">Terms.getGenBankTerm()</code> whereas the type is obtained from <code class="code">RichObjectFactory.getDefaultOntology().getOrCreateTerm()</code> using the feature name. Qualifiers are added by using <code class="code">addFeatureProperty()</code> with the term key created by <code class="code">RichObjectFactory.getDefaultOntology().getOrCreateTerm()</code> using the qualifier name. 
There are two special cases of qualifier: <code class="code">db_xref</code>, and <code class="code">organism</code>. Neither end up being stored as qualifiers. A database cross-reference is created for <code class="code">db_xref</code> qualifiers and added to the feature using <code class="code">addRankedCrossRef()</code>, except when the feature type is <code class="code">source</code> and the database name (before the colon) is <code class="code">taxon</code>, in which case the taxon ID is used in conjunction with the <code class="code">organism</code> qualifier to determine the <code class="code">NCBITaxon</code> for this sequence, and passed to the sequence using <code class="code">setTaxon()</code>. Location strings are run through <code class="code">GenBankLocationParser</code> to generate <code class="code">RichLocation</code> instances to attach to the feature.</td></tr><tr><td>BASE</td><td>Ignored.</td></tr><tr><td>ORIGIN</td><td>The sequence is read and passed to <code class="code">addSymbols()</code>.</td></tr></tbody></table></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e1906"></a>3.2. Writing.</h3></div></div></div><p>The fields are read from the <code class="code">RichSequence</code> object as follows:</p><div class="table"><a name="d0e1914"></a><p class="title"><b>Table 8.7. <code class="code">GenBankFormat</code> output field sources.</b></p><table summary="GenBankFormat output field sources." 
border="1"><colgroup><col width="50%"><col width="50%"></colgroup><tbody><tr><td>LOCUS</td><td><code class="code">getName()</code>, <code class="code">length()</code>, <code class="code">getNoteSet(Terms.getStrandedTerm())</code>, <code class="code">getNoteSet(Terms.getMolTypeTerm())</code>, <code class="code">getCircular()</code>, <code class="code">getDivision()</code>, and <code class="code">getNoteSet(Terms.getDateUpdatedTerm())</code></td></tr><tr><td>DEFINITION</td><td><code class="code">getDescription()</code></td></tr><tr><td>ACCESSION</td><td><code class="code">getAccession()</code>, and <code class="code">getNoteSet(Terms.getAdditionalAccessionTerm())</code>.</td></tr><tr><td>VERSION</td><td><code class="code">getAccession()</code>, <code class="code">getIdentifier()</code> and <code class="code">getVersion()</code></td></tr><tr><td>KEYWORDS</td><td><code class="code">getNoteSet(Terms.getKeywordTerm())</code>.</td></tr><tr><td>SOURCE</td><td><code class="code">getNCBITaxon().getDisplayName()</code></td></tr><tr><td>ORGANISM</td><td><code class="code">getNCBITaxon().getDisplayName()</code>, chopped before the first bracket, and <code class="code">getNCBITaxon().getNameHierarchy()</code></td></tr><tr><td>REFERENCE</td><td>Each reference is obtained from <code class="code">getRankedDocRefs()</code>. 
The coordinates of the reference are from the reference's <code class="code">getStart()</code> and <code class="code">getEnd()</code> methods.</td></tr><tr><td>AUTHORS</td><td>The author string is from the reference's <code class="code">getAuthors()</code> method.</td></tr><tr><td>TITLE</td><td>The title is from the reference's <code class="code">getTitle()</code>.</td></tr><tr><td>JOURNAL</td><td>The journal information is from the reference's <code class="code">getLocation()</code>.</td></tr><tr><td>PUBMED / MEDLINE</td><td>The cross reference returned by <code class="code">getCrossRef()</code> on the reference provides the database name and accession used here.</td></tr><tr><td>REMARK</td><td><code class="code">getRemark()</code> on the current reference object.</td></tr><tr><td>COMMENT</td><td>All the comments returned by <code class="code">getComments()</code> are joined together, separated by newlines.</td></tr><tr><td>FEATURES</td><td>Each feature is output in turn by iterating through <code class="code">getFeatureSet()</code>. For the <code class="code">source </code>feature, the <code class="code">db_xref</code> and <code class="code">organism</code> fields are added to the output by calling <code class="code">getNCBITaxon().getNCBITaxID()</code> and <code class="code">getNCBITaxon().getDisplayName()</code> on the sequence (the latter is chopped before the first bracket if necessary). For all features, extra <code class="code">db_xref</code> qualifiers are output for each cross-reference returned by calling <code class="code">getRankedCrossRefs()</code> on the feature. The other qualifiers for the features are the contents of the feature's annotation, provided by <code class="code">getNoteSet()</code> on the feature. 
<code class="code">GenBankLocationParser</code> is used to convert the feature's <code class="code">getLocation()</code> output into the correct text format.</td></tr><tr><td>BASE</td><td>Calculated from the sequence data.</td></tr><tr><td>ORIGIN</td><td>The sequence is read directly as it is a <code class="code">SymbolList</code>.</td></tr></tbody></table></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e2107"></a>4. EMBL.</h2></div></div></div><p><code class="code">EMBLFormat</code> reads and writes EMBL files, and understands almost all permutations of the location descriptors found in the feature tables.</p><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e2114"></a>4.1. Reading.</h3></div></div></div><p>The fields are passed into the <code class="code">RichSeqIOListener</code> as follows:</p><div class="table"><a name="d0e2122"></a><p class="title"><b>Table 8.8. <code class="code">EMBLFormat</code> input field destinations.</b></p><table summary="EMBLFormat input field destinations." border="1"><colgroup><col width="50%"><col width="50%"></colgroup><tbody><tr><td>ID</td><td><code class="code">setName()</code>, <code class="code">addSequenceProperty(Terms.getMolTypeTerm())</code>, <code class="code">setDivision()</code>, <code class="code">setCircular()</code></td></tr><tr><td>AC</td><td>First accession goes to <code class="code">setAccession()</code>, all others to <code class="code">addSequenceProperty(Terms.getAdditionalAccessionTerm())</code>.</td></tr><tr><td>SV</td><td>If the accession (before the full stop ".") is different from the first accession on the AC line, then this accession becomes the primary accession, and the first accession on the AC line becomes an additional accession. 
Everything after the full stop goes to <code class="code">setVersion()</code>.</td></tr><tr><td>DE</td><td><code class="code">setDescription()</code></td></tr><tr><td>DT</td><td>For creation date: <code class="code">addSequenceProperty(Terms.getDateCreatedTerm())</code> and <code class="code">addSequenceProperty(Terms.getRelCreatedTerm())</code>. For last updated date: <code class="code">addSequenceProperty(Terms.getDateUpdatedTerm())</code> and <code class="code">addSequenceProperty(Terms.getRelUpdatedTerm())</code>.</td></tr><tr><td>DR</td><td>Each record is split into a database name, primary accession, and additional accessions. A <code class="code">CrossRef</code> object is constructed from these first two pieces, and annotated with additional accessions using <code class="code">Terms.getAdditionalAccessionTerm()</code>. The whole thing is then given a rank and sent to <code class="code">setRankedCrossRef()</code>.</td></tr><tr><td>OS</td><td>Ignored.</td></tr><tr><td>OC</td><td>Ignored.</td></tr><tr><td>OG</td><td><code class="code">addSequenceProperty(Terms.getOrganelleTerm())</code></td></tr><tr><td>RN</td><td>The number of the reference becomes the rank of the <code class="code">RankedDocRef</code> object later.</td></tr><tr><td>RP</td><td>The values on this line become the start and end of the <code class="code">RankedDocRef</code> object later.</td></tr><tr><td>RX</td><td>Each of these is parsed and the database name and primary accession are used to construct a <code class="code">CrossRef</code> object. All <code class="code">CrossRef</code> objects are ranked and added to the sequence using <code class="code">setRankedCrossRef()</code>, and one of them will be added to the current reference using <code class="code">setCrossRef()</code>. 
The one that is chosen will be MEDLINE, or PUBMED if not present, or DOI if PUBMED not present either.</td></tr><tr><td>RA</td><td>Parsed using <code class="code">DocRefAuthor.Tools.parse()</code> and becomes the set of authors for the <code class="code">DocRef</code> object.</td></tr><tr><td>RG</td><td>Parsed using <code class="code">DocRefAuthor.Tools.parse()</code>, and each consortium is flagged using the <code class="code">setConsortium()</code> method before being added to the set of authors for the <code class="code">DocRef</code> object.</td></tr><tr><td>RT</td><td>The title for <code class="code">setTitle()</code> on the <code class="code">DocRef</code> object.</td></tr><tr><td>RL</td><td>The location for the <code class="code">setLocation()</code> method on the <code class="code">DocRef</code> object.</td></tr><tr><td>RC</td><td>Used for <code class="code">setRemark()</code> on the <code class="code">DocRef</code> object.</td></tr><tr><td>KW</td><td>Each keyword is sent individually to <code class="code">addSequenceProperty(Terms.getKeywordTerm())</code></td></tr><tr><td>CC</td><td><code class="code">setComment()</code></td></tr><tr><td>FH</td><td>Ignored.</td></tr><tr><td>FT</td><td>As per the GenBankFormat - please see the section on GenBank parsing.</td></tr><tr><td>CO</td><td>Causes an exception as contigs are not supported.</td></tr><tr><td>AH</td><td>Causes an exception as TPAs are not supported.</td></tr><tr><td>SQ</td><td>Sequence data is passed to <code class="code">addSymbols()</code>.</td></tr></tbody></table></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e2348"></a>4.2. Writing.</h3></div></div></div><p>The fields are read from the <code class="code">RichSequence</code> object as follows:</p><div class="table"><a name="d0e2356"></a><p class="title"><b>Table 8.9. <code class="code">EMBLFormat</code> output field sources.</b></p><table summary="EMBLFormat output field sources." 
border="1"><colgroup><col width="50%"><col width="50%"></colgroup><tbody><tr><td>ID</td><td><code class="code">getName()</code>, <code class="code">getNoteSet(Terms.getMolTypeTerm())</code>, <code class="code">getDivision()</code>, <code class="code">getCircular()</code></td></tr><tr><td>AC</td><td><code class="code">getAccession()</code>, and <code class="code">getNoteSet(Terms.getAdditionalAccessionTerm())</code>.</td></tr><tr><td>SV</td><td><code class="code">getAccession()</code> and <code class="code">getVersion()</code>.</td></tr><tr><td>DE</td><td><code class="code">getDescription()</code></td></tr><tr><td>DT</td><td>For creation date: <code class="code">getNoteSet(Terms.getDateCreatedTerm())</code> and <code class="code">getNoteSet(Terms.getRelCreatedTerm())</code>. For last updated date: <code class="code">getNoteSet(Terms.getDateUpdatedTerm())</code> and <code class="code">getNoteSet(Terms.getRelUpdatedTerm())</code>. If date created is null, then the update date is duplicated and used here as well.</td></tr><tr><td>DR</td><td><code class="code">getRankedCrossRef()</code>, using <code class="code">getNoteSet(Terms.getAdditionalAccessionTerm())</code> to generate additional accessions.</td></tr><tr><td>OS</td><td><code class="code">getNCBITaxon().getDisplayName()</code></td></tr><tr><td>OC</td><td><code class="code">getNCBITaxon().getDisplayName()</code>, chopped before the first bracket, and <code class="code">getNCBITaxon().getNameHierarchy()</code>.</td></tr><tr><td>OG</td><td><code class="code">getNoteSet(Terms.getOrganelleTerm())</code></td></tr><tr><td>RN</td><td>Each reference returned by <code class="code">getRankedDocRefs()</code> is iterated over. 
The rank of the <code class="code">RankedDocRef</code> object is output here.</td></tr><tr><td>RP</td><td>The start and end coordinates of the <code class="code">RankedDocRef</code> object.</td></tr><tr><td>RX</td><td>The <code class="code">getCrossRef()</code> output from the <code class="code">DocRef</code> object.</td></tr><tr><td>RA</td><td>The <code class="code">getAuthors()</code> output from the <code class="code">DocRef</code> object, with the consortiums removed.</td></tr><tr><td>RG</td><td>The <code class="code">getAuthors()</code> output from the <code class="code">DocRef</code> object, with all except consortiums removed.</td></tr><tr><td>RT</td><td>The <code class="code">getTitle()</code> from the <code class="code">DocRef</code>.</td></tr><tr><td>RL</td><td>The <code class="code">getLocation()</code> from the <code class="code">DocRef</code>.</td></tr><tr><td>RC</td><td>The <code class="code">getRemark()</code> from the <code class="code">DocRef</code>.</td></tr><tr><td>KW</td><td><code class="code">getNoteSet(Terms.getKeywordTerm())</code>.</td></tr><tr><td>CC</td><td>One comment section per entry in <code class="code">getComments()</code>.</td></tr><tr><td>FH</td><td>No fields necessary here.</td></tr><tr><td>FT</td><td>As per the GenBankFormat - please see the section on GenBank parsing.</td></tr><tr><td>CO</td><td>Never generated.</td></tr><tr><td>AH</td><td>Never generated.</td></tr><tr><td>SQ</td><td>Sequence counts are generated, then sequence is read directly as it is a <code class="code">SymbolList</code>.</td></tr></tbody></table></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e2581"></a>5. UniProt.</h2></div></div></div><p><code class="code">UniProtFormat</code> reads and writes UniProt files.</p><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e2588"></a>5.1. 
Reading.</h3></div></div></div><p>The fields are passed into the <code class="code">RichSeqIOListener</code> as follows:</p><div class="table"><a name="d0e2596"></a><p class="title"><b>Table 8.10. <code class="code">UniProtFormat</code> input field destinations.</b></p><table summary="UniProtFormat input field destinations." border="1"><colgroup><col width="50%"><col width="50%"></colgroup><tbody><tr><td>ID</td><td><code class="code">setName()</code>, <code class="code">addSequenceProperty(Terms.getMolTypeTerm())</code>, <code class="code">addSequenceProperty(Terms.getDataClassTerm())</code>, <code class="code">setDivision()</code></td></tr><tr><td>AC</td><td>First accession goes to <code class="code">setAccession()</code>, all others to <code class="code">addSequenceProperty(Terms.getAdditionalAccessionTerm())</code>.</td></tr><tr><td>DE</td><td><code class="code">setDescription()</code></td></tr><tr><td>DT</td><td>For creation date: <code class="code">addSequenceProperty(Terms.getDateCreatedTerm())</code> and <code class="code">addSequenceProperty(Terms.getRelCreatedTerm())</code>. For last sequence updated date: <code class="code">addSequenceProperty(Terms.getDateUpdatedTerm())</code> and <code class="code">addSequenceProperty(Terms.getRelUpdatedTerm())</code>. For last annotation updated date: <code class="code">addSequenceProperty(Terms.getDateAnnotatedTerm())</code> and <code class="code">addSequenceProperty(Terms.getRelAnnotatedTerm())</code>.</td></tr><tr><td>DR</td><td>Each record is split into a database name, primary accession, and additional accessions. A <code class="code">CrossRef</code> object is constructed from these first two pieces, and annotated with additional accessions using <code class="code">Terms.getAdditionalAccessionTerm()</code>. 
The whole thing is then given a rank and sent to <code class="code">setRankedCrossRef()</code>.</td></tr><tr><td>OS</td><td>First named species is used as the scientific name to construct an <code class="code">NCBITaxon</code> object, along with the tax ID from the OX line, and passed to <code class="code">setTaxon()</code>. The second name, if present, is the common name. Subsequent names are synonyms.</td></tr><tr><td>OC</td><td>Ignored.</td></tr><tr><td>OX</td><td>See details for the OS line.</td></tr><tr><td>OG</td><td><code class="code">addSequenceProperty(Terms.getOrganelleTerm())</code></td></tr><tr><td>GN</td><td>Gene names are passed to <code class="code">addSequenceProperty(Terms.getGeneNameTerm())</code>. Gene synonyms are passed to <code class="code">addSequenceProperty(Terms.getGeneSynonymTerm())</code>. Ordered locus names are passed to <code class="code">addSequenceProperty(Terms.getOrderedLocusNameTerm())</code>. ORF names are passed to <code class="code">addSequenceProperty(Terms.getORFNameTerm())</code>. The values have a number and a colon prefixed, where the number refers to the sequence order of the current gene.</td></tr><tr><td>RN</td><td>The number of the reference becomes the rank of the <code class="code">RankedDocRef</code> object later.</td></tr><tr><td>RP</td><td>The whole value is passed to <code class="code">setRemark()</code>. If it contains the words 'SEQUENCE OF', then the sequence position is parsed out and becomes the start and end of the <code class="code">RankedDocRef</code> object later.</td></tr><tr><td>RX</td><td>Each of these is parsed and the database name and primary accession are used to construct a <code class="code">CrossRef</code> object. All <code class="code">CrossRef</code> objects are ranked and added to the sequence using <code class="code">setRankedCrossRef()</code>, and one of them will be added to the current reference using <code class="code">setCrossRef()</code>. 
The one that is chosen will be MEDLINE, or PUBMED if not present, or DOI if PUBMED not present either.</td></tr><tr><td>RA</td><td>Parsed using <code class="code">DocRefAuthor.Tools.parse()</code> and becomes the set of authors for the <code class="code">DocRef</code> object.</td></tr><tr><td>RG</td><td>Parsed using <code class="code">DocRefAuthor.Tools.parse()</code>, and each consortium is flagged using the <code class="code">setConsortium()</code> method before being added to the set of authors for the <code class="code">DocRef</code> object.</td></tr><tr><td>RT</td><td>The title for <code class="code">setTitle()</code> on the <code class="code">DocRef</code> object.</td></tr><tr><td>RL</td><td>The location for the <code class="code">setLocation()</code> method on the <code class="code">DocRef</code> object.</td></tr><tr><td>RC</td><td>Comments are key-value pairs. Species comments are passed to <code class="code">addSequenceProperty(Terms.getSpeciesTerm())</code>. Strain comments are passed to <code class="code">addSequenceProperty(Terms.getStrainTerm())</code>. Tissue comments are passed to <code class="code">addSequenceProperty(Terms.getTissueTerm())</code>. Transposon comments are passed to <code class="code">addSequenceProperty(Terms.getTransposonTerm())</code>. Plasmid comments are passed to <code class="code">addSequenceProperty(Terms.getPlasmidTerm())</code>. The values have a number and a colon prefixed, where the number refers to the rank of the current <code class="code">RankedDocRef</code>.</td></tr><tr><td>KW</td><td>Each keyword is sent individually to <code class="code">addSequenceProperty(Terms.getKeywordTerm())</code></td></tr><tr><td>CC</td><td>If the comment is parseable using <code class="code">UniProtCommentParser</code> then the value is passed to <code class="code">setComment()</code>. 
Otherwise, it is assumed to be the copyright message that comes with UniProt records, and is passed to <code class="code">addSequenceProperty(Terms.getCopyrightTerm())</code>.</td></tr><tr><td>FT</td><td>Each feature encountered triggers a call to <code class="code">startFeature()</code>, and calls <code class="code">endFeature()</code> on completion. The location is parsed out using <code class="code">UniProtLocationParser</code>. The source term is <code class="code">Terms.getUniProtTerm()</code>, whereas the type term is a term from <code class="code">RichObjectFactory.getDefaultOntology().getOrCreateTerm()</code> equivalent to the name of the feature. The feature description is stored using <code class="code">addFeatureProperty(Terms.getFeatureDescTerm())</code>. Subsequent lines beginning with '/' are added as qualifiers. The only qualifier with a predefined term is 'FTId', which is represented by <code class="code">Terms.getFTIdTerm()</code>. All others encountered have terms generated from <code class="code">RichObjectFactory.getDefaultOntology().getOrCreateTerm()</code> with names equivalent to the name of the qualifier. Qualifiers are added using <code class="code">addFeatureProperty()</code>. UniProt uses its own unique set of feature names. No attempt is made to translate other feature names to/from this set.</td></tr><tr><td>SQ</td><td>Sequence data is passed to <code class="code">addSymbols()</code>.</td></tr></tbody></table></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e2883"></a>5.2. Writing.</h3></div></div></div><p>The fields are read from the <code class="code">RichSequence</code> object as follows:</p><div class="table"><a name="d0e2891"></a><p class="title"><b>Table 8.11. <code class="code">UniProtFormat</code> output field sources.</b></p><table summary="UniProtFormat output field sources." 
border="1"><colgroup><col width="50%"><col width="50%"></colgroup><tbody><tr><td>ID</td><td><code class="code">getName()</code>, <code class="code">getNoteSet(Terms.getMolTypeTerm())</code>, <code class="code">getNoteSet(Terms.getDataClassTerm())</code>, <code class="code">getDivision()</code></td></tr><tr><td>AC</td><td><code class="code">getAccession()</code>, and <code class="code">getNoteSet(Terms.getAdditionalAccessionTerm())</code>.</td></tr><tr><td>DE</td><td><code class="code">getDescription()</code></td></tr><tr><td>DT</td><td>For creation date: <code class="code">getNoteSet(Terms.getDateCreatedTerm())</code> and <code class="code">getNoteSet(Terms.getRelCreatedTerm())</code>. For last updated date: <code class="code">getNoteSet(Terms.getDateUpdatedTerm())</code> and <code class="code">getNoteSet(Terms.getRelUpdatedTerm())</code>. For last annotation date: <code class="code">getNoteSet(Terms.getDateAnnotatedTerm())</code> and <code class="code">getNoteSet(Terms.getRelAnnotatedTerm())</code>. If date created or date annotated is null, then the update date is duplicated and used here as well.</td></tr><tr><td>DR</td><td><code class="code">getRankedCrossRef()</code>, using <code class="code">getNoteSet(Terms.getAdditionalAccessionTerm())</code> to generate additional accessions.</td></tr><tr><td>OS</td><td><code class="code">getNCBITaxon().getDisplayName()</code> followed by all synonyms from <code class="code">getNames(NCBITaxon.SYNONYM)</code> in brackets.</td></tr><tr><td>OC</td><td><code class="code">getNCBITaxon().getNameHierarchy()</code>.</td></tr><tr><td>OG</td><td><code class="code">getNoteSet(Terms.getOrganelleTerm())</code></td></tr><tr><td>OX</td><td><code class="code">getNCBITaxon().getNCBITaxID()</code></td></tr><tr><td>GN</td><td>Gene names are written from <code class="code">getNoteSet(Terms.getGeneNameTerm())</code>. Gene synonyms are written from <code class="code">getNoteSet(Terms.getGeneSynonymTerm())</code>. 
Ordered locus names are written from <code class="code">getNoteSet(Terms.getOrderedLocusNameTerm())</code>. ORF names are written from <code class="code">getNoteSet(Terms.getORFNameTerm())</code>. As the values have a number and a colon prefixed, where the number refers to the sequence order of the current gene, these values are used to keep the correct names grouped together. This prefix is not included in the output.</td></tr><tr><td>RN</td><td>Each reference returned by <code class="code">getRankedDocRefs()</code> is iterated over. The rank of the <code class="code">RankedDocRef</code> object is output here.</td></tr><tr><td>RP</td><td>The <code class="code">getRemark()</code> from the <code class="code">DocRef</code>.</td></tr><tr><td>RX</td><td>The <code class="code">getCrossRef()</code> output from the <code class="code">DocRef</code> object.</td></tr><tr><td>RA</td><td>The <code class="code">getAuthors()</code> output from the <code class="code">DocRef</code> object, with the consortiums removed.</td></tr><tr><td>RG</td><td>The <code class="code">getAuthors()</code> output from the <code class="code">DocRef</code> object, with all except consortiums removed.</td></tr><tr><td>RT</td><td>The <code class="code">getTitle()</code> from the <code class="code">DocRef</code>.</td></tr><tr><td>RL</td><td>The <code class="code">getLocation()</code> from the <code class="code">DocRef</code>.</td></tr><tr><td>RC</td><td>Comments are key-value pairs. Species comments are from <code class="code">getNoteSet(Terms.getSpeciesTerm())</code>. Strain comments are from <code class="code">getNoteSet(Terms.getStrainTerm())</code>. Tissue comments are from <code class="code">getNoteSet(Terms.getTissueTerm())</code>. Transposon comments are from <code class="code">getNoteSet(Terms.getTransposonTerm())</code>. Plasmid comments are from <code class="code">getNoteSet(Terms.getPlasmidTerm())</code>. 
As the values have a number and a colon prefixed, where the number refers to the rank of the current <code class="code">RankedDocRef</code>, this is used to match the appropriate comments with each reference. This prefix is not included in the output.</td></tr><tr><td>KW</td><td><code class="code">getNoteSet(Terms.getKeywordTerm())</code>.</td></tr><tr><td>CC</td><td>One comment section per entry in <code class="code">getComments()</code>.</td></tr><tr><td>FT</td><td>Each feature is written out using <code class="code">UniProtLocationParser</code> to construct the location string from the feature's <code class="code">getLocation()</code> output, with the feature name being the <code class="code">getType()</code> of the feature and the description being <code class="code">getNoteSet(Terms.getFeatureDescTerm())</code> on the feature. The FTId, if present in the feature from <code class="code">getNoteSet(Terms.getFTIdTerm())</code>, is written out underneath. No other qualifiers are written out. UniProt uses its own unique set of feature names. No attempt is made to translate other feature names to/from this set.</td></tr><tr><td>SQ</td><td>Sequence counts are generated, then sequence is read directly as it is a <code class="code">SymbolList</code>.</td></tr></tbody></table></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3154"></a>6. INSDSeq (XML).</h2></div></div></div><p>For parsing files that conform to <code class="code">http://www.ebi.ac.uk/embl/Documentation/DTD/INSDSeq_v1.3.dtd.txt</code>.</p><p><code class="code">INSDSeqFormat</code> is similar to the GenBank flat-file format in the way it organises information. Data will end up in the same places and using the same annotation terms. 
There are no additional annotation terms involved which are not also present in the GenBank flat-file format.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3166"></a>7. EMBLxml (XML).</h2></div></div></div><p>For parsing files that conform to <code class="code">http://www.ebi.ac.uk/embl/Documentation/DTD/EMBL_dtd.txt</code>.</p><p><code class="code">EMBLxmlFormat</code> is very similar to the EMBL flat-file format. Data will be parsed in much the same way and end up in the same locations. There are no additional annotation terms involved which are not also present in the EMBL flat-file format.</p><p>The only major difference between EMBL flat-file and EMBL XML is the location tags. In XML, they are highly structured. The parser gets round this complexity by constructing Genbank-style location strings out of the XML hierarchies. These strings are then passed to <code class="code">GenbankLocationParser</code> for parsing into <code class="code">RichLocation</code> objects. On output, the location tags are constructed directly from the <code class="code">RichLocation</code> objects.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3189"></a>8. UniProtXML (XML).</h2></div></div></div><p>For parsing files that conform to <code class="code">http://www.ebi.uniprot.org/support/docs/uniprot.xsd</code>.</p><p><code class="code">UniProtXMLFormat</code> is very complex. The parser attempts to treat it in the same way as normal UniProt data, and information will end up in the same locations.</p><p>Throughout the format, evidence <span class="emphasis"><em>attributes</em></span> (not tags) are ignored. There is simply no way to fit them into the BioJavaX object model.</p><p>Like the UniProt flat-file format, locations are passed through the <code class="code">UniProtLocationParser</code>. 
Fuzziness may not be correctly interpreted as frequently not enough information is supplied to be able to construct the minimum requirements of a <code class="code">Position</code> object. You may see exceptions being thrown on files which attempt to specify fuzziness without relation to a specific base or range of bases.</p><p>Comments are parsed and converted into flat-file UniProt comments using the <code class="code">UniProtCommentParser</code>, and converted back again when outputting in this format. This allows for greater interoperability between the two formats, and also allows the UniProt XML comment data to be stored in the plain-text format expected by databases such as BioSQL. Some comments have been renamed in UniProt XML as opposed to the flat-file format. These comments will be parsed and converted to use the flat-file naming convention inside BioJavaX, but when they are output again, they will go back to their correct UniProt XML names. This is to increase interoperability between the two UniProt formats.</p><p>UniProt XML uses its own unique set of feature names, different even from the flat-file UniProt format. No attempt is made to translate other feature names to/from this set.</p><p>The UniProt XML format has no concept of a sequence description. However, it does have a protein tag which describes the structure of the sequence. This is parsed into a single protein description string and used as the value for <code class="code">setDescription()</code>. Each part of the protein description is enclosed in square brackets and prefixed by the word 'Contains' for domains, and 'Includes' for components. Attempting to write a sequence that has a description which does not conform to this standard may produce interesting results.</p><p>Keywords in UniProt XML have identifier numbers associated with them. 
A special ontology, <code class="code">Terms.getUniprotKWOnto()</code>, is used to store these keywords and their identifiers as they are encountered over time. If a keyword is encountered with an unknown identifier during output, then the word 'UNKNOWN' is output in place of the identifier.</p><p>The secondary/tertiary/additional accessions for database cross-references in UniProt XML have hard-coded names which depend on the position of the accession and the name of the database. If the database name does not match one of the known ones, or an unexpected accession is found, then the name used will be <code class="code">Terms.getAdditionalAccessionTerm()</code>.</p><p>A number of additional annotation terms are used by UniProt XML. These are:</p><div class="table"><a name="d0e3238"></a><p class="title"><b>Table 8.12. Additional <code class="code">UniProtXMLFormat</code> annotation terms.</b></p><table summary="Additional UniProtXMLFormat annotation terms." border="1"><colgroup><col width="50%"><col width="50%"></colgroup><tbody><tr><td><code class="code">Terms.getProteinTypeTerm()</code></td><td>Used to store the <code class="code">type</code> attribute from the <code class="code">protein</code> tag.</td></tr><tr><td><code class="code">Terms.getEvidenceCategoryTerm()</code></td><td>Used to store the <code class="code">category</code> attribute of the <code class="code">evidence</code> tag.</td></tr><tr><td><code class="code">Terms.getEvidenceTypeTerm()</code></td><td>Used to store the <code class="code">type</code> attribute of the <code class="code">evidence</code> tag.</td></tr><tr><td><code class="code">Terms.getEvidenceDateTerm()</code></td><td>Used to store the <code class="code">date</code> attribute of the <code class="code">evidence</code> tag.</td></tr><tr><td><code class="code">Terms.getEvidenceAttrTerm()</code></td><td>Used to store the <code class="code">attribute</code> attribute of the <code class="code">evidence</code> tag.</td></tr><tr><td><code 
class="code">Terms.getFeatureRefTerm()</code></td><td>Used to store the <code class="code">ref</code> attribute of the <code class="code">feature</code> tag.</td></tr><tr><td><code class="code">Terms.getFeatureOriginalTerm()</code></td><td>Used to store the value of the <code class="code">original</code> sub-tag of the <code class="code">feature</code> tag.</td></tr><tr><td><code class="code">Terms.getFeatureVariationTerm()</code></td><td>Used to store the value of the <code class="code">variation</code> sub-tag of the <code class="code">feature</code> tag.</td></tr><tr><td><code class="code">Terms.getFeatureStatusTerm()</code></td><td>Used to store the <code class="code">status</code> attribute of the <code class="code">feature</code> tag.</td></tr><tr><td><code class="code">Terms.getLocationSequenceTerm()</code></td><td>Used to store the <code class="code">seq</code> attribute of the <code class="code">location</code> sub-tag of the <code class="code">feature</code> tag.</td></tr></tbody></table></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3369"></a>9. New formats</h2></div></div></div><p>If you want to add a new format, the best thing to do is to extend <code class="code">RichSequenceFormat.BasicFormat</code> and go from there. In order to make your class work with the automatic format-guesser (<code class="code">RichSequence.IOTools.readFile()</code>) you'll need to implement <code class="code">canRead()</code> and <code class="code">guessSymbolTokenization()</code>, and add a static initializer block to your class, similar to this:</p><pre class="programlisting">public class MyFormat extends RichSequenceFormat.BasicFormat {
static {
RichSequence.IOTools.registerFormat(MyFormat.class);
}
// implement the rest of the class here ...
}</pre></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3388"></a>10. NCBI Taxonomy data.</h2></div></div></div><p>The NCBI taxonomy loader operates outside the standard file parsing framework, as it is not dealing with a single file and does not generate sequence objects. Instead, it provides separate functions for reading the <code class="code">nodes.dmp</code> and <code class="code">names.dmp</code> files line-by-line, and returning the corresponding <code class="code">NCBITaxon</code> object for each line of the file. An example to load the taxonomy data follows:</p><pre class="programlisting">NCBITaxonomyLoader l = new SimpleNCBITaxonomyLoader();
BufferedReader nodes = new BufferedReader(new FileReader("nodes.dmp"));
BufferedReader names = new BufferedReader(new FileReader("names.dmp"));
NCBITaxon t;
while ((t=l.readNode(nodes))!=null); // read all the nodes first
while ((t=l.readName(names))!=null); // then read all the names
// if your LRU cache is big enough, it'll now hold fully-populated instances
// of all the taxon objects. Not much use unless you're using a database!</pre><p>Note that this is most effective when using BioJavaX with Hibernate to persist data to the database. You do not need to do anything apart from wrap the above code in a transaction, and it will be persisted for you.</p><p>Note that you may have trouble with duplicate <code class="code">NCBITaxon</code> objects or names going missing if you have an LRU cache in <code class="code">RichObjectFactory</code> that is too small. This issue is avoided altogether when using the <code class="code">BioSQLRichObjectFactory</code>.</p></div></div><div class="chapter" lang="en"><div class="titlepage"><div><div><h2 class="title"><a name="d0e3417"></a>Chapter 9. Creative file parsing with <code class="code">RichSeqIOListener</code>.</h2></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3423"></a>1. Using <code class="code">RichSeqIOListener</code>s directly.</h2></div></div></div><p>In order to do creative file parsing, you need to start using very low level BioJava APIs. This involves setting up a RichSeqIOListener and allowing it to communicate directly with the RichSequenceFormat instances that parse files. You have to choose whether you want just to listen to data as it is read from the file, or whether you want to use these events to construct a RichSequence object.</p><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e3431"></a>1.1. Listening to events only.</h3></div></div></div><p>You need to write a class which implements <code class="code">RichSeqIOListener</code>. 
The easiest way to do this is to extend <code class="code">RichSeqIOAdapter</code>, which is a very simple implementation which ignores everything and returns dummy empty features whenever <code class="code">getCurrentFeature()</code> is called.</p><p>You can then use your class like this (see the earlier section on <code class="code">RichStreamReader</code> for how to construct the various other objects required):</p><pre class="programlisting">BufferedReader input = ...; // your input file
Namespace ns = ...; // the namespace to read sequences into
SymbolTokenization st = ...; // the tokenization used to parse sequence data
RichSeqIOListener listener = ...; // your custom listener object
boolean moreSeqsAvailable = true; // assume there is at least one sequence in the file
while (moreSeqsAvailable) {
moreSeqsAvailable = format.readRichSequence(input, st, listener, ns);
// your listener will have received all the information for the current sequence by this stage
}</pre></div><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e3452"></a>1.2. Constructing sequences from events.</h3></div></div></div><p>You need to write a class which implements both <code class="code">RichSeqIOListener</code> and <code class="code">RichSequenceBuilder</code>. Again you could just extend <code class="code">RichSeqIOAdapter</code>, and implement the extra methods required by <code class="code">RichSequenceBuilder</code> to make it fully functional. You will obviously need to store information passed to your instance as parsing goes along, in order to be able to construct the sequence objects when <code class="code">makeRichSequence()</code> is called at the end.</p><p>Your <code class="code">RichSequenceBuilder</code> is now fully compatible with the <code class="code">RichStreamReader</code> framework outlined earlier in this document, but you will also need to create a <code class="code">RichSequenceBuilderFactory</code> implementation to work with it. The simplest form of such a factory (assuming a custom builder named <code class="code">CustomRichSequenceBuilder</code>) looks like this:</p><pre class="programlisting">public class CustomRichSequenceBuilderFactory implements RichSequenceBuilderFactory {
public CustomRichSequenceBuilderFactory() {}
public SequenceBuilder makeSequenceBuilder() {
return new CustomRichSequenceBuilder();
}
}</pre></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3488"></a>2. Parsing only specific fields.</h2></div></div></div><p>The basic <code class="code">RichSeqIOAdapter</code> class ignores all data passed to it. This is the simplest form of a <code class="code">RichSeqIOListener</code>. Building from this base, you can construct specialist <code class="code">RichSeqIOListener</code> implementations that perform very specific tasks very efficiently. For instance, a listener that counts all the sequences in a file would look like this:</p><pre class="programlisting">public class MyListener extends RichSeqIOAdapter {
private int seqCount;
public MyListener() {
super();
this.seqCount = 0;
}
public void startSequence() { this.seqCount++; }
public int getSeqCount() { return this.seqCount; }
}</pre><p>You could then call <code class="code">getSeqCount()</code> on this class after parsing a file to find out exactly how many sequences it contained.</p></div></div><div class="chapter" lang="en"><div class="titlepage"><div><div><h2 class="title"><a name="d0e3509"></a>Chapter 10. Publication cross-references.</h2></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3512"></a>1. Everything is a 'journal article'.</h2></div></div></div><p>Owing to the way in which BioSQL stores publication cross-references, there is no way to distinguish between different types of publication. This is mirrored in the BioJavaX object model in the <code class="code">DocRef</code> interface.</p><p>As journal articles are the most common type of publication cross-reference, everything is assumed by BioJavaX to be a journal article.</p><p>BioJavaX makes no attempt to parse information out from textual publication location descriptions (eg. the LOCATION line in GenBank files). Likewise, when it encounters XML publication location descriptions (such as those found in UniProtXML) it merely concatenates all the data together into a single string. When writing out in XML format it always uses the plain-text option wherever possible unless forced to use the journal-article specific option by an XML DTD. These descriptions are stored using <code class="code">setLocation()</code> on the <code class="code">DocRef</code> object.</p><p>The only piece of information which it attempts to parse (other than the title) is the author data. It parses each author into a <code class="code">DocRefAuthor</code>, and stores a set of these with each <code class="code">DocRef</code> object. 
Tools are provided in <code class="code">DocRefAuthor.Tools</code> for converting these sets to/from a single string for use in situations such as the AUTHOR tag in GenBank files, or when persisting to a BioSQL database.</p><p><code class="code">DocRef</code> instances must be wrapped in a <code class="code">RankedDocRef</code> before they can be associated with a sequence via <code class="code">addRankedDocRef()</code>. The usual default rank is 0.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3551"></a>2. Editors and consortiums as authors.</h2></div></div></div><p>When dealing in plain text, authors who are editors are suffixed with " (ed.)". Authors who are consortiums are suffixed with " (consortium)". The <code class="code">DocRefAuthor.Tools</code> parses these suffixes (in any order) and uses <code class="code">setEditor()</code> and <code class="code">setConsortium()</code> on the <code class="code">DocRefAuthor</code> object to indicate what it found. When converting <code class="code">DocRefAuthor</code> objects to plain text it will also append these suffixes as necessary.</p></div></div><div class="chapter" lang="en"><div class="titlepage"><div><div><h2 class="title"><a name="d0e3571"></a>Chapter 11. Database cross-references.</h2></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3574"></a>1. Database names.</h2></div></div></div><p>Cross-references to other databases are defined as <code class="code">CrossRef</code> objects. To associate a cross-reference with a particular sequence, you need to assign it a rank before adding it to the sequence using <code class="code">addRankedCrossRef()</code>. To do this, wrap it in a <code class="code">RankedCrossRef</code> object.</p><p>Database names are case-sensitive. When using cross-references, be very aware of this. 
The various file parsers do not make much effort to convert the database names they find to a single case policy, as several of the formats insist on different ones.</p><p>If you will be persisting lots of new data regularly to your database, keep an eye on this. Some kind of SQL script to do a periodic tidy-up might be handy. If you come up with one and feel it would be useful for others too, please feel free to send it in and we'll add it below.</p><p>Common database names can be found as constants (eg. <code class="code">PUBMED_KEY</code>) in <code class="code">RichSequence.Terms</code>.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3600"></a>2. Accessions and versions.</h2></div></div></div><p>All database cross-references have at least one accession, which is the primary accession for that reference. The version is also compulsory, although often it is just left as zero. Only primary accessions have explicitly separate versions - secondary or tertiary accessions, if they have versions at all, will have the versions included in the accession itself.</p><p>Secondary, ternary, quaternary etc. accessions are stored as annotations on the cross-reference. These secondary accession annotations must all have the key <code class="code">RichSequence.Terms.getAdditionalAccessionTerm()</code> if they are to be understood across all parts of BioJavaX.</p></div></div><div class="chapter" lang="en"><div class="titlepage"><div><div><h2 class="title"><a name="d0e3610"></a>Chapter 12. Working with <code class="code">RichLocation</code> objects.</h2></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3616"></a>1. Working with locations.</h2></div></div></div><p>In BioJavaX, all locations are instances of classes which implement the <code class="code">RichLocation</code> interface. 
These are very complex objects, so need to be used with care.</p><p>All locations use <code class="code">Position</code> objects to define their end points. <code class="code">Position</code> objects have a start coordinate, and for fuzzy ones an end coordinate too along with a symbol indicating what kind of range the two points encompass (eg. the "." or "^" symbols from GenBank-style locations). If the start or end coordinate of a fuzzy position is fuzzy in itself (eg. "<" or ">" from GenBank-style locations), then flags can be set on the object to indicate this.</p><p><code class="code">Location</code>s have ranks which are used to sort them. If persisted to a database, the location will be flattened out into a set of simple locations, ordered by their rank in ascending order. The complex location constructed from these when retrieving them from the database will have its members in the same order. It is important then to monitor the ranks in your locations and make sure they are in the correct order before persisting them. Note that the locations produced by the <code class="code">UniProtLocationParser</code> and <code class="code">GenbankLocationParser</code> will always be correctly ranked ready for persisting to a database.</p><p>The simplest kind of location describes a single point or range between two points on a sequence, with optional fuzziness at either end. This is implemented by the <code class="code">SimpleRichLocation</code> class.</p><p>This example describes the GenBank-style location string "56":</p><pre class="programlisting">Position pos = new SimplePosition(56);
RichLocation loc = new SimpleRichLocation(pos,0); // the 0 is an arbitrary value for the rank of this location</pre><p>This example describes the GenBank-style location string "(23^34)..57>":</p><pre class="programlisting">Position min = new SimplePosition(false,false,23,34,"^"); // two falses = not fuzzy at all
Position max = new SimplePosition(false,true,57); // false = non-fuzzy start, true = fuzzy end
RichLocation loc = new SimpleRichLocation(min,max,0); // the 0 is an arbitrary value for the rank of this location</pre><p>This example describes the GenBank-style location string "complement((23^34)..57>)":</p><pre class="programlisting">Position min = new SimplePosition(false,false,23,34,"^"); // two falses = not fuzzy at all
Position max = new SimplePosition(false,true,57); // false = non-fuzzy start, true = fuzzy end
RichLocation loc = new SimpleRichLocation(min,max,0,Strand.NEGATIVE_STRAND); </pre><p>This example describes the GenBank-style location string "A12345.3:complement((23^34)..57>)":</p><pre class="programlisting">CrossRef cr = new SimpleCrossRef("GenBank","A12345",3); // version 3 of accession A12345 in the GenBank database
Position min = new SimplePosition(false,false,23,34,"^"); // two falses = not fuzzy at all
Position max = new SimplePosition(false,true,57); // false = non-fuzzy start, true = fuzzy end
RichLocation loc = new SimpleRichLocation(min,max,0,Strand.NEGATIVE_STRAND,cr);</pre><p>If you require locations that cover more than one range, you <span class="emphasis"><em>must</em></span> use the <code class="code">RichLocation.Tools</code> methods to help you. If you don't, you run a serious risk of making nonsense locations that will give unpredictable results.</p><p>A complex location is constructed from a collection of <code class="code">RichLocation</code> instances. Any member of the collection which is already a complex location is flattened out into its member <code class="code">SimpleRichLocation</code> objects first (see later section on flattening locations) before the new location is constructed. The construction process attempts to minimise the number of these simple locations by merging the ones that overlap. Therefore the total number of member locations (blocks) in the resulting complex location may be less than the number of locations you originally passed in as input.</p><p>To construct a complex location from a set of existing <code class="code">RichLocation</code> instances, follow this example:</p><pre class="programlisting">RichLocation first = ...; // some arbitrary location
RichLocation second = ...; // some other location
Collection members = Arrays.asList(new RichLocation[]{first,second});
RichLocation combined = RichLocation.Tools.construct(members);</pre><p>The <code class="code">construct()</code> method will return one of four different types of <code class="code">RichLocation</code> objects, depending on the members passed in:</p><div class="table"><a name="d0e3694"></a><p class="title"><b>Table 12.1. <code class="code">RichLocation.Tools.construct()</code> result types.</b></p><table summary="RichLocation.Tools.construct() result types." border="1"><colgroup><col width="50%"><col width="50%"></colgroup><tbody><tr><td><code class="code">EmptyRichLocation</code></td><td>If the input collection was empty, or only contained a single entry which was an instance of <code class="code">EmptyRichLocation</code> itself.</td></tr><tr><td><code class="code">SimpleRichLocation</code></td><td>If all the members in the input collection overlap and are on the same strand of the same sequence, the result will be a single location covering the entire overlapping range.</td></tr><tr><td><code class="code">CompoundRichLocation</code></td><td>If all the members in the input collection are on the same strand of the same sequence, but after merging overlapping locations there are still gaps, then a <code class="code">CompoundRichLocation</code> is returned containing one <code class="code">SimpleRichLocation</code> per merged region. All the members are guaranteed to be on the same strand of the same sequence. The strand and cross-ref of the location returned will be consistent with its members. The min and max of the location will correspond to the min and max of all the members combined.</td></tr><tr><td><code class="code">MultiSourceCompoundRichLocation</code></td><td>As per <code class="code">CompoundRichLocation</code>, but members may appear on different strands or even different (remote) sequences. The min, max, strand and cross-ref of the location returned are meaningless, and should not be used. 
You should instead interrogate each member location (block) for this information as required.</td></tr></tbody></table></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3737"></a>2. Strandedness.</h2></div></div></div><p>All <code class="code">SimpleRichLocation</code> and <code class="code">CompoundRichLocation</code> objects have a strand assigned to them. The various strands available are defined as constants in <code class="code">RichLocation.Strand</code>. If two locations have different strands, then they will never be found together in the same <code class="code">CompoundRichLocation</code>, but they may occur together in a <code class="code">MultiSourceCompoundRichLocation</code>.</p><p>In all cases, location coordinates are given w.r.t. the 5' end of the positive strand, with the first base numbered as 1. This is to make overlap, union, and intersection calculations easier.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3759"></a>3. Remote locations.</h2></div></div></div><p>Locations are generally sequence-agnostic until they are applied to a specific sequence, usually through a feature. However, some locations specifically refer to an individual sequence, and are assigned a <code class="code">CrossRef</code> instance to indicate this. These are remote locations. A null value indicates that the location is not remote.</p><p>The sequences backing remote locations are retrieved using a <code class="code">CrossReferenceResolver</code>, the default one being supplied by <code class="code">RichObjectFactory.getDefaultCrossReferenceResolver()</code>. You can override the use of this default either by changing the default in the <code class="code">RichObjectFactory</code>, or calling <code class="code">setCrossReferenceResolver()</code> directly on the location object. 
The default one does not look up remote sequences at all, and always returns null for sequence objects, and <code class="code">InfinitelyAmbiguousSymbolList</code> instances for symbol list requests. The one supplied for use with Hibernate does attempt to look sequences up in the underlying database, but if it cannot find them it will exhibit similar behaviour.</p><p>The job of this resolver is to obtain sequence data for the remote sequence. If the resolver cannot locate the sequence, the location may throw an exception when any operation requiring the services of the resolver is attempted.</p><p>If you are using a database with BioJavaX and that sequence is to be found in the same database, then make sure that the database name given to the <code class="code">CrossRef</code> instance is the same as the <span class="emphasis"><em>namespace</em></span> of the sequence in your database, and that the accessions and versions are the same.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3794"></a>4. Resolving fuzziness.</h2></div></div></div><p>Fuzziness is all well and good until you try and work out whether one sequence overlaps another, or try and store the location in a database like BioSQL that has no concept of fuzziness. In these kinds of situation, you have to resolve the fuzziness to a specific coordinate before you can use it.</p><p>Locations will resolve positions as necessary using the position resolver supplied by <code class="code">RichObjectFactory.getDefaultPositionResolver()</code>. You can replace this default resolver for all locations by using the appropriate methods in <code class="code">RichObjectFactory</code>, or you can change it for this location only by calling <code class="code">setPositionResolver()</code> on the location object. 
A number of useful ones are provided as sub-classes of the <code class="code">PositionResolver</code> interface.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3813"></a>5. Translation.</h2></div></div></div><p>Locations can be moved left or right by a fixed number of bases by using the <code class="code">translate()</code> method. This method returns a new location with all members offset by the value specified. A negative offset will move locations towards the 5' end of the positive strand, whilst a positive offset will move them towards the 3' end.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3821"></a>6. Empty locations.</h2></div></div></div><p>The empty location is represented by a singleton instance of <code class="code">EmptyRichLocation</code>, available as a constant as <code class="code">RichLocation.EMPTY_LOCATION</code>.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3832"></a>7. Circular locations.</h2></div></div></div><p>Locations are circular if a call is made to <code class="code">setCircularLength()</code> with a value greater than zero. The value indicates the length of the circular sequence that this location overlays. This is important when it comes to calculating overlaps, unions and other operations where the wrap-around point for the coordinates must be known.</p><p>A circular location cannot be applied to a non-circular sequence. 
Neither can it be applied to a circular sequence with a length that is not the same as the one returned by the <code class="code">getCircularLength()</code> method of the location.</p><p>The concept of circularity is not understood by BioSQL, so this information will be lost if you persist it to a database.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3847"></a>8. Union.</h2></div></div></div><p>The union of any two locations X and Y that do not overlap (see section on overlapping locations), or that overlap but on different strands, is simply a complex location with X and Y as members.</p><p>The union of two linear locations X and Y that overlap on the same strand is a single simple location that covers the entire area from the smaller of X.min and Y.min to the larger of X.max and Y.max.</p><p>The union of circular location X with any other location Y that overlaps on the same strand is a single simple location that covers the region from the 5' most of X.min and Y.min to the 3' most of X.max and Y.max w.r.t. the positive strand.</p><p>Complex locations will perform the above steps on each pair of member locations in turn, and the union will be the combined set of all unique locations that these pair-wise unions produce. Any overlapping locations on the same strand within this set will be merged into single, larger locations.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3858"></a>9. Intersection.</h2></div></div></div><p>Locations never intersect if they do not overlap (see section on overlapping locations). 
The intersection operation will return the empty location.</p><p>If two linear locations X and Y overlap each other on the same strand, then the intersection is a single simple location covering the overlapping region.</p><p>If any two locations X and Y overlap each other on different strands, then the intersection is a complex location containing only the portions of X and Y that overlap each other.</p><p>If a circular location X overlaps any other location Y on the same strand, then the resulting single simple circular location will cover the region from the 3' most of X.min and Y.min to the 5' most of X.max and Y.max w.r.t. the positive strand.</p><p>Complex locations will perform the above steps on each pair of member locations in turn, and the intersection will be the set of all unique locations that these pair-wise intersections produce. Any overlapping locations on the same strand within this set will be merged into single, larger locations.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3871"></a>10. Overlaps.</h2></div></div></div><p>Locations never overlap locations which are on a different remote sequence. However, locations on opposite strands may overlap each other.</p><p>Circular locations of different circular lengths never overlap each other. Circular locations never overlap linear locations.</p><p>Complex locations test each individual member in turn for overlap. The empty location never overlaps anything.</p><p>Linear locations X and Y overlap iff X.min <= Y.max and X.max >= Y.min.</p><p>Circular locations (of the same circular length) X and Y overlap iff X.min <= Y.max-N and X.max >= Y.min-N where N is some multiple of the circular length of either location.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3884"></a>11. 
Contains.</h2></div></div></div><p>There are two types of contains operation - one tests the presence of a particular point coordinate, the other tests whether this location entirely encompasses another location.</p><p>Complex locations make the test against each member in turn. The empty location will never contain anything.</p><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e3891"></a>11.1. Point coordinates.</h3></div></div></div><p>For linear locations, a location contains a point if that point falls on or between the min and max of this location. If the min or max of this location is fuzzy, it is resolved into a single point first before the test is made.</p><p>For circular locations, the point is defined to be contained by a location if the point +/- some multiple of the circular length of the location lies between the min and max of the location.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e3898"></a>11.2. Other locations.</h3></div></div></div><p>Locations never contain locations which are on a different strand or remote sequence.</p><p>A linear location X contains another linear location Y iff X.min <= Y.min and X.max >= Y.max.</p><p>A circular location X contains any other location Y iff X.min <= Y.min-N and X.max >= Y.max-N where N is some multiple of the circular length of the location X.</p></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3907"></a>12. Obtaining the symbols for a location.</h2></div></div></div><p>The symbols for a location are obtained by calling <code class="code">symbols()</code> on the location object and passing in the reference sequence which the location must be applied to. 
If the location contains coordinates that are outside the range of the reference sequence, an exception will be thrown.</p><p>The location will iterate through each of its members (or just itself if it is a <code class="code">SimpleRichLocation</code>) and concatenate the results of calling <code class="code">symbols()</code> on each of them in turn. The concatenated sequence is then returned. This means that the order of the members is important. It will always be the same as the order in which the members were specified to <code class="code">RichLocation.Tools.construct()</code>, if that was the way you put this location together.</p><p>Where it comes across a remote location that refers to a sequence other than the one passed in for reference, the <code class="code">CrossReferenceResolver</code> of that location is used to obtain the remote sequence. The default <code class="code">CrossReferenceResolver</code>, <code class="code">DummyCrossReferenceResolver</code>, will return a number of ambiguity symbols equivalent to the length of the remote location. The Hibernate version, <code class="code">BioSQLCrossReferenceResolver</code>, will return the actual sequence from the database, but otherwise will behave the same way if the remote sequence cannot be found.</p><p>The sequences of locations on the negative strand will be reverse complemented before concatenation to the results. Hence it is important that you construct complex locations on the negative strand with the member locations appearing in order from 3' to 5' end of the positive strand if you want the <code class="code">symbols()</code> call to return sensible results.</p></div></div><div class="chapter" lang="en"><div class="titlepage"><div><div><h2 class="title"><a name="d0e3945"></a>Chapter 13. Features</h2></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3948"></a>1. 
Adding features to a <code class="code">RichSequence</code>.</h2></div></div></div><p>The best way to create a new feature is like this:</p><pre class="programlisting">RichFeature.Template templ = new RichFeature.Template(); // create a feature template
templ.location = ...; // assign the feature template a location, type, and source
templ.typeTerm = ...;
templ.sourceTerm = ...;
templ.annotation = new SimpleRichAnnotation(); // assign the rest of the necessary stuff
templ.featureRelationshipSet = new TreeSet();
templ.rankedCrossRefs = new TreeSet();
RichSequence rs = ...; // get a sequence from somewhere
RichFeature feat = (RichFeature)rs.createFeature(templ); // make a new feature on that sequence</pre><p>Alternatively, you can start with a completely empty dummy feature and just customise the bits you need:</p><pre class="programlisting">RichSequence rs = ...; // get a sequence
RichFeature feat = RichFeature.Tools.makeEmptyFeature(); // make an empty feature
feat.setParent(rs); // associate sequence with feature
rs.getFeatureSet().add(feat); // associate feature with sequence
// customise the feature here, eg. location, type, source etc.</pre></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3962"></a>2. Qualifiers as annotations.</h2></div></div></div><p>All feature qualifiers are stored as annotations. Qualifier annotations have a <code class="code">ComparableTerm</code> as key, and a <code class="code">String</code> as the value. Multiple qualifiers with the same term are allowed but only if the values are distinct. Use the rank of the annotation to preserve order.</p><p>To go through all the qualifiers on a particular feature is quite straightforward:</p><pre class="programlisting">RichFeature feat = ...; // get the feature from somewhere
for (Iterator i = feat.getNoteSet().iterator(); i.hasNext(); ) {
// get the next note
Note n = (Note)i.next();
// read it
String key = n.getTerm().getName();
String value = n.getValue();
int rank = n.getRank();
// print the qualifier out in key=value (rank) format
System.out.println(key+"="+value+" ("+rank+")");
}</pre></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3977"></a>3. Obtaining the symbols for a feature.</h2></div></div></div><p>The symbols for a feature are simply the result of a delegated call to the <code class="code">symbols()</code> method of the feature's <code class="code">Location</code> object, using the feature's parent object as the reference sequence for the location. See the section on locations in this document for details on how the symbols are obtained.</p></div></div><div class="chapter" lang="en"><div class="titlepage"><div><div><h2 class="title"><a name="d0e3988"></a>Chapter 14. Relationships between features.</h2></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e3991"></a>1. Relating two features.</h2></div></div></div><p>Two features can be related to each other by using a <code class="code">RichFeatureRelationship</code> object to construct the link.</p><p>Relationships have compulsory ranks. Use 0 if you don't want to bother with this.</p><p>The following code snippet defines a new term "contains" in the default ontology, then creates a relationship that states that feature A (the parent) contains feature B (the child):</p><pre class="programlisting">ComparableTerm contains = RichObjectFactory.getDefaultOntology().getOrCreateTerm("contains");
...
RichFeature parent = ...; // get feature A from somewhere
RichFeature child = ...; // get feature B from somewhere
RichFeatureRelationship relationship = new SimpleRichFeatureRelationship(parent,child,contains,0);
parent.addFeatureRelationship(relationship); // add the relationship to the parent
...
parent.removeFeatureRelationship(relationship); // you can always take it away again later</pre></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4005"></a>2. Querying the relationship.</h2></div></div></div><p>Features are aware of all relationships in which they are the parent feature.</p><p>The following code snippet prints out all the relationships to child features within a parent feature:</p><pre class="programlisting">RichFeature feature = ...; // get a feature from somewhere
for (Iterator i = feature.getFeatureRelationshipSet().iterator(); i.hasNext(); ) {
RichFeatureRelationship fr = (RichFeatureRelationship)i.next();
RichFeature parent = fr.getObject(); // parent == feature
RichFeature child = fr.getSubject();
ComparableTerm relationship = fr.getTerm();
// print out the relationship (eg. "A contains B");
System.out.println(parent.getName()+" "+relationship.getName()+" "+child.getName());
}</pre></div></div><div class="chapter" lang="en"><div class="titlepage"><div><div><h2 class="title"><a name="d0e4014"></a>Chapter 15. Annotations and Comments.</h2></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4017"></a>1. Annotations.</h2></div></div></div><p>The original BioJava allowed annotations to take the form of any object as the key, with any other object as the value. BioJavaX restricts this significantly in order to make life easier when dealing with databases. The new requirement, for <code class="code">RichAnnotation</code> objects, is that the keys are all instances of <code class="code">ComparableTerm</code>, and the values are all instances of <code class="code">String</code>.</p><p>Anything which is annotatable (eg. <code class="code">BioEntry</code>, <code class="code">RichFeature</code>, etc.) will implement <code class="code">RichAnnotatable</code>. You can then use <code class="code">getAnnotation()</code> to obtain the <code class="code">RichAnnotation</code> object and start annotating with it.</p><p>To obtain the <code class="code">ComparableTerm</code> objects to use as keys, the simplest method is to call <code class="code">RichObjectFactory.getDefaultOntology().getOrCreateTerm("myterm")</code>.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4056"></a>2. Comments.</h2></div></div></div><p>Sequences can have free-text comments (in the form of a <code class="code">String</code> instance wrapped in a <code class="code">Comment</code> instance) associated with them. Each comment is ranked. Duplicate comments with identical text and rank will be ignored. 
The number of comments allowed is unlimited.</p><p>To add a comment, call <code class="code">addComment()</code> on the sequence object.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4072"></a>3. UniProt structured comments.</h2></div></div></div><p>When parsing UniProt and UniProtXML files, comments take on a structured form. This is represented in text form by special formatting, but in order to parse this information out successfully (particularly important when writing UniProtXML) a separate parser is required to transform the structured text into a usable object.</p><p>This parser is the <code class="code">UniProtCommentParser</code>. It has two main methods, <code class="code">parseComment()</code> for converting structured text into an object, and <code class="code">generate()</code> for converting the object back into structured text. The 'object' is actually the parser itself, which has a number of methods for accessing information from the parsed comment, or setting information to be written out next time <code class="code">generate()</code> is called.</p></div></div><div class="chapter" lang="en"><div class="titlepage"><div><div><h2 class="title"><a name="d0e4092"></a>Chapter 16. Namespaces.</h2></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4095"></a>1. Obtaining <code class="code">Namespace</code> instances.</h2></div></div></div><p>All sequences in BioJavaX must belong to a namespace, by being associated with an instance of the <code class="code">Namespace</code> interface. This is in line with BioSQL.</p><p>A default namespace is provided by the <code class="code">RichObjectFactory</code>:</p><pre class="programlisting">// get the default namespace
Namespace defaultNS = RichObjectFactory.getDefaultNamespace();
...
// make a custom namespace
Namespace customNS = (Namespace)RichObjectFactory.getObject(SimpleNamespace.class, new Object[]{"myNameSpace"});
...
// load a namespace from BioSQL, or create it if it doesn't exist yet
Namespace biosqlNS = (Namespace)BioSQLRichObjectFactory.getObject(SimpleNamespace.class, new Object[]{"myBioSQLNameSpace"});
...
// change the default namespace to "bloggs"
RichObjectFactory.setDefaultNamespaceName("bloggs");</pre></div></div><div class="chapter" lang="en"><div class="titlepage"><div><div><h2 class="title"><a name="d0e4113"></a>Chapter 17. NCBI Taxonomy.</h2></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4116"></a>1. Traversing from child to parent.</h2></div></div></div><pre class="programlisting">NCBITaxon child = ...; // some taxon object you want the parent of
Integer parentNCBITaxID = new Integer(child.getParentNCBITaxID());
NCBITaxon parent = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class,new Object[]{parentNCBITaxID});</pre></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4121"></a>2. Traversing from parent to child.</h2></div></div></div><p>This cannot be done using the BioJavaX API.</p><p>But, you can do it using HQL if you are reading your taxonomy information from a database. See the section on BioSQL and Hibernate for details about setting BioJavaX for use with a database. The query you are looking for is this:</p><pre class="programlisting">NCBITaxon parent = ...; // some taxon object you want to get the immediate children of
Query q = session.createQuery("from Taxon where parentNCBITaxID = :parentNCBITaxID");
q.setInteger("parentNCBITaxID",parent.getNCBITaxID());
List children = q.list(); // children will now contain all the child taxon objects</pre></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4130"></a>3. Finding taxons by name.</h2></div></div></div><p>This also cannot be done using the BioJavaX API.</p><p>Again, you can do it using HQL if you are reading your taxonomy information from a database. The query you are looking for is this:</p><pre class="programlisting">Query q = session.createQuery("from Taxon as taxon join taxon.nameSet as taxonName "+
"where taxonName.nameClass=:nameClass and taxonName.name=:name");
q.setString("nameClass",NCBITaxon.SCIENTIFIC);
q.setString("name","Homo sapiens");
List taxons = q.list(); // taxons will now contain all matching taxon objects</pre></div></div><div class="chapter" lang="en"><div class="titlepage"><div><div><h2 class="title"><a name="d0e4139"></a>Chapter 18. BioEntry and RichSequence Databases</h2></div></div></div><p><code class="code">BioJavaX</code> allows both <code class="code">BioEntry</code> and <code class="code">RichSequence</code> objects to be collected together in a single group. <code class="code">BioEntry</code> objects can be collected together inside implementations of <code class="code">BioEntryDB</code>, whilst <code class="code">RichSequence</code> objects can be collected inside implementations of <code class="code">RichSequenceDB</code> (which extends <code class="code">BioEntryDB</code>). These are both very similar to the existing <code class="code">SequenceDB</code> interface in BioJava, and in fact <code class="code">RichSequenceDB</code> extends <code class="code">SequenceDB</code> and therefore can be used as a drop-in replacement.</p><p>An abstract implementation of each of these interfaces is provided, along with a simple hash-based implementation.</p><p>The idea of a collection such as this is to provide a wrapper to some kind of behind-the-scenes database. 
The hash-based implementations, <code class="code">HashBioEntryDB</code> and <code class="code">HashRichSequenceDB</code>, simply provide an in-memory database where sequences are stored in a <code class="code">HashMap</code>, whereas the <code class="code">GenbankRichSequenceDB</code> implementation is a read-only implementation which downloads and parses Genbank records on-demand from the NCBI website.</p><p>There is also a pair of convenience implementations called <code class="code">BioSQLBioEntryDB</code> and <code class="code">BioSQLRichSequenceDB</code> which wrap a Hibernate session connected to a BioSQL database and allow <code class="code">BioEntry</code> and <code class="code">RichSequence</code> objects to be read from, added to and deleted from BioSQL. See the relevant section in the chapter on BioSQL and Hibernate for details.</p></div><div class="chapter" lang="en"><div class="titlepage"><div><div><h2 class="title"><a name="d0e4206"></a>Chapter 19. BioSQL and Hibernate.</h2></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4209"></a>1. Introduction to Hibernate.</h2></div></div></div><p>BioJavaX uses Hibernate to manage persistence of BioJavaX objects to/from a database.</p><p>Hibernate works by parsing a set of mapping files that tell it how to translate between objects/fields and tables/columns. It tracks changes, writes and executes all the SQL required, and does its best to keep everything consistent and efficient so that your application never needs to talk directly to the database. It also enforces all primary/foreign-key relations between objects.</p><p>Hibernate revolves around the use of JavaBeans. A single JavaBean usually represents a single table. This is the case in BioJavaX. Each column of the table is represented by a field with a standard getter/setter pair of methods within the bean. 
Hibernate uses these get/set methods to load and save the values to and from the database. Foreign-keys are represented by defining the field as an instance of the bean representing the foreign table. One-to-many relationships are made by defining the field as an instance of a <code class="code">Collection</code>, where each member of the collection is the bean representing the foreign table.</p><p>BioJavaX contains virtually no query code, and provides no API for querying the database. This is because the API <span class="emphasis"><em>is</em></span> the object model. You do not need to have anything more than a Hibernate session to be able to read and write BioJavaX objects directly to the database.</p><p>Mapping files are provided only for BioSQL, as this is the database schema that BioJavaX was designed to imitate, but there is no reason why mapping files could not be created for other database schemas. Please feel free to contribute them if you do so.</p><p>The BioSQL mapping files use lazy-loading extensively. This means that data usually will not get loaded until you try to access it using the appropriate getter/setter method of the object. You can change this behaviour by editing the mapping files.</p><p>Queries are constructed not using SQL but using the Hibernate Query Language, or HQL. You can find out more about HQL and the Hibernate project at their website: <a href="http://www.hibernate.org/" target="_top">http://www.hibernate.org/</a></p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4234"></a>2. Hibernate object-relational mappings.</h2></div></div></div><p>The following table describes which object in BioJavaX ends up in which table in BioSQL. 
The first column is the name of the BioSQL table, the second is the mapping name to be used in HQL to query the table, and the third column is the class of object you will get when the query returns results:</p><div class="table"><a name="d0e4239"></a><p class="title"><b>Table 19.1. Hibernate object-relational mappings.</b></p><table summary="Hibernate object-relational mappings." border="1"><colgroup><col width="33%"><col width="33%"><col width="34%"></colgroup><tbody><tr><td>biodatabase</td><td>Namespace</td><td><code class="code">SimpleNamespace</code></td></tr><tr><td>taxon</td><td>Taxon</td><td><code class="code">SimpleNCBITaxon</code></td></tr><tr><td>bioentry</td><td>BioEntry</td><td><code class="code">SimpleBioEntry</code></td></tr><tr><td>taxon_name</td><td>(use properties of NCBITaxon)</td><td><code class="code">SimpleNCBITaxonName</code></td></tr><tr><td>biosequence (including the sequence data)</td><td>Sequence</td><td><code class="code">SimpleRichSequence</code></td></tr><tr><td>biosequence (excluding the sequence data)</td><td>ThinSequence</td><td><code class="code">ThinRichSequence</code></td></tr><tr><td>bioentry_relationship</td><td>BioEntryRelationship</td><td><code class="code">SimpleBioEntryRelationship</code></td></tr><tr><td>comment</td><td>Comment</td><td><code class="code">SimpleComment</code></td></tr><tr><td>dbxref</td><td>CrossRef</td><td><code class="code">SimpleCrossRef</code></td></tr><tr><td>bioentry_dbxref</td><td>(use properties of BioEntry)</td><td><code class="code">SimpleRankedCrossRef</code></td></tr><tr><td>reference</td><td>DocRef</td><td><code class="code">SimpleDocRef</code></td></tr><tr><td>bioentry_reference</td><td>(use properties of BioEntry)</td><td><code class="code">SimpleRankedDocRef</code></td></tr><tr><td>dbxref_qualifier_value</td><td>(use properties of CrossRef)</td><td><code class="code">SimpleNote</code></td></tr><tr><td>bioentry_qualifier_value</td><td>(use properties of BioEntry)</td><td><code 
class="code">SimpleNote</code></td></tr><tr><td>ontology</td><td>Ontology</td><td><code class="code">ComparableOntology</code></td></tr><tr><td>term</td><td>Term</td><td><code class="code">ComparableTerm</code></td></tr><tr><td>term_relationship</td><td>Triple</td><td><code class="code">ComparableTriple</code></td></tr><tr><td>term_synonym</td><td>(use properties of Term)</td><td><code class="code">String</code></td></tr><tr><td>term_dbxref</td><td>(use properties of Term)</td><td><code class="code">SimpleRankedCrossRef</code></td></tr><tr><td>seqfeature</td><td>Feature</td><td><code class="code">SimpleRichFeature</code></td></tr><tr><td>seqfeature_qualifier_value</td><td>(use properties of Feature)</td><td><code class="code">SimpleNote</code></td></tr><tr><td>seqfeature_dbxref</td><td>(use properties of Feature)</td><td><code class="code">SimpleRankedCrossRef</code></td></tr><tr><td>seqfeature_relationship</td><td>FeatureRelationship</td><td><code class="code">SimpleRichFeatureRelationship</code></td></tr><tr><td>location</td><td>Location</td><td><code class="code">SimpleRichLocation</code>, <code class="code">CompoundRichLocation</code>, or <code class="code">EmptyRichLocation</code></td></tr><tr><td>location_qualifier_value</td><td>(use properties of Location)</td><td><code class="code">SimpleNote</code></td></tr><tr><td>seqfeature_path</td><td>-</td><td>-</td></tr><tr><td>bioentry_path</td><td>-</td><td>-</td></tr><tr><td>term_path</td><td>-</td><td>-</td></tr></tbody></table></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4470"></a>3. 
Configuring your application to use Hibernate and BioSQL.</h2></div></div></div><p>To use Hibernate with your application, you need to do seven things:</p><div class="orderedlist"><ol type="1"><li><p>Install Hibernate.</p></li><li><p>Copy and configure the Hibernate mapping files for your database.</p></li><li><p>Create a Hibernate session and connect it to BioJavaX.</p></li><li><p>Open a transaction.</p></li><li><p>Read/write objects from the database.</p></li><li><p>End the transaction.</p></li><li><p>Close the Hibernate session.</p></li></ol></div><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e4497"></a>3.1. Installing Hibernate.</h3></div></div></div><p>Hibernate consists of a number of JAR files, downloadable from their website at <a href="http://www.hibernate.org/" target="_top">http://www.hibernate.org/</a>. You should add these JAR files to your classpath. You will also need to download the JAR file for your database's JDBC driver, and add that to your classpath too.</p><p><em><span class="remark">Note for Oracle users:</span></em> the mapping files supplied for Oracle BioSQL are designed to work only with Oracle 9i or better database and Oracle 9i or better JDBC drivers.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e4509"></a>3.2. Copying and configuring the mapping files.</h3></div></div></div><p>BioJavaX is supplied with four sets of mapping files, all of which define the mapping between BioJavaX objects and a BioSQL database. The four sets are for Oracle, PostgreSQL, MySQL, and HSQLDB.</p><p>You will find the mapping files in the source package <code class="code">org.biojavax.bio.seq.db.biosql.*</code>. 
Choose the set you want and copy them to one of the following places:</p><div class="itemizedlist"><ul type="disc"><li><p>The root of your <code class="code">.jar</code> file if your application is compiled as a JAR.</p></li><li><p>The current working directory of your application.</p></li></ul></div><p>To configure Hibernate, you must edit the copy you made of the <code class="code">hibernate.cfg.xml</code> file. Near the top is a section that looks like this:</p><pre class="programlisting">&lt;property name="connection.datasource"&gt;java:comp/env/jdbc/YOUR_JNDI_DATASOURCE_GOES_HERE&lt;/property&gt;
&lt;!-- OR... (for testing only)...
&lt;property name="connection.driver_class"&gt;oracle.jdbc.driver.OracleDriver&lt;/property&gt;
&lt;property name="connection.url"&gt;jdbc:oracle:thin:@MYSERVER:MYPORT:MYSID&lt;/property&gt;
&lt;property name="connection.username"&gt;MYUSER&lt;/property&gt;
&lt;property name="connection.password"&gt;MYPASSWORD&lt;/property&gt;
&lt;property name="connection.pool_size"&gt;20&lt;/property&gt;
--></pre><p>The exact details will vary according to which database you are using.</p><p>You will see that the default way of using Hibernate is through a JNDI datasource, usually supplied by a servlet container such as Tomcat. In this case, you should modify the <code class="code">connection.datasource</code> parameter to reflect the name of your JNDI datasource.</p><p>If you are not using JNDI, then comment that line out and uncomment the section marked 'testing only'. This section allows you to configure Hibernate to use a JDBC connection to talk to your database. Please read more about this at <a href="http://www.hibernate.org/" target="_top">http://www.hibernate.org/</a> if you intend to use JDBC directly, as there are several caveats regarding connection pooling that must be taken into consideration. The configuration shown above is recommended only for development, and is not suitable either for production code or for performance testing.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e4548"></a>3.3. Opening and closing sessions.</h3></div></div></div><p>Hibernate deals in sessions, which must be opened before a database can be used, and closed again at the end in order to bring the database back into a consistent state. Hibernate will attempt to clean-up automatically if you forget to close the session, but it is better to be safe than sorry and close it explicitly.</p><p>BioJavaX <span class="emphasis"><em>must</em></span> be told about the session in order to be able to use it to manage database singleton objects such as namespaces or taxon definitions. If you fail to connect the session to BioJavaX, you will almost certainly end up with 'unique constraint violation' exceptions being thrown left, right and centre. So be careful!</p><p>You <span class="emphasis"><em>must</em></span> connect the session to BioJavaX <span class="emphasis"><em>before</em></span> doing any operations with it at all. 
It should usually be the first or very nearly the first line in your code.</p><p>To open a Hibernate session and connect it to BioJavaX:</p><pre class="programlisting">SessionFactory sessionFactory = new Configuration().configure().buildSessionFactory(); // load Hibernate config
Session session = sessionFactory.openSession(); // open the session
RichObjectFactory.connectToBioSQL(session); // connect it to BioJavaX</pre><p>To close the Hibernate session:</p><pre class="programlisting">session.close();</pre><p>Note that the line that loads the Hibernate configuration only needs to be done once, regardless of how many sessions you open, as long as you keep a reference to your <code class="code">sessionFactory</code> somewhere handy.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e4579"></a>3.4. Transactions.</h3></div></div></div><p>If you are going to be writing objects to the database, you must use transactions. If you are only reading from the database, then transactions are recommended but not compulsory. It's probably safest to use them anyway then you needn't worry about it.</p><p>A transaction defines a unit of work. BioJavaX never commits or rolls back anything, so this is left entirely up to the user to decide when to do so. Transactions are opened with a single line of code, and rolled back or committed with another single line of code. You'd usually use them in a construct such as this:</p><pre class="programlisting">Transaction tx = session.beginTransaction(); // begin the transaction
try {
// do some stuff with BioJavaX objects here.
...
tx.commit(); // commit the transaction if all went well
} catch (Exception e) {
tx.rollback(); // roll back the transaction if something went wrong
e.printStackTrace(); // tell the user what went wrong
}</pre><p>Once a transaction object has been committed or rolled back you cannot use it any more and must open a new one.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h3 class="title"><a name="d0e4590"></a>3.5. Complete example.</h3></div></div></div><p>This example iterates through every namespace it can find in the database, and prints the names out. It also prints out the names of all the sequences in each namespace, whether they be <code class="code">BioEntry</code> or <code class="code">RichSequence</code> instances. If it finds any sequences where the name is equal to 'bloggs', it changes their description to "XYZ". It then commits any changes it has made and exits.</p><p>This example shows just how easy it is to read and write to the database. No SQL required!</p><pre class="programlisting">SessionFactory sessionFactory = new Configuration().configure().buildSessionFactory();
Session session = sessionFactory.openSession();
RichObjectFactory.connectToBioSQL(session);
Transaction tx = session.beginTransaction();
try {
    // Walk over every namespace stored in the database.
    List allNamespaces = session.createQuery("from Namespace").list();
    Iterator nsIter = allNamespaces.iterator();
    while (nsIter.hasNext()) {
        Namespace namespace = (Namespace) nsIter.next();
        System.out.println(namespace.getName());

        // Fetch the entries of this namespace; the named parameter "nsp"
        // is bound to the namespace object itself.
        Query entryQuery = session.createQuery("from BioEntry where namespace= :nsp");
        entryQuery.setParameter("nsp", namespace);
        Iterator entryIter = entryQuery.list().iterator();
        while (entryIter.hasNext()) {
            // RichSequences are BioEntrys too, so this cast covers both.
            BioEntry entry = (BioEntry) entryIter.next();
            System.out.println(" " + entry.getName());
            // Entries named "bloggs" get their description replaced with "XYZ".
            if (entry.getName().equals("bloggs")) {
                entry.setDescription("XYZ");
            }
        }
    }
    // Everything went well: commit, so the new descriptions reach the database.
    tx.commit();
    System.out.println("Changes committed.");
} catch (Exception e) {
    // Something failed: undo any pending changes and report the cause.
    tx.rollback();
    System.out.println("Changes rolled back.");
    e.printStackTrace();
}
session.close();</pre></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4605"></a>4. Flattened locations.</h2></div></div></div><p>BioSQL does not have a concept of hierarchical locations. It allows multiple locations per feature, but it does not allow locations to have sub-locations or references to other locations. This means that the hierarchical location model allowed in BioJavaX must be flattened out into a one-level collection of simple locations before it can be persisted.</p><p>This flattening is done by <code class="code">RichLocation.Tools.flatten()</code>. It only takes place at the point the user tries to save the location to the database, at which point not only does the database copy get flattened, but the in-memory one does too. The flattened location will logically represent the exact same area as the hierarchical original, but it will be constructed differently. The symbols returned by both the original and the flattened locations should be identical, as would the results of any union, intersection, contains, or overlaps operation.</p><p>The circularity of locations will be lost altogether when persisted to BioSQL.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4617"></a>5. Persisting objects.</h2></div></div></div><p>Any object created by using methods from <code class="code">RichObjectFactory</code> will automatically attach itself to the database and persist when the transaction is committed.</p><p>Any object you create directly yourself must be explicitly attached to the database using the appropriate Hibernate mapping name from the table earlier in this chapter. 
If the object you persist has properties that are other mappable objects, they will be persisted too in a cascading fashion.</p><p>For example, to persist a <code class="code">RichSequence</code> object that you have just created, do this (inside a transaction):</p><pre class="programlisting">RichSequence rs = ...; // some sequence you've made
session.saveOrUpdate("Sequence",rs); // persist the sequence</pre><p>Nothing will actually get saved to the database until you commit the transaction. If you rollback the transaction or exit without committing first, all changes will be lost.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4636"></a>6. Loading objects.</h2></div></div></div><p>Loading objects involves having to learn some HQL. The simplest cases are very easy, however it can get quite complex quite quickly. The thing you have to remember is that you are querying objects, <span class="emphasis"><em>not</em></span> the database. As such, your results may include objects that have been persisted but not committed.</p><p>The simplest HQL query is the equivalent of a SQL <code class="code">select * from sometable</code>. This is how you use it to select all namespaces from the database:</p><pre class="programlisting">Query q = session.createQuery("from Namespace");
List namespaces = q.list(); // namespaces now contains all the Namespace objects in the database</pre><p>To set constraints your query should refer to object parameters, not table columns. The following query selects the namespace that is called 'bloggs':</p><pre class="programlisting">Query q = session.createQuery("from Namespace where name=:name");
q.setString("name","bloggs");
List namespaces = q.list(); // should only contain one Namespace object. Empty if none found.
Namespace ns = (Namespace)q.uniqueResult(); // alternative notation for queries with single-row results</pre><p>You don't have to worry about foreign keys, and can just join objects directly without specifying which field to use. This query returns all <code class="code">RichSequence</code> objects that have a comment that contains the word "rubbish" with a rank of 0:</p><pre class="programlisting">Query q = session.createQuery("select rs from Sequence as rs join Comment as c where c.comment like :comment and rank=:rank");
q.setString("comment","%rubbish%"); // % symbol means match any string
q.setInteger("rank",0);
List sequences = q.list(); // a list of all matching RichSequence objects.</pre><p>This query demonstrates the (unique) case of <code class="code">BioEntry</code> and <code class="code">RichSequence</code> being represented as a single Hibernate mapping, hence no join required to access fields from either table:</p><pre class="programlisting">Query q = session.createQuery("from Sequence where length>:length and name=:name");
q.setInteger("length",200);
q.setString("name","joe");
List sequences = q.list();</pre><p>This query demonstrates how you can use other BioJavaX objects in the where clause without having to do any work at all. It returns all sequences that belong in a particular namespace:</p><pre class="programlisting">Namespace ns = ...; // get a namespace from somewhere, eg. RichObjectFactory.getDefaultNamespace()
Query q = session.createQuery("from Sequence where namespace=:namespace");
q.setParameter("namespace",ns); // plug the namespace object right in!
List sequences = q.list();</pre><p>There's no way this tutorial could ever hope to teach you all about HQL! The best thing to do is go to the Hibernate website and read up on it there: <a href="http://www.hibernate.org/" target="_top">http://www.hibernate.org/</a>.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4681"></a>7. Loading individual values from objects.</h2></div></div></div><p>You might not always want to retrieve lists of objects from the database. This query retrieves the names of sequences:</p><pre class="programlisting">Query q = session.createQuery("select name from Sequence");
List names = q.list(); // list will contain String instances containing the names</pre><p>This one returns all the lengths of sequences, which are integers. Note the use of <code class="code">sequenceLength</code>, which is the object parameter, and not <code class="code">length</code> which is the database table column name:</p><pre class="programlisting">Query q = session.createQuery("select sequenceLength from Sequence");
List lengths = q.list(); // list will contain Integer instances containing the lengths</pre></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4698"></a>8. Deleting objects.</h2></div></div></div><p>Objects can be removed from the database by calling:</p><pre class="programlisting">session.delete(obj); // where obj is some persistent object</pre><p>Only when the transaction is committed will they actually be deleted. If the transaction is rolled back, the objects will come back to life.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4707"></a>9. Auto-generating the BioSQL schema.</h2></div></div></div><p>One nice side-effect of the Hibernate mappings is that they are able to completely regenerate the database schema required to support their functionality. Whilst this does not usually create a schema that is identical to the one you started with, it will function in the same way and produce the same results, and can be handy for development or testing purposes only.</p><p>It is not recommended that the generated scripts be used for production databases without some manual checking and fine-tuning, and it is most certainly not recommended to use the generated scripts in place of any 'official' schema generation scripts such as those that are provided by the BioSQL project.</p><p>Here is the code to generate the DDL from the Hibernate mappings. It will be printed to standard output (usually the screen)</p><pre class="programlisting">Configuration cfg = new Configuration().configure();
new SchemaExport(cfg).create(true, false);</pre></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4718"></a>10. Reading/writing objects as XML.</h2></div></div></div><p>There is a bug in Hibernate which prevents this function from working 100% correctly (bug details <a href="http://opensource2.atlassian.com/projects/hibernate/browse/HHH-796" target="_top">here</a>), however the code is supplied as an example for when the bug is fixed.</p><p>The snippet below will query the database for all DocRef objects, then output an XML representation of them to standard out:</p><pre class="programlisting">Document doc = DocumentHelper.createDocument();
// Root element that will hold one child element per DocRef
Element root = doc.addElement("myRootNode"); // some arbitrary name for the XML root node
// Query through a DOM4J-mode session; the results below are handled as dom4j Elements
Session dom4jSession = session.getSession(EntityMode.DOM4J);
Query q = dom4jSession.createQuery("from DocRef");
List results = q.list();
for (Iterator i = results.iterator(); i.hasNext(); ) {
    Element rs = (Element)i.next();
    root.add(rs); // attach this DocRef element beneath the root node
}
session.close();
// Pretty print the document to System.out
OutputFormat format = OutputFormat.createPrettyPrint();
XMLWriter writer = new XMLWriter(System.out, format);
writer.write(doc);</pre><p>Reading them back and saving them to the database is similar:</p><pre class="programlisting">Document doc = ...; // open an XML document with some kind of org.dom4j.io.SAXReader
// Select every DocRef node; "docref" is the node-name from the class tag of the DocRef.hbm.xml mapping file
List docRefNodes = doc.selectNodes("//docref");
Transaction tx = session.beginTransaction();
Session dom4jSession = session.getSession(EntityMode.DOM4J);
Iterator nodeIter = docRefNodes.iterator();
while (nodeIter.hasNext()) {
    // each selected node is saved under the "DocRef" mapping name
    dom4jSession.saveOrUpdate("DocRef", nodeIter.next());
}
// nothing is written permanently until the transaction is committed
tx.commit();
session.close();</pre></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4734"></a>11. BioEntryDB and RichSequenceDB convenience wrappers</h2></div></div></div><p>BioJavaX supplies two convenience wrappers for the Hibernate+BioSQL combination which allow simple read/write access of <code class="code">BioEntry</code> and <code class="code">RichSequence</code> objects directly to/from a BioSQL database. They are designed for convenience not flexibility, so it is always best to use the full method outlined in the rest of this chapter, but if you are in a hurry then these should work just fine.</p><p>These two wrappers depend on your database having unique values in the <code class="code">name</code> column of the <code class="code">BioEntry</code> table. If this is not the case, then they will not work for you and you should use the full method instead.</p><p>If you use <code class="code">BioSQLBioEntryDB</code> then the objects you get from the database are <code class="code">BioEntry</code> objects and will not have the sequence data attached to them. This may increase performance if you are dealing with large sequences and do not need the sequence data. <code class="code">BioSQLRichSequenceDB</code> loads <code class="code">RichSequence</code> objects which means that sequence data comes along for the ride.</p><p>The wrappers allow sequences to be read from, added to and deleted from the database in a single command. The easiest way to demonstrate this is by a code example in which a sequence is read from the database, another sequence is added, and a third is deleted:</p><pre class="programlisting">Session sess = ...; // connect to BioSQL using Hibernate and establish a session
RichObjectFactory.connectToBioSQL(sess); // bind BioJavaX to the Hibernate session
RichSequenceDB db = new BioSQLRichSequenceDB(sess); // create the RichSequenceDB wrapper around the Hibernate session
RichSequence seq1 = db.getRichSequence("joe"); // load the sequence where name='joe'
RichSequence seq2 = ...; // create a sequence somehow
db.addRichSequence(seq2); // add it to the database
db.removeRichSequence("bloggs"); // delete the sequence where name='bloggs'
sess.close(); // disconnect from the database</pre><p>The code above is non-transactional, but it can be made to be transactional by doing something like this (based on the example in the JavaDocs for the <code class="code">Session</code> object in Hibernate):</p><pre class="programlisting">Session sess = ...; // connect to BioSQL using Hibernate and establish a session
RichObjectFactory.connectToBioSQL(sess); // bind BioJavaX to the Hibernate session
RichSequenceDB db = new BioSQLRichSequenceDB(sess); // create the RichSequenceDB wrapper around the Hibernate session
Transaction tx = null; // must start as null: the catch block tests tx, and javac rejects reading a variable that may be unassigned
try {
tx = sess.beginTransaction(); // begin the transaction
// do some work inside the transaction, eg. db.addRichSequence(seq)
tx.commit(); // commit the transaction
} catch (Exception e) {
if (tx!=null) tx.rollback(); // rollback in case of error
throw e;
} finally {
sess.close(); // disconnect from the database
}</pre></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4778"></a>12. BioSQLFeatureFilter</h2></div></div></div><p>You can apply any <code class="code">FeatureFilter</code> to a <code class="code">BioSQLRichSequenceDB</code> instance using the <code class="code">filter()</code> method, just like you could in the existing <code class="code">SequenceDB</code> classes. However, <code class="code">BioSQLRichSequenceDB</code> handles it slightly differently.</p><p>First, it attempts to convert every member of the <code class="code">FeatureFilter</code> into an equivalent <code class="code">BioSQLFeatureFilter</code> (if it isn't already one), which knows how to apply that filter directly to the database. The <code class="code">BioSQLFeatureFilter</code> interface provides two methods in order to allow this - one returns a Hibernate <code class="code">Criterion</code> instance which represents the query required to select features of that kind, and the other returns a Map containing any table aliases the <code class="code">Criterion</code> uses.</p><p>After having run a database query on any convertible <code class="code">FeatureFilter</code>s, it then passes all the results through the <code class="code">accept()</code> method of the original <code class="code">FeatureFilter</code> to see if those features should be returned as results. 
This is because not all <code class="code">FeatureFilter</code>s can be converted into <code class="code">BioSQLFeatureFilter</code>s, and so it cannot be guaranteed that the <code class="code">Criterion</code> from the first step will have removed all ineligible candidates.</p><p>If you wish to write your own <code class="code">FeatureFilter</code> implementations, you should use <code class="code">BioSQLFeatureFilter</code> wherever possible in order to optimise the first (faster) step of this process and remove as much work as possible from the second (slower) step.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4843"></a>13. <code class="code">ThinSequence</code>s and <code class="code">Feature</code>s</h2></div></div></div><p>Usually when you access <code class="code">Feature</code> objects, you aren't interested in the entire sequence data for the sequence the feature belongs to. BioJavaX will therefore only load a <code class="code">ThinRichSequence</code> to represent the <code class="code">Feature</code>'s parent sequence if you load the <code class="code">Feature</code> separately (eg. through the <code class="code">FeatureFilter</code> mechanism). The <code class="code">ThinSequence</code> loads sequence data direct from the database every time it is asked for it, and only loads the portion that was requested. Sequence data is not cached, so they are very memory-efficient if not a little slower to work with. 
If you wish to edit the parent sequence of a <code class="code">Feature</code>, you must first directly load from the database yourself the true <code class="code">SimpleRichSequence</code> object and edit that instead.</p><p>To load the full <code class="code">SimpleRichSequence</code> version of a <code class="code">ThinRichSequence</code>, the following simple call will work:</p><pre class="programlisting">RichSequence simpleSeq = db.fullyLoadRichSequence(thinSeq); // where db is an instance of BioSQLRichSequenceDB</pre><p>This two-level sequence loading is made possible by the <code class="code">RichSequenceHandler</code>, which is used by <code class="code">RichSequence</code> implementations to load sequence data on demand. The default implementation is the <code class="code">DummyRichSequenceHandler</code>, which simply passes all calls on to the internal <code class="code">SymbolList</code> inside each <code class="code">RichSequence</code> object. By changing this to a <code class="code">BioSQLRichSequenceHandler</code>, calls are converted into HQL statements and executed against the database instead.</p></div></div><div class="chapter" lang="en"><div class="titlepage"><div><div><h2 class="title"><a name="d0e4908"></a>Chapter 20. Genetic Algorithms.</h2></div></div></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4911"></a>1. Overview.</h2></div></div></div><p>With the introduction of the <code class="code">org.biojavax.ga</code> package it is now possible to generate Genetic Algorithms using BioJava.</p><p>Genetic Algorithms are a class of simulation, optimization or problem solving algorithms that attempt to evolve a solution to a problem. The solution being evolved is encoded as a 'chromosome' which is typically a binary string although other encodings are possible. At each generation (iteration) a population of chromosomes is available. 
Like real chromosomes, they mutate and recombine with some frequency at each generation. Critically, after each round of potential mutation and recombination the chromosomes that encode the best solution are favoured for replication. Thus, there is a trend towards increasingly good solutions in the population.</p><p>The functions and stopping criteria are all Java interfaces so custom implementations are possible. The only requirement for the <code class="code">GeneticAlgorithm</code> is that it has a <code class="code">Population</code>, a <code class="code">MutationFunction</code>, a <code class="code">CrossOverFunction</code>, a <code class="code">FitnessFunction</code>, a <code class="code">SelectionFunction</code> and a <code class="code">GAStoppingCriteria</code>. The actual implementations used are interchangeable. Further, the 'chromosome(s)' of the <code class="code">Organism</code>s in a <code class="code">Population</code> are just BioJava <code class="code">SymbolList</code>s and any <code class="code">Alphabet</code> could be used to encode a solution.</p></div><div class="section" lang="en"><div class="titlepage"><div><div><h2 class="title" style="clear: both"><a name="d0e4956"></a>2. Example listing.</h2></div></div></div><p>The example below demonstrates a very simple genetic algorithm constructed using the GA framework. The framework is designed to be very flexible and uses an interchangeable parts philosophy.</p><p>The core interface is the <code class="code">GeneticAlgorithm</code> with its default implementation, <code class="code">SimpleGeneticAlgorithm</code>. The <code class="code">GeneticAlgorithm</code> takes any <code class="code">Population</code> of <code class="code">Organism</code>s and iterates through the generations. 
At each step a <code class="code">MutationFunction</code> and a <code class="code">CrossOverFunction</code> are responsible for introducing variation.</p><p>A <code class="code">FitnessFunction</code> is responsible for determining the fitness of each <code class="code">Organism</code> in the context of its parent <code class="code">Population</code>. Because fitness can be calculated in the context of a <code class="code">Population</code> it is possible to model competition within a <code class="code">Population</code>. The <code class="code">Organism</code>s to be selected for replication are nominated by the <code class="code">SelectionFunction</code> usually on the basis of their fitness.</p><p>The <code class="code">GeneticAlgorithm</code> will stop iterating when the <code class="code">GAStoppingCriteria</code> tells it to. This may be when a suitable solution has been reached or after a finite number of generations.</p><pre class="programlisting">public class GADemo{
/**
 * Builds a population of 100 organisms with random binary chromosomes and
 * runs a SimpleGeneticAlgorithm on it until DemoStopping tells it to stop.
 */
public static void main(String[] args) throws Exception {
    // header row for the comma-separated progress output
    System.out.println("gen,average_fitness,best_fitness");

    // chromosome symbols are sampled from a uniform Distribution over the binary Alphabet
    Distribution uniformBinary = new UniformDistribution(GATools.getBinaryAlphabet());

    // fill a fresh population with 100 organisms
    Population population = new SimplePopulation("demo population");
    int created = 0;
    while (created < 100) {
        Organism organism = new SimpleOrganism("organism" + created);
        // each organism carries a single chromosome of 100 randomly sampled symbols
        SymbolList randomChromosome = new SimpleSymbolList(
                DistributionTools.generateSequence("", uniformBinary, 100));
        organism.setChromosomes(new SymbolList[]{ randomChromosome });
        population.addOrganism(organism);
        created++;
    }

    // selection: proportional selection, scored by DemoFitness
    SelectionFunction selection = new ProportionalSelection();
    selection.setFitnessFunction(new DemoFitness());

    // recombination: at most one cross over per chromosome, uniform probability 0.01
    CrossOverFunction crossOver = new SimpleCrossOverFunction();
    crossOver.setMaxCrossOvers(1);
    crossOver.setCrossOverProbs(new double[]{0.01});

    // mutation: uniform probability 0.0001, with the standard mutation
    // distribution over the binary Alphabet as its spectrum
    MutationFunction mutation = new SimpleMutationFunction();
    mutation.setMutationProbs(new double[]{0.0001});
    mutation.setMutationSpectrum(
            GATools.standardMutationDistribution(GATools.getBinaryAlphabet()));

    // assemble the algorithm and run it until the criteria of DemoStopping are met
    GeneticAlgorithm genAlg = new SimpleGeneticAlgorithm(population, mutation, crossOver, selection);
    genAlg.run(new DemoStopping());
}
/**
 * Demo GAStoppingCriteria: reports progress for each generation and stops
 * once some organism reaches a fitness of at least 75.0.
 */
static class DemoStopping implements GAStoppingCriteria {
    /**
     * Prints one "generation,average fitness,best fitness" row for the
     * current population and decides whether the algorithm should halt.
     *
     * @param genAlg the algorithm being run
     * @return true if the best fitness found is 75.0 or more, false otherwise
     */
    public boolean stop(GeneticAlgorithm genAlg) {
        System.out.print(genAlg.getGeneration() + ",");

        Population population = genAlg.getPopulation();
        FitnessFunction scorer = genAlg.getSelectionFunction().getFitnessFunction();

        // accumulate the total and the maximum fitness over all organisms
        double fitnessSum = 0.0;
        double top = 0.0;
        Iterator members = population.organisms();
        while (members.hasNext()) {
            double score = scorer.fitness((Organism) members.next(), population, genAlg);
            top = Math.max(score, top);
            fitnessSum += score;
        }

        // average fitness, then best fitness, completing the output row
        System.out.print((fitnessSum / (double) population.size()) + ",");
        System.out.println(top);

        // stop only when the 75.0 threshold has been reached
        boolean fitEnough = top >= 75.0;
        if (fitEnough) {
            System.out.println("Organism found with Fitness of 75%");
        }
        return fitEnough;
    }
}
/**
 * A fitness function based on the most "one" rich chromosome in the organism.
 * The score is the number of positions in that chromosome holding GATools.one().
 */
static class DemoFitness implements FitnessFunction {
    /**
     * Scores an organism by its best chromosome.
     *
     * @param o the organism whose chromosomes are inspected
     * @param p the population (not read by this implementation)
     * @param genAlg the algorithm (not read by this implementation)
     * @return the highest count of "one" symbols found in any single
     *         chromosome of o, or 0.0 if it has no chromosomes
     */
    public double fitness(Organism o, Population p, GeneticAlgorithm genAlg) {
        // read the chromosome array once rather than on every loop test and access
        SymbolList[] chromosomes = o.getChromosomes();
        double bestfit = 0.0;
        for (int i = 0; i < chromosomes.length; i++) {
            SymbolList csome = chromosomes[i];
            int len = csome.length();
            double fit = 0.0;
            // positions are visited from 1 to length() inclusive
            for (int j = 1; j <= len; j++) {
                if (csome.symbolAt(j) == GATools.one()) {
                    fit++;
                }
            }
            bestfit = Math.max(fit, bestfit);
        }
        return bestfit;
    }
}
}</pre></div></div></div></body></html>