[Biojava-dev] patch: GenbankSequenceDB, Retrieved sequences in batch mode

Laurent Jourdren jourdren@dsvidf.cea.fr
Thu, 16 Jan 2003 12:05:42 +0100


--------------Boundary-00=_I50TASCQH5IDH2N8Y303
Content-Type: text/plain;
  charset="iso-8859-1"
Content-Transfer-Encoding: quoted-printable


	Hello,

	I made a patch to retrieve sequence objects from Entrez 'en masse', rather
than one at a time, using the NCBI Entrez Programming Utilities. In this case, I
use the EFetch method with HTTP POST.


	Laurent.

--=20
Laurent Jourdren
Service de Génomique Fonctionnelle
Commissariat à L'Energie Atomique
2 rue Gaston Crémieux - CP 22
91057 Evry Cedex
Tel: 01.60.87.34.76
email: jourdren@dsvidf.cea.fr
--------------Boundary-00=_I50TASCQH5IDH2N8Y303
Content-Type: text/x-diff;
  charset="us-ascii";
  name="GenbankSequenceDB.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment; filename="GenbankSequenceDB.patch"

*** GenbankSequenceDB.java.old	2003-01-16 11:35:41.000000000 +0100
--- GenbankSequenceDB.java	2003-01-16 11:48:05.000000000 +0100
***************
*** 22,27 ****
--- 22,29 ----
  
  import java.net.*;
  import java.io.*;
+ import java.util.Set;
+ import java.util.Iterator;
  import org.biojava.bio.symbol.*;
  import org.biojava.bio.seq.io.*;
  import org.biojava.bio.seq.DNATools;
***************
*** 29,34 ****
--- 31,39 ----
  import org.biojava.bio.seq.Sequence;
  import org.biojava.bio.BioException;
  import org.biojava.bio.seq.SequenceIterator;
+ import org.biojava.bio.seq.db.SequenceDB;
+ import org.biojava.bio.seq.db.HashSequenceDB;
+ import org.biojava.utils.ChangeVetoException;
  
  /**
   * This class contains functions accessing DNA sequences in Genbank format.
***************
*** 42,47 ****
--- 47,54 ----
    private static String DBName="Genbank";//predefined the database name -- Genbank
    private boolean IOExceptionFound=false;//check if IOException is found
    private boolean ExceptionFound=false;//check if any exception is found
+   private static final String urlBatchSequences =
+     "http://www.ncbi.nlm.nih.gov:80/entrez/eutils/efetch.fcgi";
    
    static 
    {
***************
*** 135,138 ****
--- 142,260 ----
    {
  	return ExceptionFound;  
    }
+ 
+   /**
+    * Builds the text of the HTTP/1.0 POST request (headers plus form body) that
+    * asks for a batch of sequences with db=nucleotide and rettype=gb.
+    * @param url URL of the request; its path and host are written into the headers
+    * @param list Set of sequence identifiers; each element is cast to String
+    * @return The complete POST request: header lines, an empty line, then the body.
+    */
+   private String makeBatchRequest(URL url, Set list) {
+     // Form body: the fixed query parameters followed by the sequence ids.
+     StringBuffer params = new StringBuffer();
+     params.append("db=nucleotide&rettype=gb&id=");
+     // Every id, including the last one, is followed by a comma.
+     for (Iterator i = list.iterator(); i.hasNext();) {
+       String idSequence = (String) i.next();
+       params.append(idSequence);
+       params.append(",");
+     }
+     // Request line and headers; Content-Length is params.length(), a character count.
+     StringBuffer header = new StringBuffer();
+     header.append("POST ");
+     header.append(url.getPath());
+     header.append(
+       " HTTP/1.0\r\n"
+         + "Connection: close\r\n"
+         + "Accept: text/html, text/plain\r\n"
+         + "Host: ");
+ 
+     header.append(url.getHost());
+     header.append(
+       "\r\n"
+         + "User-Agent: Biojava/GenbankSequenceDB\r\n"
+         + "Content-Type: application/x-www-form-urlencoded\r\n"
+         + "Content-Length: ");
+     header.append(params.length());
+     header.append("\r\n\r\n");
+     // The header already ends with the empty line, so the body is appended directly.
+     StringBuffer request = new StringBuffer();
+     request.append(header);
+     request.append(params);
+ 
+     return request.toString();
+   }
+ 
+   /**
+    * Retrieves sequences from Genbank by calling getSequences(list, null).
+    * @param list Set of NCBI sequence numbers (GI), accession, accession.version,
+    * fasta or seqid identifiers.
+    * @return The database object (HashSequenceDB) with the downloaded sequences.
+    * @throws BioException if the two-argument getSequences call throws it.
+    */
+   public SequenceDB getSequences(Set list) throws BioException {
+ 
+     return getSequences(list, null);
+   }
+ 
+   /**
+    * Fetches sequences from Genbank in one batch POST request and stores them.
+    * @param list Set of NCBI sequence numbers (GI), accession, accession.version,
+    * fasta or seqid identifiers.
+    * @param database Where to store the sequences. If database is null, a new
+    * HashSequenceDB object is used.
+    * @return The database object with the downloaded sequences.
+    * @throws BioException if fetching, reading or storing the sequences fails.
+    */
+   public SequenceDB getSequences(Set list, SequenceDB database)
+     throws BioException {
+ 
+     if (database == null)
+       database = new HashSequenceDB();
+ 
+     try {
+ 
+       URL url = new URL(urlBatchSequences);
+       int port = url.getPort();
+       String hostname = url.getHost();
+ 
+       // Open a plain socket to the host and port of the URL, plus its streams.
+       Socket s = new Socket(hostname, port);
+       // NOTE(review): neither the socket nor its streams are closed in this method.
+       InputStream sin = s.getInputStream();
+       BufferedReader fromServer =
+         new BufferedReader(new InputStreamReader(sin));
+       OutputStream sout = s.getOutputStream();
+       PrintWriter toServer = new PrintWriter(new OutputStreamWriter(sout));
+ 
+       // Send the POST request to the server.
+       toServer.print(makeBatchRequest(url, list));
+       toServer.flush();
+       // Skip the response headers: read lines until the first empty one is seen.
+       // NOTE(review): readLine() runs before finEntete is tested, so one line past the empty line is consumed too.
+       boolean finEntete = false;
+       for (String l = null;
+         ((l = fromServer.readLine()) != null) && (!finEntete);
+         )
+         if (l.equals(""))
+           finEntete = true;
+       // Hand the rest of the stream to SeqIOTools.readGenbank and store each sequence it yields.
+       SequenceIterator seqI = SeqIOTools.readGenbank(fromServer);
+ 
+       while (seqI.hasNext())
+         database.addSequence(seqI.nextSequence());
+       // NOTE(review): each catch below throws a new BioException with a fixed message; the caught exception is not passed on.
+     } catch (MalformedURLException e) {
+       throw new BioException("Exception found in GenbankSequenceDB -- getSequences");
+     } catch (IOException e) {
+       throw new BioException("Exception found in GenbankSequenceDB -- getSequences");
+     } catch (BioException e) {
+       throw new BioException("Exception found in GenbankSequenceDB -- getSequences");
+     } catch (ChangeVetoException e) {
+       throw new BioException("Exception found in GenbankSequenceDB -- getSequences");
+     }
+ 
+     return database;
+   }
  }

--------------Boundary-00=_I50TASCQH5IDH2N8Y303--