[Biojava-l] GenBank parsing change
Simon Foote
simon.foote@nrc.ca
Thu, 03 Jan 2002 09:58:12 -0500
No objections here.
I made the change, and ran my script that generates a blast database of
all the bacterial proteins in GenBank using the release 127 gbbct files
and it worked fine. It also worked fine on all the daily update files
up to today.
I only had to make a slight modification to my script to catch a few
records that, for some unexplained reason, didn't have a type (i.e.,
DNA, RNA, etc.) in the LOCUS line, thus causing an incorrect number of
tokens exception.
Aside from that it parsed all the files flawlessly.
Cheers,
Simon Foote
--
Bioinformatics Specialist
Institute for Biological Sciences
National Research Council of Canada
[T] 613-991-4342 [F] 613-952-9092
Scott Markel wrote:
>NCBI has changed the formatting of the GenBank LOCUS line for release
>127, released on 15 December. The change allows for larger sequences
>and longer locus names. It also allows a tokenization based parsing
>rather than a column based parsing. See section 1.4.1 ("LOCUS line
>format change : to accomodate longer names and sequences") in
>ftp://ftp.ncbi.nih.gov/genbank/release.notes/gb126.release.notes for
>details.
>
>We thought about changing the parsing to handle only the new format. It
>would be nice to be able to just tokenize on white space and get rid of
>the old format, but that would have caused us, and presumably others,
>some maintenance headaches since the old files still exist. The
>approach we took was to support both formats.
>
>To handle this issue for our customers we've made the following change
>to the GenbankContext class in biojava/bio/seq/io/GenbankFormat.java.
>
>Any objections to us checking this change in so that others can use it
>too?
>
>Scott
>
>PS We haven't yet made the corresponding change to the writer, but there
>we would just support the new format, just like NCBI does.
>
>-----------------------------------------------------------------------
>
>current BioJava code -
>
> private void processHeaderLine(String line) // dispatches on the line's leading tag and updates headerTag/headerTagText
> throws ParseException
> {
> if(line.startsWith(GenbankFormat.LOCUS_TAG))
> {
> // the LOCUS line is a special case because it contains the
> // locus, size, molecule type, GenBank division, and the date
> // of last modification.
> this.saveSeqAnno(); // hand any pending tag/text to the listener before starting LOCUS
> StringTokenizer lineTokens = new StringTokenizer(line); // whitespace-delimited fields
> headerTag = lineTokens.nextToken(); // first token: the LOCUS keyword
> headerTagText = new StringBuffer(lineTokens.nextToken()); // second token: the locus name
>
> this.saveSeqAnno(); // emit LOCUS, then start collecting the size
> headerTag = GenbankFormat.SIZE_TAG;
> headerTagText = new StringBuffer(lineTokens.nextToken());
> // read past 'bp'
> lineTokens.nextToken();
>
> // At this point there are three optional fields, strand number,
> // type, and circularity.
> if(line.charAt(34) != ' ') // index 34 non-blank: a strand field is present. NOTE(review): fixed offsets, no length check
> {
> this.saveSeqAnno();
> headerTag = GenbankFormat.STRAND_NUMBER_TAG;
> if(line.charAt(37) == ' ') // blank at index 37: the strand is a token of its own
> headerTagText = new StringBuffer(lineTokens.nextToken());
> else // Both STRAND and TYPE fields are in the token
> {
> String fields = lineTokens.nextToken();
> headerTagText = new StringBuffer(fields.substring(0,3)); // first three characters are the strand part
>
> this.saveSeqAnno();
> headerTag = GenbankFormat.TYPE_TAG;
> headerTagText = new StringBuffer(fields.substring(3)); // remainder of the token is the type
> }
> }
> else
> if(line.charAt(37) != ' ') // no strand field; index 37 non-blank means a type field is present
> {
> this.saveSeqAnno();
> headerTag = GenbankFormat.TYPE_TAG;// Check this; may be under PROP
> headerTagText = new StringBuffer(lineTokens.nextToken());
> }
>
> if(line.charAt(43) != ' ') // index 43 non-blank: a circularity field is present
> {
> this.saveSeqAnno();
> headerTag = GenbankFormat.CIRCULAR_TAG;
> headerTagText = new StringBuffer(lineTokens.nextToken());
> }
>
> this.saveSeqAnno();
> headerTag = GenbankFormat.DIVISION_TAG; // May be under PROP
> headerTagText = new StringBuffer(lineTokens.nextToken());
>
> this.saveSeqAnno();
> headerTag = GenbankFormat.DATE_TAG; // last token read from the LOCUS line; emitted by the next saveSeqAnno()
> headerTagText = new StringBuffer(lineTokens.nextToken());
> }
> else if(line.startsWith(GenbankFormat.VERSION_TAG))
> {
> // VERSION line is a special case because it contains both
> // the VERSION field and the GI number
> this.saveSeqAnno();
> StringTokenizer lineTokens = new StringTokenizer(line);
> headerTag = lineTokens.nextToken();
> headerTagText = new StringBuffer(lineTokens.nextToken());
>
> String nextToken = lineTokens.nextToken(); // NOTE(review): no hasMoreTokens() guard; throws if the line has no third token
> if(nextToken.startsWith(GenbankFormat.GI_TAG))
> {
> this.saveSeqAnno();
> headerTag = GenbankFormat.GI_TAG; // Possibly should be UID?
> headerTagText =
> new StringBuffer(nextToken.substring(3)); // drops the first three characters; presumably the "GI:" prefix - confirm
> }
> }
> else if (hasHeaderTag(line))
> { // line has a header tag
> this.saveSeqAnno();
> headerTag = line.substring(0, TAG_LENGTH).trim(); // tag occupies the first TAG_LENGTH characters
> headerTagText = new StringBuffer(line.substring(TAG_LENGTH)); // the rest of the line is the tag's text
> }
> else
> { // keep appending tag text value
> headerTagText.append(" " + line.substring(TAG_LENGTH)); // NOTE(review): substring throws if the line is shorter than TAG_LENGTH
> }
> }
>
>-----------------------------------------------------------------------
>
>modified code -
>
> /**
>  * Processes one header line, updating the pending header tag and its text.
>  * LOCUS and VERSION lines get special treatment; any other line either
>  * starts a new tag or continues the text of the current one.
>  *
>  * @param line the header line to process.
>  * @throws ParseException Thrown when an error occurs parsing the line
>  */
> private void processHeaderLine(String line)
>     throws ParseException
> {
>     if (line.startsWith(GenbankFormat.LOCUS_TAG))
>     {
>         // Genbank changed the format of the Locus line for release 127.
>         // The new format is incompatible with the old, so pick a parser.
>         if (this.isLocusLinePre127(line))
>         {
>             this.parseLocusLinePre127(line);
>         }
>         else
>         {
>             this.parseLocusLinePost127(line);
>         }
>         return;
>     }
>
>     if (line.startsWith(GenbankFormat.VERSION_TAG))
>     {
>         // The VERSION line carries both the VERSION field and,
>         // optionally, the GI number.
>         this.saveSeqAnno();
>         StringTokenizer versionTokens = new StringTokenizer(line);
>         headerTag = versionTokens.nextToken();
>         headerTagText = new StringBuffer(versionTokens.nextToken());
>
>         if (!versionTokens.hasMoreTokens())
>         {
>             return;
>         }
>         String giToken = versionTokens.nextToken();
>         if (giToken.startsWith(GenbankFormat.GI_TAG))
>         {
>             this.saveSeqAnno();
>             headerTag = GenbankFormat.GI_TAG; // Possibly should be UID?
>             headerTagText = new StringBuffer(giToken.substring(3));
>         }
>         return;
>     }
>
>     if (hasHeaderTag(line))
>     {
>         // The line opens a new header tag.
>         this.saveSeqAnno();
>         headerTag = line.substring(0, TAG_LENGTH).trim();
>         headerTagText = new StringBuffer(line.substring(TAG_LENGTH));
>         return;
>     }
>
>     // gbpri1.seq (Release 125.0) has a line which is not
>     // TAG_LENGTH long. Patch offered by Ron Kuhn (rkuhn@cellomics.com)
>     if (line.length() >= TAG_LENGTH)
>     {
>         // Continuation line: keep appending to the current tag's text.
>         headerTagText.append(" " + line.substring(TAG_LENGTH));
>     }
> }
>
> /**
> * Checks which version of the locus line format is used. The algorithm
> * switches on the size of the line; <75 means pre-127, otherwise it's 127.
> *
> * @param theLine the line to check the format of.
> * @return TRUE if the line is in Genbank release 126 or earlier format.
> * FALSE otherwise
> */
> private boolean isLocusLinePre127(String theLine)
> {
> return (theLine.length() < 75);
> }
>
> /**
> * Parses the locus line assuming it is in pre release 127 format.
> *
> * @param theLine Locus line to parse.
> * @throws ParseException If the line is too short.
> */
> private void parseLocusLinePre127(String theLine)
> throws ParseException
> {
> if (theLine.length() < 73)
> {
> throw new ParseException("LOCUS line too short [" + theLine
>+ "]");
> }
>
> saveSeqAnno2(GenbankFormat.LOCUS_TAG, theLine.substring(12, 22));
> saveSeqAnno2(GenbankFormat.SIZE_TAG, theLine.substring(22, 29));
> saveSeqAnno2(GenbankFormat.STRAND_NUMBER_TAG, theLine.substring(33,
>35));
> saveSeqAnno2(GenbankFormat.TYPE_TAG, theLine.substring(36, 41));
> saveSeqAnno2(GenbankFormat.CIRCULAR_TAG, theLine.substring(42,
>52));
> saveSeqAnno2(GenbankFormat.DIVISION_TAG, theLine.substring(52,
>55));
> saveSeqAnno2(GenbankFormat.DATE_TAG, theLine.substring(62, 73));
> }
>
> /**
>  * Parses the locus line assuming it is in post release 127 format, i.e.
>  * as whitespace-separated tokens rather than fixed columns.
>  *
>  * A complete line has eight tokens: the LOCUS keyword, locus name,
>  * length, "bp", strand/type, circularity, division and date. Lines with
>  * seven tokens are also accepted; they are assumed to be records that
>  * carry no molecule type (DNA, RNA, etc.), in which case no strand or
>  * type is reported.
>  *
>  * @param theLine Locus line to parse.
>  * @throws ParseException If the line is too short, or does not split
>  *         into seven or eight tokens.
>  */
> private void parseLocusLinePost127(String theLine)
>     throws ParseException
> {
>     if (theLine.length() < 79)
>     {
>         throw new ParseException("LOCUS line too short [" + theLine + "]");
>     }
>
>     StringTokenizer locusTokens = new StringTokenizer(theLine);
>     // Remember the count up front: countTokens() shrinks as tokens are read.
>     int tokenCount = locusTokens.countTokens();
>     if (tokenCount != 7 && tokenCount != 8)
>     {
>         throw new ParseException("LOCUS line incorrectly tokenized [" + theLine + "]");
>     }
>     // LOCUS tag; not stored
>     locusTokens.nextToken();
>     // Locus name
>     saveSeqAnno2(GenbankFormat.LOCUS_TAG, locusTokens.nextToken());
>     // Sequence length
>     saveSeqAnno2(GenbankFormat.SIZE_TAG, locusTokens.nextToken());
>     // "bp"; not stored
>     locusTokens.nextToken();
>     // Strand information and molecule type share a single token, with the
>     // optional strand part separated from the type by '-'. With only
>     // seven tokens the whole strand/type token is taken to be missing.
>     if (tokenCount == 8)
>     {
>         String strandString = locusTokens.nextToken();
>         StringTokenizer strandTokens = new StringTokenizer(strandString, "-");
>         if (strandTokens.countTokens() > 1)
>         {
>             saveSeqAnno2(GenbankFormat.STRAND_NUMBER_TAG, strandTokens.nextToken());
>         }
>         saveSeqAnno2(GenbankFormat.TYPE_TAG, strandTokens.nextToken());
>     }
>     // Circularity
>     saveSeqAnno2(GenbankFormat.CIRCULAR_TAG, locusTokens.nextToken());
>     // Division code
>     saveSeqAnno2(GenbankFormat.DIVISION_TAG, locusTokens.nextToken());
>     // Date in dd-MMM-yyyy format
>     saveSeqAnno2(GenbankFormat.DATE_TAG, locusTokens.nextToken());
> }
>
> /**
>  * Hands the pending header tag and its text to the listener, then clears
>  * both. Does nothing when no tag is pending (the tag is empty).
>  *
>  * @throws ParseException Thrown when an error occurs parsing the file
>  */
> private void saveSeqAnno()
>     throws ParseException
> {
>     if (headerTag.equals(""))
>     {
>         // Nothing collected since the last save.
>         return;
>     }
>
>     listener.addSequenceProperty(headerTag, headerTagText.toString());
>     headerTag = "";
>     headerTagText = new StringBuffer("");
> }
>
> /**
>  * Records a header tag together with its value. The value is trimmed
>  * first; if nothing remains the call is a no-op. Otherwise the previously
>  * pending tag is saved and this tag/value pair becomes the pending one.
>  *
>  * @param tag The tag to add
>  * @param value The value of the associated tag
>  * @throws ParseException Thrown when an error occurs parsing the file
>  */
> private void saveSeqAnno2(String tag, String value)
>     throws ParseException
> {
>     String trimmed = value.trim();
>     if (trimmed.length() == 0)
>     {
>         // Blank field: leave the pending tag untouched.
>         return;
>     }
>
>     this.saveSeqAnno();
>     headerTag = tag;
>     headerTagText = new StringBuffer(trimmed);
> }
>
>-----------------------------------------------------------------------
>