[Bioperl-l] GenBankParser comparison to bioperl parser

Lincoln Stein lstein@cshl.org
Thu, 12 Sep 2002 09:07:02 -0400


Hi John,

I love benchmarks.  Particularly ones that challenge people to do better.

Just for comparison, here's the same script written with Boulder::Genbank (you 
can get Boulder from CPAN if you need it).  Please let me know how it does on 
your test system.

Lincoln


	#!/usr/bin/perl
	use strict;
	use Boulder::Genbank;
	use constant FILE =>'gunzip -c /am/db/gb-ff/gbbct1.seq.gz |';
	# use constant FILE =>'gunzip -c test.gb.gz |';

	my $gb = Boulder::Genbank->newFh(-accessor=>'File',
								 -path    => FILE);
	while (my $entry = <$gb>) {
		  my $accession  = $entry->Accession;
		  my ($locus)    = $entry->Locus =~ /^(\S+)/;
		  my $definition = $entry->Definition;
		  print ">gi|$accession|$locus|$definition\n";
		  my $sequence = $entry->Sequence;
		  $sequence =~ s/(.{1,60})/$1\n/g;  # wrap
		  print $sequence;
	}

Lincoln

On Wednesday 11 September 2002 3:58 pm, John Kloss wrote:
> I was asked to provided some coarses tests demonstrating the features and
> speed of the GenBankParser in comparsion to the parser provided in bioperl.
> I just learned how to use the bioperl interface, so please let me know if I
> did anything stupid.
>
> In all tests I used the same flat file
>
> 	bct1.seq	size gzipped 73679187 bytes.
>
> All tests run on a realtively loaded Sun Enterprise 4500 8 x 400Mhz sparc
> with 10Gbs ram.
>
> 1st test.  Convert GenBank flat file to fasta.
>
> ########################################
> Bioperl parser code:
>
> #!/usr/local/bin/perl -w
>
> use Bio::SeqIO;
>
> $in = Bio::SeqIO->newFh(
>         '-file'         => "gzip -cd /am/db/gb-ff/gbbct1.seq.gz |",
>         '-format'       => 'Genbank' );
>
> $out = Bio::SeqIO->newFh(
>         '-file'         => "> output1.txt",
>         '-format'       => 'fasta' );
>
> print $out $_ while <$in>;
>
> ########################################
> Time:
>
> real    17m4.008s
> user    15m19.500s
> sys     0m20.850s
>
>
> ########################################
> GenBankParser code:
>
> #!/usr/local/bin/perl -w
>
> use GenBankParser qw( DEFINITION LOCUS ORIGIN VERSION );
>
> open( FH, "gzip -cd /am/db/gb-ff/gbbct1.seq.gz |" );
>
> (new GenBankParser)->parse_file( \*FH, sub {
>     my $parser = shift;
>
>     print ">gi|", $parser->VERSION->gi,
> 	  "|gb|", $parser->VERSION->accession,
> 	  "."   , $parser->VERSION->version,
> 	  "|"   , $parser->LOCUS->locus,
> 	  " "   , $parser->DEFINITION, "\n";
>
>     my   $seq   = $parser->ORIGIN;
>     pos( $seq ) = 0;
>
>     print $1, "\n" while $seq =~ m/\G (.{1,60}) /gx;
> } );
>
>
> ########################################
> Time:
>
> real    1m35.797s
> user    1m13.600s
> sys     0m6.350s
>
>
> Next program, dump out the locus identifer, accession number, molecular
> type, gi, and definition line.
>
> ########################################
> Bioperl code:
>
> #!/usr/local/bin/perl -w
>
> use Bio::SeqIO;
>
> $in = Bio::SeqIO->new(
>         '-file'         => "gzip -cd /am/db/gb-ff/gbbct1.seq.gz |",
>         '-format'       => 'Genbank' );
>
>
> while (my $seq = $in->next_seq( )) {
>
>     print $seq->display_id( ),          "\n";
>     print $seq->accession_number( ),    "\n";
>     print $seq->alphabet( ),            "\n";
>     print $seq->primary_id( ),          "\n";
>     print $seq->desc( ),                "\n";
> }
>
> ########################################
> Time:
>
> real    16m29.771s
> user    14m43.560s
> sys     0m18.280s
>
>
> ########################################
> GenBankParser code:
>
> #!/usr/local/bin/perl -w
>
> use GenBankParser qw( DEFINITION LOCUS VERSION );
>
> open( FH, "gzip -cd /am/db/gb-ff/gbbct1.seq.gz |" );
>
> (new GenBankParser)->parse_file( \*FH, sub {
>     my $parser = shift;
>
>     print $parser->LOCUS->locus,        "\n";
>     print $parser->VERSION->accession,  "\n";
>     print $parser->LOCUS->type,         "\n";
>     print $parser->VERSION->gi,         "\n";
>     print $parser->DEFINITION,          "\n";
> } );
>
>
> ########################################
> Time:
>
> real    0m51.068s
> user    0m40.300s
> sys     0m3.580s
>
>
> Last program, print out gt2fasta format.  That is protien in fasta format.
> First is GenBankParser code:
>
> #!/usr/local/bin/perl -w
>
> use strict;
> use GenBankParser qw( DEFINITION FEATURES SOURCE LOCUS );
> use GenBankParser::FEATURES qw( CDS );
>
> open( FH, "gzip -cd /am/db/gb-ff/gbbct.seq.gz |");
>
> (new GenBankParser)->parse_file( \*FH, sub {
> 	my $Parser = shift;
>
> 	my $cdscnt = 0;
> 	for my $cds (@{ $Parser->FEATURES->CDS }) {
>
> 		$cdscnt++;
>
> 		#
> 		# Skip pseudo genes
> 		#
> 		next if $cds->pseudo;
>
> 		my $definition =
> 			  $cds->product ||
> 			  $cds->gene    ||
> 			(($cds->note) ? join "; ",
> 				$Parser->DEFINITION, $cds->note :
> 				$Parser->DEFINITION );
>
> 		my $organism = $Parser->SOURCE->organism;
>
> 		print	">gi|", $cds->gi,
> 			"|gp|",	$cds->accession,
> 			"."   , $cds->version,
> 			"|"   , $Parser->LOCUS->locus,
> 			"_"   , $cdscnt,
> 			" "   , $definition,
> 			" ["  , $organism,
> 			"]\n";
>
> 		my   $seq   = $cds->translation;
> 		pos( $seq ) = 0;
>
> 		print $1, "\n" while $seq =~ m/\G (.{1,58}) /gx;
> 	}
> });
>
>
> ########################################
> Time:
>
> real    3m58.278s
> user    2m54.370s
> sys     0m5.720s
>
> And BioPerl code.  I just learned the bioperl api so I may not be doing
> things the most efficient way.  If so please tell me an I'll rewrite and
> rerun my tests.
>
> #!/usr/local/bin/perl -w
>
> use Bio::SeqIO;
> use Bio::SeqFeature::Generic;
>
> $in = Bio::SeqIO->new(
> 	'-file' 	=> "gzip -cd /am/db/gb-ff/gbbct1.seq.gz |",
> 	'-format'	=> 'Genbank' );
>
> while (my $seq = $in->next_seq( )) {
>
>     my $cdscnt = 0;
>
>     foreach my $feature ( $seq->top_SeqFeatures ) {
>
> 	if ( $feature->primary_tag( ) eq 'CDS' ) {
>
> 	    $cdscnt++;
>
> 	    next if $feature->has_tag( 'pseudo' );
>
> 	    my $definition = undef;
>
> 	    if    ( $feature->has_tag( 'product' ) ) {
> 		$definition = ($feature->each_tag_value( 'product' ))[0];
> 	    }
> 	    elsif ( $feature->has_tag( 'gene' ) ) {
> 		$definition = ($feature->each_tag_value( 'gene'    ))[0];
> 	    }
> 	    elsif ( $feature->has_tag( 'note' ) ) {
> 		$definition = join "; ",
> 			       $seq->desc,
> 			      ($feature->each_tag_value( 'note'    ))[0];
> 	    }
> 	    else {
> 		$definition = $seq->desc;
> 	    }
>
> 	    my $organism = $seq->species->species;
>
> 	    my $accession = undef;
> 	    my $version   = undef;
> 	    my $gi        = undef;
>
> 	    if ( $feature->has_tag( 'db_xref' ) ) {
> 		foreach my $tag ( $feature->each_tag_value( 'db_xref' ) ) {
> 		    $gi = $1, last if $tag =~ /^GI:(\d+)$/;
> 		}
> 	    }
>
> 	    if ( $feature->has_tag( 'protein_id' ) ) {
> 		($feature->each_tag_value( 'protein_id' ))[0] =~
> 		    /^([^.]+)\.(\d+)$/;
>
> 		$accession = $1;
> 		$version   = $2;
> 	    }
>
> 	    print ">gi|", $gi,
> 		  "|gp|", $accession,
> 		  "."   , $version,
> 		  "|"   , $seq->display_id,
> 		  "_"   , $cdscnt,
> 		  " "   , $definition,
> 		  " ["  , $organism,
> 		  "]\n";
>
> 	    my    $seq   = ($feature->each_tag_value( 'translation' ))[0];
> 	    pos ( $seq ) = 0;
>
> 	    print $1, "\n" while $seq =~ m/\G (.{1,58}) /gx;
> 	}
>     }
> }
>
> ########################################
> Time:
>
> real    17m17.897s
> user    15m23.400s
> sys     0m19.490s
>
>
> So in all cases the bioperl genbank parsers was 5 to 17 times as slow as
> the GenBankParser.  I think that's pretty dramatic.
>
> I also feel that the GenBankParser interface is a little more intuitive--
> mirroring almost exactly the format of the actual flat file.  Of course,
> I'm biased in this regard, being the author and all :).
>
> 	John Kloss.
>
> _______________________________________________
> Bioperl-l mailing list
> Bioperl-l@bioperl.org
> http://bioperl.org/mailman/listinfo/bioperl-l