#!/usr/bin/perl -w
#$Id$

=head1 NAME

download_query_genbank - script to query Genbank and retrieve records

=head1 USAGE

 download_query_genbank --query "Neurospora[ORGN]" --db nucest -o Ncrassa_ESTs.fa --format fasta

 download_query_genbank --queryfile 'filewithquery' --db nucest -o Ncrassa_ESTs.fa --format fasta

=head2 Other options

 Provide ONE of:

  -q --query            query string OR
  --queryfile           file containing the query OR
  --gi --gifile --gis   file with list of GIs to download

 Database type:

  -d --db               database (nucest, protein, nucleotide);
                        defaults to nucleotide

 Output:

  -o --out --outfile    output file (sequences go to STDOUT when omitted)
  -f --format           sequence file output format (defaults to fasta)
  --rf --retformat      record format requested from GenBank
                        (passed as -format to Bio::DB::GenBank)
  -v --verbose          debugging output
  -h --help             show this documentation

=head2 Query options

  --maxids    maximum number of IDs to retrieve in a set (defaults to 100)
  --reldate   relative date to retrieve from (days)
  --maxdate   maxdate for a record
  --mindate   minimum date for record
  --datetype  edat or mdat (entered or modified)

=head1 AUTHOR Jason Stajich

Jason Stajich, jason-AT-bioperl.org

=cut

use strict;
use warnings;
use Bio::DB::GenBank;
use Bio::DB::GenPept;
use Bio::DB::Query::GenBank;
use Bio::SeqIO;
use Getopt::Long;
use Data::Dumper;
use Term::ProgressBar;

# $count and $prgs drive the progress bar (progress reporting added by Dimitar).
my ($queryfile, $outfile, $debug, $count, $prgs, $retformat, $gifile, %options);
my $format = 'fasta';

$options{'-maxids'} = '100';
$options{'-db'}     = 'nucleotide';    # can be nucleotide, nucest, protein

# Die on unknown/malformed options rather than silently ignoring, say, a
# mistyped date filter and downloading far more than was intended.
GetOptions(
    'h|help'           => sub { exec('perldoc', $0); exit(0); },
    'v|verbose'        => \$debug,
    'f|format:s'       => \$format,
    'queryfile:s'      => \$queryfile,
    'o|out|outfile:s'  => \$outfile,
    'gi|gifile|gis:s'  => \$gifile,
    'rf|retformat:s'   => \$retformat,

    # DB::Query options
    'd|db:s'           => \$options{'-db'},
    'mindate:s'        => \$options{'-mindate'},
    'maxdate:s'        => \$options{'-maxdate'},
    'reldate:s'        => \$options{'-reldate'},
    'datetype:s'       => \$options{'-datetype'},    # edat or mdat
    'maxids:i'         => \$options{'-maxids'},
    'q|query:s'        => \$options{'-query'},
) or die "Invalid options; run with --help for usage\n";

# Sequence writer: the named file when given, otherwise STDOUT.
my $out = $outfile
    ? Bio::SeqIO->new(-format => $format, -file => ">$outfile", -flush => 1)
    : Bio::SeqIO->new(-format => $format);

my $dbh = Bio::DB::GenBank->new(-verbose => $debug, -format => $retformat);

# --- Mode 1: download an explicit list of GIs, in batches of -maxids -------
if ($gifile) {

    # 'maxids:i' yields 0 when the option is given without a value; a batch
    # size of 0 would make the splice loop below spin forever.
    $options{'-maxids'} > 0
        or die "--maxids must be a positive integer\n";

    open(my $fh, '<', $gifile)
        or die "Cannot open GI file '$gifile': $!\n";
    my @ids;
    while (my $line = <$fh>) {
        push @ids, split ' ', $line;    # whitespace-separated IDs, any layout
    }
    close($fh);

    while (@ids) {
        my @mini_ids = splice(@ids, 0, $options{'-maxids'});
        my $batch    = Bio::DB::Query::GenBank->new(%options, -ids => \@mini_ids);
        my $stream   = $dbh->get_Stream_by_query($batch);
        while (my $seq = $stream->next_seq) {
            $out->write_seq($seq);
        }
    }
    exit;
}

# --- Mode 2: query string, given directly or read from a file --------------
# An explicit --query takes precedence over --queryfile.
if (!$options{'-query'} && $queryfile) {
    open(my $fh, '<', $queryfile)
        or die "Cannot open query file '$queryfile': $!\n";

    # Read from the handle (not the filename); lines are concatenated as-is
    # after stripping the newline.
    while (my $line = <$fh>) {
        chomp $line;
        $options{'-query'} .= $line;
    }
    close($fh);

    die("query file '$queryfile' contained no query\n")
        unless $options{'-query'};
}

die("no query string or gifile\n") unless $options{'-query'};

my $query = Bio::DB::Query::GenBank->new(%options);

# Status and debugging output goes to STDERR so it can never end up mixed
# into sequence data when the records are written to STDOUT.
$count = $query->count;
print STDERR " Entries for Download: $count :\n";
$prgs = progress($count);

my $stream = $dbh->get_Stream_by_query($query);
print STDERR "STREAM:", Dumper($stream), " :\n" if $debug;

my $seqnum = 1;
while (my $seq = $stream->next_seq) {
    check_progress($prgs, $seqnum, $count);
    $out->write_seq($seq);
    $seqnum++;
}

# progress($max)
#   Create a Term::ProgressBar sized for $max items.
#   Returns the bar object, or nothing when $max is zero or undefined
#   (there is nothing to track in that case).
sub progress {
    my $max = shift;
    return unless $max;
    return Term::ProgressBar->new($max);
}

# check_progress($prog, $count, $max)
#   Advance the bar $prog to position $count, never past $max.
#   Does nothing when no bar was created or $max is undefined.
sub check_progress {
    my ($prog, $count, $max) = @_;
    return unless $prog && defined $max;
    $prog->update($count) if $count <= $max;
    return;
}