#!/usr/bin/perl -w
#$Id$

=head1 NAME

download_query_genbank - script to query Genbank and retrieve records

=head1 USAGE

 download_query_genbank --query "Neurospora[ORGN]" --db nucest -o Ncrassa_ESTs.fa --format fasta

 download_query_genbank --queryfile 'filewithquery' --db nucest -o Ncrassa_ESTs.fa --format fasta

=head2 Other options

 Provide ONE of:

  --query       query string OR
  --queryfile   profile file with query OR
  --gi --gifile file with list of GIs to download

 Database type:

  --db          database (nucest, protein, nucleotide)

  -f --format   sequence file output format
  -o --out --outfile
                output file (sequences go to STDOUT when omitted)
  --rf --retformat
                format requested from the remote database; when it is
                'fasta' each record is written directly as
                >gi|GI|ref|ACCESSION|DISPLAY_ID DESCRIPTION
                instead of going through Bio::SeqIO
  -v --verbose  debugging output

=head2 Query options

  --maxids      maximum number of IDs to retrieve in a set
  --reldate     passed to Bio::DB::Query::GenBank as -reldate
  --maxdate     maxdate for a record
  --mindate     minimum date for record
  --datetype    edat or mdat (entered or modified)

=head1 AUTHOR Jason Stajich

Jason Stajich, jason-AT-bioperl.org

=cut

use strict;
use Bio::DB::GenBank;
use Bio::DB::GenPept;
use Bio::DB::Query::GenBank;
use Bio::SeqIO;
use Getopt::Long;
use Data::Dumper;
use Term::ProgressBar;

my ( $queryfile, $outfile, $debug, $retformat, $gifile );
my $format  = 'fasta';
my %options = (
    '-maxids' => '100',
    '-db'     => 'nucleotide',    # can be nucleotide, nucest, protein
);

GetOptions(
    'h|help'          => sub { exec( 'perldoc', $0 ); exit(0); },
    'v|verbose'       => \$debug,
    'f|format:s'      => \$format,
    'queryfile:s'     => \$queryfile,
    'o|out|outfile:s' => \$outfile,
    'gi|gifile|gis:s' => \$gifile,
    'rf|retformat:s'  => \$retformat,

    # DB::Query options
    'd|db:s'     => \$options{'-db'},
    'mindate:s'  => \$options{'-mindate'},
    'maxdate:s'  => \$options{'-maxdate'},
    'reldate:s'  => \$options{'-reldate'},
    'datetype:s' => \$options{'-datetype'},    # edat or mdat
    'maxids:i'   => \$options{'-maxids'},
    'q|query:s'  => \$options{'-query'},
);

# --retformat is optional; evaluate it once so the rest of the script never
# compares an undefined value.
my $raw_fasta = ( defined $retformat && $retformat eq 'fasta' ) ? 1 : 0;

# No (or empty) --outfile means the sequences are written to STDOUT.
my $to_stdout = ( defined $outfile && length $outfile ) ? 0 : 1;

# Status text must not be mixed into sequence data, so it moves to STDERR
# whenever the sequences themselves occupy STDOUT.
my $msg_fh = $to_stdout ? \*STDERR : \*STDOUT;

# Exactly one of $fhout (raw FASTA mode) or $out (Bio::SeqIO mode) is set.
my ( $out, $fhout );
if ($raw_fasta) {
    if ($to_stdout) {
        $fhout = \*STDOUT;
    }
    else {
        open( $fhout, '>', $outfile ) or die "problem with output:$!\n";
    }
}
else {
    $out = $to_stdout
      ? Bio::SeqIO->new( -format => $format )    # write to STDOUT
      : Bio::SeqIO->new(
        -format => $format,
        -file   => ">$outfile",
      );
}

# Pick the database handle that matches --db; only the protein database is
# served by GenPept.
# NOTE(review): the previous revision always used Bio::DB::GenPept, even for
# nucleotide/nucest - confirm nothing relied on that.
my %db_args = ( -verbose => $debug );
$db_args{'-format'} = $retformat if defined $retformat;
my $dbh =
  ( defined $options{'-db'} && $options{'-db'} eq 'protein' )
  ? Bio::DB::GenPept->new(%db_args)
  : Bio::DB::GenBank->new(%db_args);

if ($gifile) {

    # A batch size below 1 would make splice() remove nothing (or leave a
    # remainder forever), so the loop below would never terminate.
    my $batch_size = $options{'-maxids'};
    die "--maxids must be a positive integer\n"
      unless defined $batch_size && $batch_size > 0;

    open( my $gi_fh, '<', $gifile )
      or die "cannot open gifile '$gifile': $!\n";
    my @ids;
    while ( my $line = <$gi_fh> ) {
        push @ids, split ' ', $line;
    }
    close($gi_fh);

    # Fetch the IDs in batches of at most --maxids.
    while (@ids) {
        my @mini_ids = splice( @ids, 0, $batch_size );
        my $query    = Bio::DB::Query::GenBank->new( %options,
            -ids => \@mini_ids, );
        my $stream = $dbh->get_Stream_by_query($query);
        while ( my $seq = $stream->next_seq ) {
            emit_seq( $seq, $fhout, $out );
        }
    }
}
else {
    # An explicit --query wins over --queryfile.
    if ( !$options{'-query'} ) {
        die("no query string or gifile\n") unless $queryfile;

        open( my $query_fh, '<', $queryfile )
          or die "cannot open queryfile '$queryfile': $!\n";
        while ( my $line = <$query_fh> ) {
            chomp $line;
            $options{'-query'} .= $line;    # lines are joined without spaces
        }
        close($query_fh);
    }

    my $query = Bio::DB::Query::GenBank->new(%options);
    my $count = $query->count;
    my $shown = defined $count ? $count : 0;
    print {$msg_fh} " Entries for Download: $shown :\n";

    # Only build a progress bar when there is something to count towards.
    my $prgs = $count ? progress($count) : undef;

    my $stream = $dbh->get_Stream_by_query($query);
    my $seqnum = 1;
    while ( my $seq = $stream->next_seq ) {
        check_progress( $prgs, $seqnum, $count );
        emit_seq( $seq, $fhout, $out );
        $seqnum++;
    }
}

# Buffered write errors only surface at close, so check it - but never close
# STDOUT on the caller's behalf.
if ( $raw_fasta && !$to_stdout ) {
    close($fhout) or die "problem closing output:$!\n";
}

# emit_seq($seq, $fasta_fh, $seqio)
# Write one sequence object: as a raw FASTA record to $fasta_fh when that
# handle is defined, otherwise through the Bio::SeqIO writer $seqio.
sub emit_seq {
    my ( $seq, $fasta_fh, $seqio ) = @_;
    if ( defined $fasta_fh ) {
        write_fasta_record( $fasta_fh, $seq );
    }
    else {
        $seqio->write_seq($seq);
    }
    return;
}

# write_fasta_record($fh, $seq)
# Print $seq to $fh as ">gi|GI|ref|ACCESSION|DISPLAY_ID DESCRIPTION" followed
# by the sequence on a single line. One trailing '.' is removed from the
# description. Dies if the write fails.
sub write_fasta_record {
    my ( $fh, $seq ) = @_;

    my $locus  = $seq->display_id;
    my $refnum = $seq->accession_number;
    my $gi     = $seq->primary_id;
    my $seqstr = $seq->seq;
    my $desc   = $seq->desc;
    $desc = '' unless defined $desc;    # records may carry no description
    $desc =~ s/\.$//;

    print {$fh} ">gi|$gi|ref|$refnum|$locus $desc\n$seqstr\n"
      or die "problem with output:$!\n";
    return;
}

# progress($max)
# Return a new Term::ProgressBar whose target is $max.
sub progress {
    my $max = shift;
    return Term::ProgressBar->new($max);
}

# check_progress($prog, $count, $max)
# Advance the bar $prog to $count as long as $count has not passed $max.
# Does nothing when no bar was created or $max is unknown.
sub check_progress {
    my ( $prog, $count, $max ) = @_;
    return unless defined $prog && defined $max;
    $prog->update($count) if $count <= $max;
    return;
}