[Bioperl-l] Strangeness in parsing blast file

Nabil Hafez nabil at broad.mit.edu
Mon Jul 31 18:57:48 UTC 2006


I have figured out the problem - not a problem with Bioperl.
In my create_sample_file() subroutine I  defined

$/ = '>'; #define fasta record input seperator

when it should have been this

local $/ = "\n>";

the use of local made a big difference.  Thanks to all for your help.
Nabil Hafez


Nabil Hafez wrote:
> Hi,
>     I am having a somewhat similar problem to what was posted in
> http://bioperl.org/pipermail/bioperl-l/2006-May/021416.html
> however, I have read through all of that thread and I don't believe what 
> I am
> experiencing is the exact same problem.  I also realize that the Bioperl 
> version 1.5.1
> fixes a problem with blast parsing. 
> 
> My problem: 
>     My blastresults file parses fine and everything works swimmingly if 
> I pass
> the blast output file  by name   such as
> $blast_result = 'test.blastout'; 
> 
> however when I do
> $blast_result = &do_blast($sample_fasta);
> 
> even though in both cases $blast_result evaluate to "test.blastout", the 
> parsing doesn't work, more specifically
> it gets an undefined variable for $result in while( my $result = 
> $report_obj->next_result ) {
> 
> Sorr y for the long email - any help would be appreciated,
> Thanks - Nabil
> 
> 
> The code...non releavant parts trimmed for size constraints....debugging 
> from working and non-working
> versions after the code.
> 
> use strict;
> use Bio::SearchIO;
> use Getopt::Std;
> use List::Util qw(shuffle);
> use Benchmark;
> 
> my ($inputfile, $samplefile, $blastfile, $blast_result, $blast_report, 
> $blast_verbose); #files generated
> 
> 
> #------------------#
> # Subroutine Calls #
> #------------------#
> 
> my $test = &create_sample_file($inputfile); #inputfile being a fasta 
> file containing nucleotide sequence
> $blast_result = &do_blast($test);
> ##$blast_result = 'test.blastout';  #when this is uncommented and 
> replace the previous two lines with test.blastout being normal blast 
> output - the script works fine. 
> &parse_blast($blast_result);
> 
> 
> #######################
> # create_sample_file
> #
> #      Input: Original Fasta File
> #
> #      Output: Fasta file containing randomly sampled reads
> #
> #
> sub create_sample_file {
>      my $in = shift;
>      my $linecount = 0;
>      my @lines;
> 
>      $samplefile = $in . "_sample";
> 
>     #Determine total # of reads in input fasta
>     $totalreads = `$grep -c '>' $inputfile`;
>     $totalreads =~ s/\s+//;
>     chomp $totalreads;
> 
>    if ($totalreads > 1000) { #sample if more than 1000 reads
>        $sample_reads = sprintf("%.0f", $totalreads * 
> ($per_to_sample/100)); #number of reads to sample
>    }
>    else { #otherwise use all reads
>        $sample_reads = $totalreads;
>    }
> 
>    $/ = '>'; #define fasta record input seperator
> 
>    open (IN, "<$in") or die "Cannot open $in $!\n";
>    open (OUT, ">$samplefile") or die "Cannot open $samplefile $!\n";
> 
> 
>      while (<IN>) { #read lines into an array
>         chomp;
>         push (@lines, $_);
>        }
> 
>      @lines = shuffle(@lines); #shuffle array
>      foreach (@lines)  {
>            print OUT ">$_" if $linecount <= $sample_reads; #output to 
> file sampled number of reads
>            $linecount++;
>      }
> 
>     close IN;
>     close OUT;
> 
>     return $samplefile;
> 
> }#end create_sample_file
> 
> 
> #######################
> # do_blast
> #
> #      Input: Fasta File containing SCREENED sampled reads
> #
> #      Output: Blast File
> #
> #
> 
> sub do_blast {
>    my $bf = shift;
>    my $blastoutput = $bf . ".blastout";
> 
>     print "Blasting against $db ...\n";
> 
>    `blast/blast-2.2.10/bin/megablast -d /prodinfo/proddata_ntblastdb/nt 
> -e 1e-50 -p 99 -D 2 -i test -o test.blastout`;
> 
>     return $blastoutput;
> 
> }#end do_blast
> 
> 
> 
> #######################
> # parse_blast
> #
> #      Input: Blast file
> #
> #      Output: Creates hash containing best hit for each read
> #
> #
> 
> sub parse_blast {
>   my $blastoutfile = shift;
> 
>   if (! -e $blastoutfile) {
>     die "$blastoutfile does not exist $!\n";
>   }
> 
>   print "Parsing blast hits ...\n";
> 
> 
>    my $report_obj = new Bio::SearchIO(-verbose => 1,
>                                       -format => 'blast',
>                                       -file => $blastoutfile);
> 
> 
>     die "no valid $report_obj" unless defined $report_obj;
> 
> 
>    while( my $result = $report_obj->next_result ) {
>               die "no valid $result" unless defined $result;
>      while( my $hit = $result->next_hit ) {
>        while( my $hsp = $hit->next_hsp ) {
>              my $name =  $result->query_name;
>              my $hitDesc = $hit->description;
>              my $length = $hsp->length('total');
>              my $per_id = sprintf("%.2f", $hsp->percent_identity);
>              my $eval = $hsp->evalue;
>              next if (defined $blast_results{$name} && 
> $blast_results{$name}->[0] > $length); #only keep best hit for any read
>             $blast_results{$name} = [$length, $per_id, $eval, $hitDesc]; 
> #store in hash of arrays
>         }
>       }
>     }
> 
> } #end parse_blast
> 
> 
> 
> 
> 
> Debug of NON-working blast-parse:
> 
> main::(454/scripts/fasta_blasta_mb.pl:151):
> 151:    my $sample_fasta = &create_sample_file($inputfile);
>   DB<2> n
> main::(454/scripts/fasta_blasta_mb.pl:152):
> 152:    $blast_result = &do_blast($sample_fasta);
>   DB<2> x $sample_fasta
> 0  'G782.2005-08-16-16-48.fasta_sample'
>   DB<3> n
> Blasting against NT ...
> main::(454/scripts/fasta_blasta_mb.pl:154):
> 154:    &parse_blast($blast_result);
>   DB<3> s
> main::parse_blast(454/scripts/fasta_blasta_mb.pl:293):
> 293:      my $blastoutfile = shift;
>   DB<3> s
> main::parse_blast(454/scripts/fasta_blasta_mb.pl:295):
> 295:      if (! -e $blastoutfile) {
>   DB<3> x $blastoutfile
> 0  'G782.2005-08-16-16-48.fasta_sample.blastout'
>   DB<4> s
> main::parse_blast(454/scripts/fasta_blasta_mb.pl:299):
> 299:      print "Parsing blast hits ...\n";
>   DB<4> s
> Parsing blast hits ...
> main::parse_blast(454/scripts/fasta_blasta_mb.pl:302):
> 302:       my $report_obj = new Bio::SearchIO(-verbose => 1,
> 303:                                          -format => 'blast',
> 304:                                          -file => 
> $blastoutfile);#or die "Could not open blast report $!";
>   DB<4> s
> Bio::SearchIO::new(/util/lib/perl5/site_perl/5.8.0/Bio/SearchIO.pm:129):
> 129:      my($caller, at args) = @_;
>   DB<4> r
> scalar context return from Bio::SearchIO::new: '_file' => 
> 'G782.2005-08-16-16-48.fasta_sample.blastout'
> '_filehandle' => GLOB(0x8cef40c)
>    -> *Symbol::GEN1
>          FileHandle({*Symbol::GEN1}) => fileno(3)
> '_flush_on_write' => 1
> '_handler' => 
> Bio::SearchIO::IteratedSearchResultEventBuilder=HASH(0x9505724)
>    '_factories' => HASH(0x95054c0)
>       'hit' => Bio::Factory::ObjectFactory=HASH(0x95017b8)
>          '_loaded_types' => HASH(0x9506c0c)
>             'Bio::Search::Hit::BlastHit' => 1
>          '_root_verbose' => 0
>          'interface' => 'Bio::Search::Hit::HitI'
>          'type' => 'Bio::Search::Hit::BlastHit'
>       'hsp' => Bio::Factory::ObjectFactory=HASH(0x9500e10)
>          '_loaded_types' => HASH(0x9506c18)
>             'Bio::Search::HSP::GenericHSP' => 1
>          '_root_verbose' => 0
>          'interface' => 'Bio::Search::HSP::HSPI'
>          'type' => 'Bio::Search::HSP::GenericHSP'
>       'iteration' => Bio::Factory::ObjectFactory=HASH(0x9506c60)
>          '_loaded_types' => HASH(0x9506af8)
>             'Bio::Search::Iteration::GenericIteration' => 1
>          '_root_verbose' => 0
>          'interface' => 'Bio::Search::Iteration::IterationI'
>          'type' => 'Bio::Search::Iteration::GenericIteration'
>       'result' => Bio::Factory::ObjectFactory=HASH(0x9504c80)
>          '_loaded_types' => HASH(0x9501f74)
>             'Bio::Search::Result::BlastResult' => 1
>          '_root_verbose' => 0
>          'interface' => 'Bio::Search::Result::ResultI'
>          'type' => 'Bio::Search::Result::BlastResult'
>    '_inclusion_threshold' => 0.001
>    '_root_verbose' => 1
> '_handler_cache' => 
> Bio::SearchIO::IteratedSearchResultEventBuilder=HASH(0x9505724)
>    -> REUSED_ADDRESS
> '_notfirsttime' => 0
> '_reporttype' => ''
> '_root_cleanup_methods' => ARRAY(0x8cde434)
>    0  CODE(0x82a9aec)
>       -> &Bio::Root::IO::_io_cleanup in 
> /util/lib/perl5/site_perl/5.8.0/Bio/Root/IO.pm:544-570
>    1  CODE(0x82a9aec)
>       -> REUSED_ADDRESS
> '_root_verbose' => 1
> main::parse_blast(454/scripts/fasta_blasta_mb.pl:307):
> 307:        die "no valid $report_obj" unless defined $report_obj;
>   DB<4> s
> main::parse_blast(454/scripts/fasta_blasta_mb.pl:310):
> 310:       while( my $result = $report_obj->next_result ) {
>   DB<4> s
> Bio::SearchIO::blast::next_result(/util/lib/perl5/site_perl/5.8.0/Bio/SearchIO/blast.pm:389):
> 389:       my ($self) = @_;
>   DB<4> r
> scalar context return from Bio::SearchIO::blast::next_result: undef
> Bio::SearchIO::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/SearchIO.pm:438):
> 438:        my $self = shift;
>   DB<4> r
> scalar context return from Bio::SearchIO::DESTROY: ''
> Bio::Root::Root::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/Root/Root.pm:404):
> 404:        my $self = shift;
>   DB<4> r
> scalar context return from Bio::Root::Root::DESTROY: undef
> Bio::Root::Root::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/Root/Root.pm:404):
> 404:        my $self = shift;
>   DB<4> r
> scalar context return from Bio::Root::Root::DESTROY: undef
> Bio::Root::Root::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/Root/Root.pm:404):
> 404:        my $self = shift;
>   DB<4> r
> scalar context return from Bio::Root::Root::DESTROY: undef
> Bio::Root::Root::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/Root/Root.pm:404):
> 404:        my $self = shift;
>   DB<4> r
> scalar context return from Bio::Root::Root::DESTROY: undef
> Bio::Root::Root::DESTROY(/util/lib/perl5/site_perl/5.8.0/Bio/Root/Root.pm:404):
> 404:        my $self = shift;
>   DB<4> r
> scalar context return from Bio::Root::Root::DESTROY: undef
> main::(454/scripts/fasta_blasta_mb.pl:155):
> 155:    &output_results();
>   DB<4> x $result
> 0  undef
> 
> 
> 
> Debug of WORKING blast-parse:
> Parsing blast hits ...
> main::parse_blast(454/scripts/fasta_blasta_mb.pl:302):
> 302:       my $report_obj = new Bio::SearchIO(-verbose => 1,
> 303:                                          -format => 'blast',
> 304:                                          -file => 
> $blastoutfile);#or die "Could not open blast report $!";
>   DB<3> s
> Bio::SearchIO::new(/util/lib/perl5/site_perl/5.8.0/Bio/SearchIO.pm:129):
> 129:      my($caller, at args) = @_;
>   DB<3> r
> scalar context return from Bio::SearchIO::new: '_file' => 
> 'G782.2005-08-16-16-48.fasta_sample.blastout'
> '_filehandle' => GLOB(0x8763100)
>    -> *Symbol::GEN1
>          FileHandle({*Symbol::GEN1}) => fileno(3)
> '_flush_on_write' => 1
> '_handler' => 
> Bio::SearchIO::IteratedSearchResultEventBuilder=HASH(0x8ab3be4)
>    '_factories' => HASH(0x8ab1594)
>       'hit' => Bio::Factory::ObjectFactory=HASH(0x8a7b7c0)
>          '_loaded_types' => HASH(0x8abee10)
>             'Bio::Search::Hit::BlastHit' => 1
>          '_root_verbose' => 0
>          'interface' => 'Bio::Search::Hit::HitI'
>          'type' => 'Bio::Search::Hit::BlastHit'
>       'hsp' => Bio::Factory::ObjectFactory=HASH(0x8a87200)
>          '_loaded_types' => HASH(0x8abee1c)
>             'Bio::Search::HSP::GenericHSP' => 1
>          '_root_verbose' => 0
>          'interface' => 'Bio::Search::HSP::HSPI'
>          'type' => 'Bio::Search::HSP::GenericHSP'
>       'iteration' => Bio::Factory::ObjectFactory=HASH(0x8abee64)
>          '_loaded_types' => HASH(0x8abecfc)
>             'Bio::Search::Iteration::GenericIteration' => 1
>          '_root_verbose' => 0
>          'interface' => 'Bio::Search::Iteration::IterationI'
>          'type' => 'Bio::Search::Iteration::GenericIteration'
>       'result' => Bio::Factory::ObjectFactory=HASH(0x8a81a84)
>          '_loaded_types' => HASH(0x8a96ce8)
>             'Bio::Search::Result::BlastResult' => 1
>          '_root_verbose' => 0
>          'interface' => 'Bio::Search::Result::ResultI'
>          'type' => 'Bio::Search::Result::BlastResult'
>    '_inclusion_threshold' => 0.001
>    '_root_verbose' => 1
> '_handler_cache' => 
> Bio::SearchIO::IteratedSearchResultEventBuilder=HASH(0x8ab3be4)
>    -> REUSED_ADDRESS
> '_notfirsttime' => 0
> '_reporttype' => ''
> '_root_cleanup_methods' => ARRAY(0x8762efc)
>    0  CODE(0x82a9aec)
>       -> &Bio::Root::IO::_io_cleanup in 
> /util/lib/perl5/site_perl/5.8.0/Bio/Root/IO.pm:544-570
>    1  CODE(0x82a9aec)
>       -> REUSED_ADDRESS
> '_root_verbose' => 1
> main::parse_blast(454/scripts/fasta_blasta_mb.pl:307):
> 307:        die "no valid $report_obj" unless defined $report_obj;
>   DB<3> s
> main::parse_blast(454/scripts/fasta_blasta_mb.pl:310):
> 310:       while( my $result = $report_obj->next_result ) {
>   DB<3> s
> Bio::SearchIO::blast::next_result(/util/lib/perl5/site_perl/5.8.0/Bio/SearchIO/blast.pm:389):
> 389:       my ($self) = @_;
>   DB<3> r
> blast.pm: unrecognized line Reference: Zheng Zhang, Scott Schwartz, 
> Lukas Wagner, and Webb Miller (2000),
> blast.pm: unrecognized line "A greedy algorithm for aligning DNA 
> sequences",
> blast.pm: unrecognized line J Comput Biol 2000; 7(1-2):203-14.
> blast.pm: unrecognized 
> line                                                                  
> Score    E
> Got NCBI HSP score=354, evalue 0.0
> scalar context return from Bio::SearchIO::blast::next_result: 
> '_algorithm' => 'MEGABLAST'
> '_algorithm_version' => '2.2.10 [Oct-19-2004]'
> '_dbentries' => 4249067
> '_dbletters' => 17735149364
> '_dbname' => 'All GenBank+EMBL+DDBJ+PDB sequences (but no EST, 
> STS,GSS,environmental samples or phase 0, 1 or 2 HTGS sequences) '
> '_hitindex' => 0
> '_hits' => ARRAY(0x8b2acd0)
>      empty array
> '_inclusion_threshold' => 0.001
> '_iteration_count' => 1
> '_iteration_index' => 0
> '_iterations' => ARRAY(0x8b2ac4c)
>    0  Bio::Search::Iteration::GenericIteration=HASH(0x8b1cacc)
>       '_newhits_below_threshold' => ARRAY(0x8b1ca84)
>          0  Bio::Search::Hit::BlastHit=HASH(0x8b1cf64)
>             '_accession' => 'AE004091'
>             '_algorithm' => 'MEGABLAST'
>             '_description' => 'Pseudomonas aeruginosa PAO1, complete genome'
>             '_hsps' => ARRAY(0x8b1ceb0)
>                0  Bio::Search::HSP::GenericHSP=HASH(0x8b2098c)
>                   '_algorithm' => 'MEGABLAST'
>                   '_frac_conserved' => HASH(0x8b266a0)
>                      'hit' => 0.991803278688525
>                      'query' => 0.991803278688525
>                      'total' => 0.991803278688525
>                   '_frac_identical' => HASH(0x8b2658c)
>                      'hit' => 0.991803278688525
>                      'query' => 0.991803278688525
>                      'total' => 0.991803278688525
>                   '_gaps' => HASH(0x8b24d94)
>                      'hit' => 0
>                      'query' => 0
>                      'total' => 0
>                   '_gsf_tag_hash' => HASH(0x8b20998)
>                        empty hash
>                   '_hit_string' => 
> 'cctgacctccgctcaactgcgcaaatacgccagcgccggtcggccgttccccgaagggcgcctgctggccgcctcctgccacgacgcggaggaactggccctggctgcctcgatgggagtggagttcgtcaccctttcgccggtacagccgaccgagagccatcccggcgagccggcgctgggttgggacaaggccgccgaactgatcgccggcttcaaccagccggtctacctgctgggtggcctcggtccgcagcaagccgagcaggcttgggagcatggagcccagggcgtggcgggtatccgtgcgttctggccgggcggcctttgacggtggaatgaagaaaaaaggaggcttcggcctcc'
>                   '_homology_string' => 
> '|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 
> ||||||||||||||||||||||||||||||||||||||||| 
> ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 
> |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||'  
> etc......
> _______________________________________________
> Bioperl-l mailing list
> Bioperl-l at lists.open-bio.org
> http://lists.open-bio.org/mailman/listinfo/bioperl-l



More information about the Bioperl-l mailing list