[Biopython] NCBIXML: hit start and end

Mic mictadlo at gmail.com
Wed Apr 24 05:55:06 UTC 2013


Hi,
I have tried to rewrite the Perl code to Biopython

sub retrieve {
    my $blast_report = $options->{'blast'};
    my $max_hits  = $options->{'maxhits'};
    my $searchio     = new Bio::SearchIO(
        -format => 'blast',
        -file   => $blast_report
    );

    while ( my $result = $searchio->next_result ) {
        my $query_name = $result->query_name();
        my $count_unirefs   = 0;
        my %hit_names_count = ();
        while ( my $hit = $result->next_hit ) {
            $count_unirefs++;


            my $count_hsp = 0;
            my @plushsps  = ();
            my @minhsps   = ();

            while ( my $hsp = $hit->next_hsp ) {
                $count_hsp++;
                my $query_start = $hsp->start('query');
                my $query_end   = $hsp->end('query');
                my $hit_start   = $hsp->start('hit');
                my $hit_end     = $hsp->end('hit');
                my $strand      = $hsp->strand();
                my $hit_desc    = $hit->description();


                my @hsp_data    = ($query_start, $query_end, $hit_start,
$hit_end, $hit_desc);


            }
        }

    }
}


Biopython code:
---------------
from Bio import SeqIO
from Bio.Blast import NCBIXML

def retrieve_hits_data():

    max_hits = 5  # Change to args

    with open("test/x.xml") as bf:
        blast_records = NCBIXML.parse(bf)

        for blast_record in blast_records:
            print blast_record.query
            print
            for alignment in blast_record.alignments:
                print 'sequence:', alignment.title
                print alignment.hit_id
                print alignment.hit_def
                print 'length:', alignment.length

                for hsp in alignment.hsps:
                    print "HSPs"
                    print "----"
                    print 'e value:', hsp.expect
                    #print hsp.query
                    #print hsp.match
                    #print hsp.sbjct
                    print hsp.score
                    print hsp.bits
                    print hsp.num_alignments
                    print hsp.identities
                    print hsp.positives
                    print hsp.gaps
                    print hsp.align_length
                    print hsp.strand
                    print hsp.frame
                    print hsp.query_start
                    print hsp.query_end
                    #print hsp.hit_start
                    #print hsp.hit_end
                    print hsp.sbjct_start
                    print hsp.sbjct_end


retrieve_hits_data()


Output from Biopython code:
XA10_v3.0-snap.1

XA10_v3.0-snap.2

XA10_v3.0-snap.3

XA10_v3.0-snap.4

sequence: UniRef90_Q9FX16 F12G12.10 protein n=1 Tax=Arabidopsis thaliana
RepID=Q9FX16_ARATH
UniRef90_Q9FX16
F12G12.10 protein n=1 Tax=Arabidopsis thaliana RepID=Q9FX16_ARATH
length: 308
HSPs
----
e value: 8.30308e-88
709.0
277.715
None
146
192
10
285
(None, None)
(0, 0)
10
290
8
286


How do I get hsp->start('hit') and hsp->end('hit') from the bioperl code in
Biopython?
Why does blast_record.query appears immediately in sequence and not after
the other two for loops has finished?

Thank you in advance.

Mic



More information about the Biopython mailing list