[Bioperl-l] bioperl count sam reads

Tue May 22 14:19:32 UTC 2012

You are doing it the hard way. The easy way is to use the pileup() method
and provide a callback that counts the type of base that you wish (i.e.
don't count gaps).

Lincoln

On Wed, May 2, 2012 at 9:39 PM, Mark Aquino <maquino at knome.com> wrote:

> Hi all,
>
> I'm a little stumped as to how to successfully count the depths of all
> reads at a specific locus in a sam/bam file.  I know I can do this with
> GATK DepthOfCoverage but I wanted to do some more customized things with my
> script yet I haven't figured out how to get the right base.  I was a bit
> surprised there wasn't (or that it's not well documented) a method to get
> the individuals read's base at a specific position while getting the
> $refbase is quite easy.  (I'm betting such a method exists and is just not
> documented well)
>
> At any rate, gaps in the alignment are the cause for my problems, so if
> anyone knows a simpler way to do call the bases correctly, or a clever
> algorithm to deal with this issue, it would be much appreciated.  Here's
> what I have for code and it works except in cases where there are multiple
> gaps in the reference sequence, e.g. the alignment below should be T-T here
> not C-C but is shifted due to the second gap.
>
>
> #!/progs/bin/perl
> use strict;
> use warnings;
> use Bio::DB::Sam;
> use Bio::DB::Bam::AlignWrapper;
> use Pod::Usage;
> use Getopt::Long;
> use Bio::DB::Bam::Pileup;
> use Term::ANSIColor;
>
> my $sam = Bio::DB::Sam->new(-bam  =>$BAM,
>         -fasta=> $FASTA);
> getBases($chr, $pos, $pos);
>
>
> sub getBases {
>     my $print = 1;
>     my ($chr, $start_query, $end_query) = @_;
>     my @alignments = $sam->get_features_by_location(-seq_id => $chr,
>             -start  => $start_query,
>             -end    => $end_query);
>     my $refbase;
>     my ($a_count, $t_count, $g_count, $c_count, $n_count, $del_count,
> $ins_count) = (0, 0, 0, 0, 0, 0, 0);
>     for my $a (@alignments) {
>
>         my $start  = $a->start;
>         my $end    = $a->end;
>
>         my $query_start = $a->query->start;
>         my $query_end   = $a->query->end;
>         my $ref_dna   = $a->dna;        # reference sequence bases
>         my ($ref, $matches, $query) = $a->padded_alignment;
>         my $offset = 0;
>         if ($ref =~ /^([-]+)[ATCG]+/){
>             $offset = length($1);
>         }
>         #print "$offset\n";
>         $refbase = $sam->segment($chr,$start_query,$start_query)->dna;
>
>         printAlignment($ref, $matches, $query, $start_query, $start,
> $offset);
>         my $base = substr($query, $start_query-$start+$offset, 1);
>         if (!$base){
>             next;
>         }
>         $a_count++ if ($base eq "A");
>         $t_count++ if ($base eq "T");
>         $c_count++ if ($base eq "C");
>         $g_count++ if ($base eq "G");
>         $n_count++ if ($base eq "N");
>         $del_count++ if ($base eq "-");
>         my @scores    = $a->qscore;     # per-base quality scores
>         my $match_qual= $a->qual;       # quality of the match
>     }
>     my $total_depth = $a_count + $t_count + $c_count + $g_count + $n_count
> + $del_count;
>     if ($print == 1){
>      #   print "$start_query\tref base: $ref_base\n";
>         print "$chr:$start_query($refbase)\t";
>         print "A:$a_count\t";
>         print "T:$t_count\t";
>         print "C:$c_count\t";
>         print "G:$g_count\t";
>         print "N:$n_count\t";
>         print "D:$del_count\t";
>         print "Total:$total_depth\n";
>     }
>     return ($a_count, $t_count, $c_count, $g_count, $n_count, $del_count,
> $ins_count);
> }
> sub printAlignment{
>     my ($ref, $matches, $query, $start_query, $start, $offset) = @_;
>     print substr($ref, 0, $start_query-$start+$offset);
>     print (color("red"), substr($ref, $start_query-$start+$offset, 1),
> color("reset"));
>     print substr($ref, $start_query-$start+$offset+1),"\n";
>     print substr($matches, 0, $start_query-$start+$offset);
>     print (color("red"), substr($matches, $start_query-$start+$offset, 1),
> color("reset"));
>     print substr($matches, $start_query-$start+$offset+1),"\n";
>     print substr($query, 0, $start_query-$start+$offset);
>     print (color("red"), substr($query, $start_query-$start+$offset, 1),
> color("reset"));
>     print substr($query, $start_query-$start+$offset+1),"\n";
> }
>
>
> _______________________________________________
> Bioperl-l mailing list
> Bioperl-l at lists.open-bio.org
> http://lists.open-bio.org/mailman/listinfo/bioperl-l
>

-- 
Lincoln D. Stein
Director, Informatics and Biocomputing Platform
Ontario Institute for Cancer Research
101 College St., Suite 800
Toronto, ON, Canada M5G0A3
416 673-8514
Assistant: Renata Musa <Renata.Musa at oicr.on.ca>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: Untitled.tiff
Type: image/tiff
Size: 6222 bytes
Desc: not available
URL: <http://lists.open-bio.org/pipermail/bioperl-l/attachments/20120522/ff68a4f1/attachment-0003.tiff>