[Bioperl-l] bioperl count sam reads

Scott Cain scott at scottcain.net
Mon May 7 19:01:21 UTC 2012


Hi Mark,

I can't really help you with your current problem, but I would like to just
point out a few things: Bio::DB::Sam isn't actually part of BioPerl (though
this mailing list is still a reasonable place to ask quesitons about it),
and it was originally developed to support drawing graphical
representations of BAM files in BioGraphics and thus GBrowse.  As a result,
it isn't surprising that some functionality may be missing from the perl
wrapper, since it wasn't needed to create the graphical representation.  If
you would like to add functionality, we can accept patches :-)

Scott


On Wed, May 2, 2012 at 9:39 PM, Mark Aquino <maquino at knome.com> wrote:

> Hi all,
>
> I'm a little stumped as to how to successfully count the depths of all
> reads at a specific locus in a sam/bam file.  I know I can do this with
> GATK DepthOfCoverage but I wanted to do some more customized things with my
> script yet I haven't figured out how to get the right base.  I was a bit
> surprised there wasn't (or that it's not well documented) a method to get
> the individuals read's base at a specific position while getting the
> $refbase is quite easy.  (I'm betting such a method exists and is just not
> documented well)
>
> At any rate, gaps in the alignment are the cause for my problems, so if
> anyone knows a simpler way to do call the bases correctly, or a clever
> algorithm to deal with this issue, it would be much appreciated.  Here's
> what I have for code and it works except in cases where there are multiple
> gaps in the reference sequence, e.g. the alignment below should be T-T here
> not C-C but is shifted due to the second gap.
>
>
> #!/progs/bin/perl
> use strict;
> use warnings;
> use Bio::DB::Sam;
> use Bio::DB::Bam::AlignWrapper;
> use Pod::Usage;
> use Getopt::Long;
> use Bio::DB::Bam::Pileup;
> use Term::ANSIColor;
>
> my $sam = Bio::DB::Sam->new(-bam  =>$BAM,
>         -fasta=> $FASTA);
> getBases($chr, $pos, $pos);
>
>
> sub getBases {
>     my $print = 1;
>     my ($chr, $start_query, $end_query) = @_;
>     my @alignments = $sam->get_features_by_location(-seq_id => $chr,
>             -start  => $start_query,
>             -end    => $end_query);
>     my $refbase;
>     my ($a_count, $t_count, $g_count, $c_count, $n_count, $del_count,
> $ins_count) = (0, 0, 0, 0, 0, 0, 0);
>     for my $a (@alignments) {
>
>         my $start  = $a->start;
>         my $end    = $a->end;
>
>         my $query_start = $a->query->start;
>         my $query_end   = $a->query->end;
>         my $ref_dna   = $a->dna;        # reference sequence bases
>         my ($ref, $matches, $query) = $a->padded_alignment;
>         my $offset = 0;
>         if ($ref =~ /^([-]+)[ATCG]+/){
>             $offset = length($1);
>         }
>         #print "$offset\n";
>         $refbase = $sam->segment($chr,$start_query,$start_query)->dna;
>
>         printAlignment($ref, $matches, $query, $start_query, $start,
> $offset);
>         my $base = substr($query, $start_query-$start+$offset, 1);
>         if (!$base){
>             next;
>         }
>         $a_count++ if ($base eq "A");
>         $t_count++ if ($base eq "T");
>         $c_count++ if ($base eq "C");
>         $g_count++ if ($base eq "G");
>         $n_count++ if ($base eq "N");
>         $del_count++ if ($base eq "-");
>         my @scores    = $a->qscore;     # per-base quality scores
>         my $match_qual= $a->qual;       # quality of the match
>     }
>     my $total_depth = $a_count + $t_count + $c_count + $g_count + $n_count
> + $del_count;
>     if ($print == 1){
>      #   print "$start_query\tref base: $ref_base\n";
>         print "$chr:$start_query($refbase)\t";
>         print "A:$a_count\t";
>         print "T:$t_count\t";
>         print "C:$c_count\t";
>         print "G:$g_count\t";
>         print "N:$n_count\t";
>         print "D:$del_count\t";
>         print "Total:$total_depth\n";
>     }
>     return ($a_count, $t_count, $c_count, $g_count, $n_count, $del_count,
> $ins_count);
> }
> sub printAlignment{
>     my ($ref, $matches, $query, $start_query, $start, $offset) = @_;
>     print substr($ref, 0, $start_query-$start+$offset);
>     print (color("red"), substr($ref, $start_query-$start+$offset, 1),
> color("reset"));
>     print substr($ref, $start_query-$start+$offset+1),"\n";
>     print substr($matches, 0, $start_query-$start+$offset);
>     print (color("red"), substr($matches, $start_query-$start+$offset, 1),
> color("reset"));
>     print substr($matches, $start_query-$start+$offset+1),"\n";
>     print substr($query, 0, $start_query-$start+$offset);
>     print (color("red"), substr($query, $start_query-$start+$offset, 1),
> color("reset"));
>     print substr($query, $start_query-$start+$offset+1),"\n";
> }
>
>
> _______________________________________________
> Bioperl-l mailing list
> Bioperl-l at lists.open-bio.org
> http://lists.open-bio.org/mailman/listinfo/bioperl-l
>



-- 
------------------------------------------------------------------------
Scott Cain, Ph. D.                                   scott at scottcain dot
net
GMOD Coordinator (http://gmod.org/)                     216-392-3087
Ontario Institute for Cancer Research
-------------- next part --------------
A non-text attachment was scrubbed...
Name: Untitled.tiff
Type: image/tiff
Size: 6222 bytes
Desc: not available
URL: <http://lists.open-bio.org/pipermail/bioperl-l/attachments/20120507/1861f0db/attachment-0004.tiff>


More information about the Bioperl-l mailing list