[Bioperl-l] genbank parsing of multiple 'function' tags within primary tag
galeb abu-ali
abualiga2 at gmail.com
Thu Sep 8 14:44:39 UTC 2011
Hi,
I'm parsing a genbank file with Bio::SeqIO and am stuck on instances of
multiple tags within a primary tag. E.g., when there are several 'function'
tag-values within a 'CDS' primary tag, I don't know how to link those
'function' tag-values to a particular 'locus_tag'. As parsed values are
returned as a list, I tried creating an array of hashes, where the hash-key
is 'locus_tag' and hash-values are multiple 'function' tags, but am failing
miserably. Pasted below is what I managed so far. At your convenience,
please advise.
thanks!
galeb
#!/usr/local/bin/perl
# parse_gbk.pl
# gsa 09042011
# script to parse out features from gbk
#
http://www.bioperl.org/wiki/HOWTO:Feature-Annotation#Customizing_Sequence_Object_Construction
use strict; use warnings;
use Bio::SeqIO;
my @loci;
my @seqs;
my @directions;
my @start_coords;
my @end_coords;
my @genes;
my @products;
my @notes;
my @functions;
my %functions;
my $gb_file = shift;
my $seqio_obj = Bio::SeqIO->new(-file => $gb_file );
my $seq_obj = $seqio_obj->next_seq;
for my $feat_obj ( $seq_obj->get_SeqFeatures ) {
if ( $feat_obj->primary_tag eq ( 'gene' ) ) {
if ($feat_obj->has_tag( 'locus_tag' ) ) {
push ( @seqs, $feat_obj->seq->seq ); #collect sequences
for my $val ( $feat_obj->get_tag_values( 'locus_tag' ) )
{
push ( @loci, $val ); # locus_tags
}
}
if ( $feat_obj->has_tag( 'gene' ) ) {
for my $val ( $feat_obj->get_tag_values( 'gene' )
) {
push ( @genes, $val ); # gene names
}
}
else {
push ( @genes, "" ); # if gene names are absent, leave
empty
}
if ( $feat_obj->location->isa( 'Bio::Location::Simple' ) ) { # gene
coordinates
for my $location ( $feat_obj->location ) {
push ( @start_coords, $location->start );
push ( @end_coords, $location->end );
if ( $location->strand == -1 ) {
push ( @directions, "reverse" );
}
else {
push ( @directions, "forward" );
}
}
}
}
# gene products, notes, functions
if ( $feat_obj->primary_tag eq ( 'CDS' ) || $feat_obj->primary_tag eq (
'misc_feature' ) || $feat_obj->primary_tag eq ( 'ncRNA' ) ||
$feat_obj->primary_tag eq ( 'rRNA' ) || $feat_obj->primary_tag eq ( 'tRNA' )
|| $feat_obj->primary_tag eq ( 'misc_RNA' ) ) {
if ( $feat_obj->has_tag( 'product' ) ) {
for my $product ( $feat_obj->get_tag_values( 'product' ) ) {
push ( @products, $product );
}
}
else {
push ( @products, "" );
}
if ( $feat_obj->has_tag( 'note' ) ) {
for my $note ( $feat_obj->get_tag_values( 'note' ) ) {
push ( @notes, $note );
}
}
else {
push ( @notes, "" );
}
if ( $feat_obj->has_tag( 'function' ) ) {
for my $function ( $feat_obj->get_tag_values( 'function' ) ) {
push ( @functions, $function );
}
}
else {
push ( @functions, "" );
}
}
}
print
"locus\tgene_name\tstart_nt\tend_nt\tlength_nt\tdirection\tproduct\tnote\tfunction\tsequence_nt\n";
# header
for ( my $elem = 0; $elem < scalar @loci; ++$elem ) {
print $loci[$elem], "\t",$genes[$elem], "\t", $start_coords[$elem],
"\t", $end_coords[$elem], "\t", length( $seqs[$elem] ), "\t",
$directions[$elem], "\t", $products[$elem], "\t", $notes[$elem], "\t",
$functions[$elem], "\t", $seqs[$elem], "\n";
}
More information about the Bioperl-l
mailing list