[Bioperl-guts-l] [15624] bioperl-live/trunk: fixed round-tripping of gff3 format when a feature has multiple parentage.
Lincoln Stein
lstein at dev.open-bio.org
Sat Apr 4 04:57:41 EDT 2009
Revision: 15624
Author: lstein
Date: 2009-04-04 04:57:40 -0400 (Sat, 04 Apr 2009)
Log Message:
-----------
fixed round-tripping of gff3 format when a feature has multiple parentage.
Modified Paths:
--------------
bioperl-live/trunk/Bio/DB/SeqFeature/NormalizedFeature.pm
bioperl-live/trunk/Bio/DB/SeqFeature/Store/GFF3Loader.pm
bioperl-live/trunk/Bio/SeqFeature/Lite.pm
bioperl-live/trunk/t/LocalDB/SeqFeature.t
bioperl-live/trunk/t/data/seqfeaturedb/test.gff3
Modified: bioperl-live/trunk/Bio/DB/SeqFeature/NormalizedFeature.pm
===================================================================
--- bioperl-live/trunk/Bio/DB/SeqFeature/NormalizedFeature.pm 2009-04-02 15:51:07 UTC (rev 15623)
+++ bioperl-live/trunk/Bio/DB/SeqFeature/NormalizedFeature.pm 2009-04-04 08:57:40 UTC (rev 15624)
@@ -471,8 +471,11 @@
# undo the load_id and Target hacks on the way out
sub format_attributes {
my $self = shift;
- my $parent = shift;
+ my $parent = shift;
+ my $fallback_id = shift;
+
my $load_id = $self->load_id || '';
+
my $targobj = ($self->attributes('Target'))[0];
# was getting an 'Use of uninitialized value with split' here, changed to cooperate -cjf 7/10/07
my ($target) = $targobj ? split /\s+/,($self->attributes('Target'))[0] : ('');
@@ -491,10 +494,14 @@
foreach (@values) { s/\s+$// } # get rid of trailing whitespace
push @result,join '=',$self->escape($t),join(',', map {$self->escape($_)} @values) if @values;
}
- my $id = $self->primary_id;
+ my $id = $self->primary_id || $fallback_id;
+ my $parent_id;
+ if (@$parent) {
+ $parent_id = join (',',map {$self->escape($_)} @$parent);
+ }
my $name = $self->display_name;
- unshift @result,"ID=".$self->escape($id) if defined $id;
- unshift @result,"Parent=".$self->escape($parent->primary_id) if defined $parent;
+ unshift @result,"ID=".$self->escape($id) if defined $id;
+ unshift @result,"Parent=".$parent_id if defined $parent_id;
unshift @result,"Name=".$self->escape($name) if defined $name;
return join ';', at result;
}
Modified: bioperl-live/trunk/Bio/DB/SeqFeature/Store/GFF3Loader.pm
===================================================================
--- bioperl-live/trunk/Bio/DB/SeqFeature/Store/GFF3Loader.pm 2009-04-02 15:51:07 UTC (rev 15623)
+++ bioperl-live/trunk/Bio/DB/SeqFeature/Store/GFF3Loader.pm 2009-04-04 08:57:40 UTC (rev 15624)
@@ -570,16 +570,20 @@
# contiguous feature, so add a segment
warn $old_feat if defined $old_feat and !ref $old_feat;
- if (defined $old_feat &&
- (
- $old_feat->seq_id ne $refname ||
- $old_feat->start != $start ||
- $old_feat->end != $end # make sure endpoints are distinct
- )
- )
- {
- $self->add_segment($old_feat,$self->sfclass->new(@args));
- return;
+ if (defined $old_feat) {
+ # set this to 1 to disable split-location behavior
+ if (0 && @parent_ids) { # If multiple features are held together by the same ID
+ $feature_id = $ld->{TemporaryID}++; # AND they have a Parent attribute, this causes an undesirable
+ } # additional layer of aggregation. Changing the ID fixes this.
+ elsif (
+ $old_feat->seq_id ne $refname ||
+ $old_feat->start != $start ||
+ $old_feat->end != $end # make sure endpoints are distinct
+ )
+ {
+ $self->add_segment($old_feat,$self->sfclass->new(@args));
+ return;
+ }
}
# we get here if this is a new feature
@@ -597,10 +601,6 @@
my $has_id = defined $reserved->{ID}[0];
$index_it ||= $top_level;
-# $ld->{IndexIt}{$feature_id}++ if $index_it;
-# $ld->{TopLevel}{$feature_id}++ if !$self->{fast}
-# && $top_level; # need to track top level features
-
my $helper = $ld->{Helper};
$helper->indexit($feature_id=>1) if $index_it;
$helper->toplevel($feature_id=>1) if !$self->{fast}
Modified: bioperl-live/trunk/Bio/SeqFeature/Lite.pm
===================================================================
--- bioperl-live/trunk/Bio/SeqFeature/Lite.pm 2009-04-02 15:51:07 UTC (rev 15623)
+++ bioperl-live/trunk/Bio/SeqFeature/Lite.pm 2009-04-04 08:57:40 UTC (rev 15624)
@@ -266,7 +266,10 @@
-type => $type,
-name => $name,
-class => $class,
+ -phase => $self->{phase},
+ -score => $self->{score},
-source_tag => $source_tag,
+ -attributes => $self->{attributes},
);
$min_start = $start if $start < $min_start;
$max_stop = $stop if $stop > $max_stop;
@@ -280,9 +283,6 @@
}
if (@segments) {
local $^W = 0; # some warning of an uninitialized variable...
- # this was killing performance!
- # $self->{segments} = [ sort {$a->start <=> $b->start } @segments ];
- # this seems much faster and seems to work still
$self->{segments} = \@segments;
$self->{ref} ||= $self->{segments}[0]->seq_id;
$self->{start} = $min_start;
@@ -514,7 +514,8 @@
my $self = shift;
my $d = $self->{primary_id};
$self->{primary_id} = shift if @_;
- $d;
+ return $d if defined $d;
+ return (overload::StrVal($self) =~ /0x([a-f0-9]+)/)[0];
}
sub notes {
@@ -677,67 +678,66 @@
$string;
}
+# Suggested strategy for dealing with the multiple parentage issue.
+# First recurse through object tree and record parent tree.
+# Then recurse again, skipping objects we've seen before.
sub gff3_string {
- my ($self, $recurse, $preserveHomegenousParent, $dontPropogateParentAttrs,
- # Note: the following parameters, whose name begins with '$_',
- # are intended for recursive call only.
- $_parent,
- $_parentGroup, # if so, what is the group (GFF column 9) of the parent
- ) = @_;
+ my ($self,$recurse,$parent_tree,$seenit,$force_id) = @_;
+ $parent_tree ||= {};
+ $seenit ||= {};
+ my @rsf = ();
+ my @parent_ids;
- # PURPOSE: Return GFF3 format for the feature $self. Optionally
- # $recurse to include GFF for any subfeatures of the feature. If
- # recursing, provide special handling to "remove an extraneous level
- # of parentage" (unless $preserveHomegenousParent) for features
- # which have at least one subfeature with the same type as the
- # feature itself (thus redefining Lincoln's "homogenous
- # parent/child" case, which previously required all children to have
- # the same type as parent). This usage is a convention for
- # representing discontiguous features; they may be created by using
- # the -segment directive without specifying a distinct -subtype to
- # Bio::SeqFeature::LiteBase->new (or to Bio::DB::SeqFeature,
- # Bio::SeqFeature::Lite). Such homogenous subfeatures created in
- # this fashion TYPICALLY do not have the parent (GFF column 9)
- # attributes propogated to them; but, since they are all part of the
- # same parent, the ONLY difference relevant to GFF production SHOULD
- # be the $start and $end coordinates for their segment, and ALL
- # THEIR OTHER ATTRIBUTES should be taken from the parent (including:
- # score, Name, ID, Parent, etc), which happens UNLESS
- # $dontPropogateParentAttrs is passed.
+ if ($recurse) {
+ $self->_traverse($parent_tree) unless %$parent_tree; # this will record parents of all children
+ my $primary_id = defined $force_id ? $force_id : $self->primary_id;
- my @rsf = $recurse ? $self->sub_SeqFeature : ();
- my $recurseSubfeatureWithSameType =
- # will be TRUE if we're going to recurse and at least 1 subfeature
- # has same type as $self.
- sub {($_->type eq $self->type) && return 1 for @rsf ; 0 }->();
- my $typeIsSameAsParent = $_parent && ($_parent->type eq $self->type);
- my $hparentOrSelf = ($typeIsSameAsParent && ! $dontPropogateParentAttrs) ? $_parent : $self;
- my $group = ($typeIsSameAsParent && ! $dontPropogateParentAttrs)
- ? $_parentGroup
- : $self->format_attributes($_parent);
+ return if $seenit->{$primary_id}++;
- my @gff3 = $recurseSubfeatureWithSameType && ! $preserveHomegenousParent ? () :
- do {
- my $name = $hparentOrSelf->name;
- my $class = $hparentOrSelf->class;
- my $strand = ('-','.','+')[$hparentOrSelf->strand+1];
- # TODO: understand conditions under which $self->strand could be other than
- # $hparentOrSelf->strand. In particular, why does add_segment flip
- # the strand when start > stop? I thought this was not allowed!
- # Lincoln - any ideas?
- my $p = join("\t",
- $hparentOrSelf->ref||'.',$hparentOrSelf->source||'.',$hparentOrSelf->method||'.',
- $self->start||'.',$self->stop||'.',
- defined($hparentOrSelf->score) ? $hparentOrSelf->score : '.',
- $strand||'.',
- defined($hparentOrSelf->phase) ? $hparentOrSelf->phase : '.',
- $group||'');
- $p;
- };
- join("\n", @gff3, map {$_->gff3_string($recurse,$preserveHomegenousParent,
- $dontPropogateParentAttrs,$hparentOrSelf,$group)} @rsf);
+ @rsf = $self->get_SeqFeatures;
+ if (@rsf) {
+ # Detect case in which we have a split location feature. In this case we
+ # skip to the grandchildren and trick them into thinking that our parent is theirs.
+ my %types = map {$_->primary_tag=>1} @rsf;
+ my @types = keys %types;
+ if (@types == 1 && $types[0] eq $self->primary_tag) {
+ return join ("\n",map {$_->gff3_string(1,$parent_tree,{},$primary_id)} @rsf);
+ }
+ }
+
+ @parent_ids = keys %{$parent_tree->{$primary_id}};
+ }
+
+ my $group = $self->format_attributes(\@parent_ids,$force_id);
+ my $name = $self->name;
+
+ my $class = $self->class;
+ my $strand = ('-','.','+')[$self->strand+1];
+ my $p = join("\t",
+ $self->seq_id||'.',
+ $self->source||'.',
+ $self->method||'.',
+ $self->start||'.',
+ $self->stop||'.',
+ defined($self->score) ? $self->score : '.',
+ $strand||'.',
+ defined($self->phase) ? $self->phase : '.',
+ $group||'');
+ return join("\n",
+ $p,
+ map {$_->gff3_string(1,$parent_tree,$seenit)} @rsf);
}
+sub _traverse {
+ my $self = shift;
+ my $tree = shift; # tree => {$child}{$parent} = 1
+ my $parent = shift;
+ my $id = $self->primary_id;
+ defined $id or return;
+ $tree->{$id}{$parent->primary_id}++ if $parent;
+ $_->_traverse($tree,$self) foreach $self->get_SeqFeatures;
+}
+
sub db { return }
sub source_tag {
@@ -798,18 +798,26 @@
}
sub format_attributes {
- my $self = shift;
- my $parent = shift;
+ my $self = shift;
+ my $parent = shift;
+ my $fallback_id = shift;
+
@@ Diff output truncated at 10000 characters. @@
More information about the Bioperl-guts-l
mailing list