[Bioperl-guts-l] bioperl commit

Lincoln Stein lstein at pub.open-bio.org
Wed Feb 19 21:45:14 EST 2003


lstein
Wed Feb 19 21:45:14 EST 2003
Update of /home/repository/bioperl/bioperl-live/Bio/DB/Flat
In directory pub.open-bio.org:/tmp/cvs-serv16801/Bio/DB/Flat

Modified Files:
	BDB.pm BinarySearch.pm 
Log Message:
added some OBDA documentation plus flat file index load script
bioperl-live/Bio/DB/Flat BDB.pm,1.8,1.9 BinarySearch.pm,1.4,1.5
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/DB/Flat/BDB.pm,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -r1.8 -r1.9
--- /home/repository/bioperl/bioperl-live/Bio/DB/Flat/BDB.pm	2003/02/18 02:30:36	1.8
+++ /home/repository/bioperl/bioperl-live/Bio/DB/Flat/BDB.pm	2003/02/20 02:45:14	1.9
@@ -212,7 +212,7 @@
   for my $file (@files) {
     $file = File::Spec->rel2abs($file)
       unless File::Spec->file_name_is_absolute($file);
-    $count++ if $self->_index_file($file);
+    $count += $self->_index_file($file);
   }
   $self->write_config;
   $count;
@@ -227,14 +227,16 @@
 
   my $fh     = $self->_fhcache($file) or $self->throw("could not open $file for indexing: $!");
   my $offset = 0;
+  my $count  = 0;
   while (!eof($fh)) {
     my ($ids,$adjustment)  = $self->parse_one_record($fh) or next;
     $adjustment ||= 0;  # prevent uninit variable warning
     my $pos = tell($fh) + $adjustment;
     $self->_store_index($ids,$file,$offset,$pos-$offset);
     $offset = $pos;
+    $count++;
   }
-  1;
+  $count;
 }
 
 =head2 To Be Implemented in Subclasses

===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/DB/Flat/BinarySearch.pm,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- /home/repository/bioperl/bioperl-live/Bio/DB/Flat/BinarySearch.pm	2003/02/18 03:31:53	1.4
+++ /home/repository/bioperl/bioperl-live/Bio/DB/Flat/BinarySearch.pm	2003/02/20 02:45:14	1.5
@@ -569,6 +569,36 @@
 
 }
 
+=head2 get_all_primary_ids
+
+ Title   : get_all_primary_ids
+ Usage   : @ids = $seqdb->get_all_primary_ids()
+ Function: gives an array of all the primary_ids of the
+           sequence objects in the database.
+ Example :
+ Returns : an array of strings
+ Args    : none
+
+=cut
+
+sub get_all_primary_ids {
+  my $self = shift;
+
+  my $fh = $self->primary_index_filehandle;
+  sysseek ($fh,0,2);
+  my $filesize = (tell $fh);
+  my $recsize  = $self->record_size;
+  my $end = $filesize;
+
+  my @ids;
+  for (my $pos=$self->{_start_pos}; $pos < $end; $pos += $recsize) {
+    my $record = $self->read_record($fh,$pos,$recsize);
+    my ($entryid)  = split(/\t/,$record);
+    push @ids,$entryid;
+  }
+  @ids;
+}
+
 
 =head2 find_entry
 
@@ -646,7 +676,7 @@
     if (!defined($rootdir)) {
 	$self->throw("No index directory set - can't build indices");
     }
-    
+
     if (! -d $rootdir) {
 	$self->throw("Index directory [$rootdir] is not a directory. Cant' build indices");
     }
@@ -660,28 +690,37 @@
     if (!(@files)) {
 	$self->throw("Must enter an array of filenames to index");
     }
-    
-    my $pwd = `pwd`; chomp($pwd);
 
     foreach my $file (@files) {
-	if ($file !~ /^\//) {
-	    $file = $pwd . "/$file";
-	}
+      $file = File::Spec->rel2abs($file)
+	unless File::Spec->file_name_is_absolute($file);
 	if (! -e $file) {
 	    $self->throw("Can't index file [$file] as it doesn't exist");
 	}
     }
-    
+
+    if (my $filehash = $self->{_dbfile}) {
+      push @files,keys %$filehash;
+    }
+
+    my %seen;
+    @files = grep {!$seen{$_}++} @files;
+
+    # Lets index
     $self->make_config_file(\@files);
-    
-    # Finally lets index
+    my $entries = 0;
     foreach my $file (@files) {
-	$self->_index_file($file);
+      $entries += $self->_index_file($file);
     }
 
+    # update alphabet if necessary
+    $self->make_config_file(\@files);
+
     # And finally write out the indices
     $self->write_primary_index;
     $self->write_secondary_indices;
+
+    $entries;
 }
 
 =head2 _index_file
@@ -713,7 +752,7 @@
     my $pos = 0;
 
     my $new_primary_entry;
-    
+
     my $length;
     #my $pos = 0;
     my $fh = \*FILE;
@@ -726,9 +765,11 @@
 
     while (<$fh>) {
       $last_one = $_;
+      $self->{alphabet} ||= $self->guess_alphabet($_);		
       if ($_ =~ /$start_pattern/) {
 	if ($done == 0) {
 	  $id = $new_primary_entry;
+	  $self->{alphabet} ||= $self->guess_alphabet($_);
 		
 	  my $tmplen = tell($fh) - length($_);
 
@@ -780,7 +821,6 @@
     # Remember to add in the last one
 
     $id = $new_primary_entry;
-		
     my $tmplen = tell($fh) - length($last_one);
 
     $length = $tmplen  - $pos;
@@ -799,8 +839,10 @@
     }
     
     $self->_add_id_position($id,$pos,$fileid,$length,\%secondary_id);
+    $count++;
     
     close(FILE);
+    $count;
 }
 
 =head2 write_primary_index
@@ -822,8 +864,6 @@
 
     @ids = sort {$a cmp $b} @ids;
 
-    print STDERR "Number of ids = " . scalar(@ids) . "\n";
-
     open (INDEX,">" . $self->primary_index_file) || $self->throw("Can't open primary index file [" . $self->primary_index_file . "]");
 
     my $recordlength = $self->{_maxidlength} +
@@ -1103,10 +1143,12 @@
     if (!defined($self->format)) {
 	$self->throw("Format does not exist in module - can't write config file");
     } else {
-	print CON "format\t" . $self->format . "\n";
+      my $format = $self->format;
+      my $alphabet = $self->alphabet;
+      my $alpha    = $alphabet ? "/$alphabet" : '';
+      print CON "format\t" . "URN:LSID:open-bio.org:$format$alpha\n";
     }
 
-
     close(CON);
 }
 
@@ -1185,9 +1227,16 @@
 
 	if ($_ =~ /format\t(\S+)/) {
 
-	    # Check the format here?
+	  # Check the format here?
+	  my $format = $1;
 
+	  # handle LSID format
+	  if ($format =~ /^URN:LSID:open-bio\.org:(\w+)(?:\/(\w+))/) {
+	    $self->format($1);
+	    $self->alphabet($2);
+	  } else {  # compatibility with older versions
 	    $self->format($1);
+	  }
 	}
     }
     close(CON);
@@ -1316,6 +1365,14 @@
 
 }
 
+sub alphabet{
+   my ($obj,$value) = @_;
+   if( defined $value) {
+      $obj->{alphabet} = $value;
+    }
+    return $obj->{alphabet};
+}
+
 =head2 write_flag
 
  Title   : write_flag
@@ -1642,6 +1699,32 @@
 
 }
 
+# EVERYTHING THAT FOLLOWS THIS
+# is an awful hack - in reality Michele's code needs to be rewritten
+# to use Bio::SeqIO, but I have too little time to do this -- LS
+sub guess_alphabet {
+  my $self = shift;
+  my $line = shift;
+
+  my $format = $self->format;
+  return 'protein' if $format eq 'swissprot';
+
+  if ($format eq 'genbank') {
+    return unless $line =~ /^LOCUS/;
+    return 'dna' if $line =~ /\s+\d+\s+bp/i;
+    return 'protein';
+  }
+
+  if ($format eq 'embl') {
+    return unless $line =~ /^ID/;
+    return 'dna' if $line =~ / DNA;/i;
+    return 'rna' if $line =~ / RNA;/i;
+    return 'protein';
+  }
+
+  return;
+}
+
 # return (namespace,primary_pattern,start_pattern,secondary_pattern)
 sub _guess_patterns {
   my $self = shift;



More information about the Bioperl-guts-l mailing list