Index: Bio/Perl.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/Perl.pm,v
retrieving revision 1.5
diff -a -u -r1.5 Perl.pm
--- Bio/Perl.pm	16 Feb 2002 21:16:43 -0000	1.5
+++ Bio/Perl.pm	22 Mar 2005 16:49:33 -0000
@@ -1,4 +1,4 @@
-
+# $Id: Perl.pm,v 1.20 2003/09/15 13:38:07 bosborne Exp $
 #
 # BioPerl module for Bio::Perl
 #
@@ -16,47 +16,52 @@
 
 =head1 SYNOPSIS
 
-   use Bio::Perl qw(read_sequence read_all_sequences write_sequence new_sequence get_sequence);
-
-# will guess file format from extension
-   $seq_object = read_sequence($filename); 
-
-# forces genbank format
-   $seq_object = read_sequence($filename,'genbank'); 
-
-# reads an array of sequences
-   @seq_object_array = read_all_sequences($filename,'fasta'); 
-
-# sequences are Bio::Seq objects, so the following methods work
-# for more info see L<Bio::Seq>, or do 'perldoc Bio/Seq.pm'
-
-   print "Sequence name is ",$seq_object->display_id,"\n";
-   print "Sequence acc  is ",$seq_object->accession_number,"\n";
-   print "First 5 bases is ",$seq_object->subseq(1,5),"\n";
-
-# get the whole sequence as a single string
-
-   $sequence_as_a_string = $seq_object->seq();
-
-# writing sequences
-
-   write_sequence(">$filename",'genbank',$seq_object);
+  use Bio::Perl;
 
-   write_sequence(">$filename",'genbank',@seq_object_array);
+  # will guess file format from extension
+  $seq_object = read_sequence($filename);
 
-# making a new sequence from just strings you have
-# from something else
+  # forces genbank format
+  $seq_object = read_sequence($filename,'genbank');
 
-   $seq_object = new_sequence("ATTGGTTTGGGGACCCAATTTGTGTGTTATATGTA","myname","AL12232");
+  # reads an array of sequences
+  @seq_object_array = read_all_sequences($filename,'fasta');
+
+  # sequences are Bio::Seq objects, so the following methods work
+  # for more info see Bio::Seq, or do 'perldoc Bio/Seq.pm'
+  print "Sequence name is ",$seq_object->display_id,"\n";
+  print "Sequence acc  is ",$seq_object->accession_number,"\n";
+  print "First 5 bases is ",$seq_object->subseq(1,5),"\n";
+
+  # get the whole sequence as a single string
+  $sequence_as_a_string = $seq_object->seq();
+
+  # writing sequences
+  write_sequence(">$filename",'genbank',$seq_object);
+  write_sequence(">$filename",'genbank',@seq_object_array);
+
+  # making a new sequence from just a string
+  $seq_object = new_sequence("ATTGGTTTGGGGACCCAATTTGTGTGTTATATGTA",
+      "myname","AL12232");
+
+  # getting a sequence from a database (assumes internet connection)
+  $seq_object = get_sequence('swissprot',"ROA1_HUMAN");
+  $seq_object = get_sequence('embl',"AI129902");
+  $seq_object = get_sequence('genbank',"AI129902");
+
+  # BLAST a sequence (assumes an internet connection)
+  $blast_report = blast_sequence($seq_object);
+
+  # BLAST one or more sequences (requires a local Blast server running wwwBlast)
+  # Default URL http://localhost/blast/Blast.cgi
+  my $seq_string = 'MKVDVGPDPSLVYRPDVDPEVAKDKASFRNYTSGPLLDRVFT';
+  my $localServerURL = 'http://127.0.0.1/blast/Blast.cgi';
+  my $localDB = 'test_aa_db';
+  $blast_report = wwwBlast_sequence($seq_object, $localDB);
+  $blast_report = wwwBlast_sequence($seq_string, $localServerURL, $localDB);
 
-
-# getting a sequence from a database (assumes internet connection)
-
-   $seq_object = get_sequence('swissprot',"ROA1_HUMAN");
-
-   $seq_object = get_sequence('embl',"AI129902");
-
-   $seq_object = get_sequence('genbank',"AI129902");
+  # writing out a blast report
+  write_blast(">blast.out",$blast_report);
 
 
 =head1 DESCRIPTION
@@ -80,7 +85,7 @@
 or the web:
 
   bioperl-bugs@bio.perl.org
-  http://bio.perl.org/bioperl-bugs/
+  http://bugzilla.bioperl.org/
 
 =head1 AUTHOR - Ewan Birney
 
@@ -90,7 +95,7 @@
 
 =head1 APPENDIX
 
-The rest of the documentation details each of the object methods. 
+The rest of the documentation details each of the object methods.
 Internal methods are usually preceded with a _
 
 =cut
@@ -100,19 +105,20 @@
 
 
 package Bio::Perl;
-use vars qw(@ISA @EXPORT_OK $DBOKAY);
+use vars qw(@ISA @EXPORT @EXPORT_OK $DBOKAY);
 use strict;
 use Carp;
 use Exporter;
 
 use Bio::SeqIO;
 use Bio::Seq;
-BEGIN { 
-    eval { 
+BEGIN {
+    eval {
 	require Bio::DB::EMBL;
 	require Bio::DB::GenBank;
 	require Bio::DB::SwissProt;
 	require Bio::DB::RefSeq;
+	require Bio::DB::GenPept;
     };
     if( $@ ) {
 	$DBOKAY = 0;
@@ -123,8 +129,13 @@
 
 @ISA = qw(Exporter);
 
-@EXPORT_OK = qw(read_sequence read_all_sequences write_sequence 
-		new_sequence get_sequence);
+@EXPORT = qw(read_sequence read_all_sequences write_sequence 
+	     new_sequence get_sequence translate translate_as_string 
+	     reverse_complement revcom revcom_as_string 
+	     reverse_complement_as_string blast_sequence 
+	     wwwBlast_sequence write_blast);
+
+@EXPORT_OK = @EXPORT;
 
 
 =head2 read_sequence
@@ -247,13 +258,18 @@
    my $error = 0;
    my $seqname = "sequence1";
 
+   # catch users who haven't passed us a filename we can open
+   if( $filename !~ /^\>/ && $filename !~ /^|/ ) {
+       $filename = ">".$filename;
+   }
+
    my $seqio = Bio::SeqIO->new('-file' => $filename, '-format' => $format);
 
    foreach my $seq ( @sequence_objects ) {
        my $seq_obj;
 
        if( !ref $seq ) {
-	   if( length $seq > 50 ) { 
+	   if( length $seq > 50 ) {
 	       # odds are this is a sequence as a string, and someone has not figured out
 	       # how to make objects. Warn him/her and then make a sequence object
 	       # from this
@@ -286,7 +302,7 @@
  Usage   :
  Function:
  Example :
- Returns : 
+ Returns :
  Args    :
 
 
@@ -308,14 +324,245 @@
    return $seq_object;
 }
 
+=head2 blast_sequence
+
+ Title   : blast_sequence
+ Usage   : $blast_result = blast_sequence($seq)
+           $blast_result = blast_sequence('MFVEGGTFASEDDDSASAEDE');
+
+ Function: If the computer has Internet accessibility, blasts
+           the sequence using the NCBI BLAST server against nrdb.
+
+           Any additional parameters specified for the GET or PUT are passed
+           as is to Bio::Tools::Run::RemoteBlast. By default, runs blastp
+           with E-val = 1e-10.
+
+           This function uses Bio::Tools::Run::RemoteBlast, which itself
+           use Bio::SearchIO - as soon as you want to know more, check out
+           these modules.
+
+ Returns : Bio::Search::Result::GenericResult
+
+ Args    : $seq = A string of protein letters or nucleotides
+                  or a Bio::Seq object.
+           1 (verbose) or 0 (quiet)
+           @args = optional PUT and GET arguments to pass to RemoteBlast
+                   (e.g. '-prog' => 'blastn', '-expect' => '1e-3',  
+                    '-descriptions' => '25'). 
+           For a description of the many possible parameters see:
+           http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html or the BEGIN 
+           block of Bio::Tools::Run::Tools::RemoteBlast.
+
+=cut
+
+sub blast_sequence {
+    my ($seq, $verbose, @params) = @_;
+
+    if ( !defined $verbose ) {
+	$verbose = 1;
+    } elsif ( $verbose =~ /^-/ ) {
+	# $verbose is actually tag for first argument restore it
+	# where it belongs
+	unshift @params, $verbose;
+	$verbose = 1;
+    }
+
+    if( !ref $seq ) {
+	$seq = Bio::Seq->new('-seq' => $seq, '-id' => 'blast-sequence-temp-id');
+    } elsif ( !$seq->isa('Bio::PrimarySeqI') ) {
+	croak("[$seq] is an object, but not a Bio::Seq object, cannot be blasted");
+    }
+
+    require Bio::Tools::Run::RemoteBlast;
+
+    # set some default values which are different from those in RemoteBlast
+    # unless their values are passed as parameters
+    my ($prog_found, $expect_found);
+    my $count = 0;
+    for my $arg ( @params ) {
+	if ( $arg =~ /-prog/ ) {
+	    $prog_found = 1;
+	    if ( ++$count > 1) { 
+		last;
+	    }
+	} elsif ( $arg =~ /-expect/ ) {
+	    $expect_found = 1;
+	    if ( ++$count > 1) { 
+		last;
+	    }
+	}
+    }
+    push @params, ('-prog', 'blastp') unless $prog_found; 
+    push @params, ('-expect', '1e-10') unless $expect_found;
+    push @params, ('-readmethod', 'SearchIO');
+
+    my $factory = Bio::Tools::Run::RemoteBlast->new(@params);
+
+    my $r = $factory->submit_blast($seq);
+    if( $verbose ) {
+	print STDERR "Submitted Blast for [".$seq->id."] ";
+    }
+    sleep 5;
+
+    my $result;
+
+    LOOP :
+    while( my @rids = $factory->each_rid) {
+	foreach my $rid ( @rids ) {
+	    my $rc = $factory->retrieve_blast($rid);
+	    if( !ref($rc) ) {
+		if( $rc < 0 ) {
+		    $factory->remove_rid($rid);
+		}
+		if( $verbose ) {
+		    print STDERR ".";
+		}
+		sleep 10;
+	    } else {
+		$result = $rc->next_result();
+		$factory->remove_rid($rid);
+		last LOOP;
+	    }
+	}
+    }
+
+    if( $verbose ) {
+	print STDERR "\n";
+    }
+    return $result;
+}
+
+=head2 wwwBlast_sequence
+
+ Title   : wwwBlast_sequence
+
+ Usage   : $blast_result = wwwBlast_sequence($seq_obj)
+           $blast_result = wwwBlast_sequence($seq, $localDB, $localServerURL)
+           $blast_result = wwwBlast_sequence($filename)
+
+ Function: Blasts the sequence using an accessible wwwBlast server against
+           an optionally specified local database. Default server is
+           http://localhost/blast/Blast.cgi. Default db is test_na_db. Default
+           program is blastn.
+
+           Any additional parameters specified for the GET or PUT are passed
+           as is to Bio::Tools::Run::LocalServerBlast. Default values are those
+           of Bio::Tools::Run::LocalServerBlast.
+
+           This function uses Bio::Tools::Run::LocalServerBlast, which itself
+           uses Bio::SearchIO - as soon as you want to know more, check out
+           these modules.
+
+ Returns : -1 on error
+           An array of Bio::Search::Result::GenericResult
+
+ Args    : $seq = A string of protein letters or nucleotides, the name of
+                  of a file containing one or more Fasta-formatted sequences, 
+                  or a Bio::Seq object.
+           1 (verbose) or 0 (quiet)
+           @args = optional arguments, including the URL of the local Blast
+                   server ('-server' => 'http://localhost/blast/Blast.cgi'),
+                   the name of the database to blast against ('-database' => 
+		   'my_db'), or PUT and GET arguments to pass to LocalServerBlast
+                   (e.g. HITLIST_SIZE => '250', EXPECT => '1e-6',
+		    DESCRIPTIONS => '25'). 
+           For a description of the many possible parameters see:
+           http://athena.bioc.uvic.ca/blast/readme.html#Installation or the BEGIN 
+           block of Bio::Tools::Run::Tools::LocalServerBlast.
+
+=cut
+
+sub wwwBlast_sequence {
+
+    my ($seq, $verbose, @params) = @_;
+
+    # do a quick check on $seq
+    if ( !defined $seq ) {
+	confess("No sequence provided. Can't Blast.\n");
+    } elsif ( ref $seq) {
+	$seq->isa('Bio::PrimarySeqI') or
+	    croak("[$seq] is an object but not a Bio::Seq object so cannot be Blasted\n");
+    } elsif ( ! -e $seq ) {
+	# not a valid filename so probably a sequence-in-a-string
+	$seq = Bio::Seq->new( '-seq' => $seq, '-id' => 'default ID');
+    }
+
+    if ( !defined $verbose ) {
+	$verbose = 1;
+    } elsif ( $verbose =~ /^\D+/ ) {
+	# $verbose is actually tag for first argument so restore it to
+	# where it belongs
+	unshift @params, $verbose;
+	$verbose = 1;
+    }
+
+    require Bio::Tools::Run::LocalServerBlast;
+    # set the server address if it's been passed. Kludge for compatibility: also
+    # check whether ALIGNMENT_VIEW set using text rather than number and fix if so
+    my $index = 0;
+    my %alignment_options =  ('Pairwise' => 0,
+			      'QueryAnchored' => 1,
+			      'QueryAnchoredNoIdentitites' => 2,
+			      'FlatQueryAnchored' => 3,
+			      'FlatQueryAnchoredNoIdentities' => 4,
+			      'BlastXML' => 7,
+			      'Tabular' => 9);
+    for my $arg ( @params ) {
+	if ( $arg =~ /-server/i ) {
+	    $Bio::Tools::Run::LocalServerBlast::URLBASE = $params[$index];
+	    last;
+	} elsif ( ($arg =~ /alignment_view/i) && ($params[$index+1] !~ /[0-9]/) ) {
+	    $params[$index+1] = $alignment_options{$params[$index+1]};
+	}
+	$index++;
+    }
+    push @params, ('-readmethod', 'SearchIO');
+
+    my $factory = Bio::Tools::Run::LocalServerBlast->new(@params);
+    print STDOUT "Job\(s\) submitted.\n" if $verbose;
+    my @r = $factory->submit_blast($seq);
+    
+    return @r;
+}
+
+=head2 write_blast
+
+ Title   : write_blast
+ Usage   : write_blast($filename,$blast_report);
+
+ Function: Writes a BLAST result object (or more formally
+           a SearchIO result object) out to a filename
+           in BLAST-like format
+
+ Returns : none
+
+ Args    : filename as a string
+           Bio::SearchIO::Results object
+
+=cut
+
+sub write_blast {
+    my ($filename,$blast) = @_;
+
+    if( $filename !~ /^\>/ && $filename !~ /^|/ ) {
+	$filename = ">".$filename;
+    }
+
+    my $output = Bio::SearchIO->new( -output_format => 'blast', -file => $filename);
+
+    $output->write_result($blast);
+
+}
+
 =head2 get_sequence
 
  Title   : get_sequence
  Usage   : $seq_object = get_sequence('swiss',"ROA1_HUMAN");
 
- Function: If the computer has Internet accessibility, gets
+ Function: If the computer has Internet access this method gets
            the sequence from Internet accessible databases. Currently
-           this supports Swissprot, EMBL, GenBank and RefSeq.
+           this supports Swissprot ('swiss'), EMBL ('embl'), GenBank
+           ('genbank'), GenPept ('genpept'), and RefSeq ('refseq').
 
            Swissprot and EMBL are more robust than GenBank fetching.
 
@@ -324,12 +571,13 @@
 
  Returns : A Bio::Seq object
 
- Args    : database type - one of swiss, embl, genbank or refseq
-           identifier or accession number
+ Args    : database type - one of swiss, embl, genbank, genpept, or 
+           refseq
 
 =cut
 
 my $genbank_db = undef;
+my $genpept_db = undef;
 my $embl_db = undef;
 my $swiss_db = undef;
 my $refseq_db = undef;
@@ -337,38 +585,45 @@
 sub get_sequence{
    my ($db_type,$identifier) = @_;
    if( ! $DBOKAY ) {
-       confess("Your system does not have IO::String installed so the DB retrieval method is not available");
+       confess ("Your system does not have one of LWP, HTTP::Request::Common, IO::String installed so the DB retrieval method is not available.  \nFull error message is:\n $!\n");
        return;
    }
    $db_type = lc($db_type);
 
    my $db;
 
-   if( $db_type =~ /gen/ ) {
+   if( $db_type =~ /genbank/ ) {
        if( !defined $genbank_db ) {
 	   $genbank_db = Bio::DB::GenBank->new();
-       } 
+       }
        $db = $genbank_db;
    }
+   if( $db_type =~ /genpept/ ) {
+       if( !defined $genpept_db ) {
+	   $genpept_db = Bio::DB::GenPept->new();
+       } 
+       $db = $genpept_db;
+   }
 
    if( $db_type =~ /swiss/ ) {
        if( !defined $swiss_db ) {
 	   $swiss_db = Bio::DB::SwissProt->new();
-       } 
+       }
        $db = $swiss_db;
    }
 
    if( $db_type =~ /embl/ ) {
        if( !defined $embl_db ) {
 	   $embl_db = Bio::DB::EMBL->new();
-       } 
+       }
        $db = $embl_db;
    }
 
-   if( $db_type =~ /refseq/ or ($db_type !~ /swiss/ and $identifier =~ /_/)) {
+   if( $db_type =~ /refseq/ or ($db_type !~ /swiss/ and 
+				$identifier =~ /^\s*N\S+_/)) {
        if( !defined $refseq_db ) {
 	   $refseq_db = Bio::DB::RefSeq->new();
-       } 
+       }
        $db = $refseq_db;
    }
 
@@ -383,4 +638,181 @@
    return $seq;
 }
 
+=head2 translate
+
+ Title   : translate
+ Usage   : $seqobj = translate($seq_or_string_scalar)
+
+ Function: translates a DNA sequence object OR just a plain
+           string of DNA to amino acids
+ Returns : A Bio::Seq object
+
+ Args    : Either a sequence object or a string of
+           just DNA sequence characters
+
+=cut
+
+sub translate {
+   my ($scalar) = shift;
+
+   my $obj;
+
+   if( ref $scalar ) {
+     if( !$scalar->isa("Bio::PrimarySeqI") ) {
+        confess("Expecting a sequence object not a $scalar");
+     } else {
+        $obj= $scalar;
+
+     }
+
+   } else {
+
+     # check this looks vaguely like DNA
+     my $n = ( $scalar =~ tr/ATGCNatgc/ATGCNatgcn/ );
+
+     if( $n < length($scalar) * 0.85 ) {
+       confess("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me");
+     }
+
+     $obj = Bio::PrimarySeq->new(-id => 'internalbioperlseq',-seq => $scalar);
+   }
+
+   return $obj->translate();
+}
+
+
+=head2 translate_as_string
+
+ Title   : translate_as_string
+ Usage   : $seqstring = translate_as_string($seq_or_string_scalar)
+
+ Function: translates a DNA sequence object OR just a plain
+           string of DNA to amino acids
+ Returns : A stirng of just amino acids
+
+ Args    : Either a sequence object or a string of
+           just DNA sequence characters
+
+=cut
+
+sub translate_as_string {
+   my ($scalar) = shift;
+
+   my $obj = Bio::Perl::translate($scalar);
+
+   return $obj->seq;
+}
+
+
+=head2 reverse_complement
+
+ Title   : reverse_complement
+ Usage   : $seqobj = reverse_complement($seq_or_string_scalar)
+
+ Function: reverse complements a string or sequence argument
+           producing a Bio::Seq - if you want a string, you
+           can use reverse_complement_as_string
+ Returns : A Bio::Seq object
+
+ Args    : Either a sequence object or a string of
+           just DNA sequence characters
+
+=cut
+
+sub reverse_complement {
+   my ($scalar) = shift;
+
+   my $obj;
+
+   if( ref $scalar ) {
+     if( !$scalar->isa("Bio::PrimarySeqI") ) {
+        confess("Expecting a sequence object not a $scalar");
+     } else {
+        $obj= $scalar;
+
+     }
+
+   } else {
+
+     # check this looks vaguely like DNA
+     my $n = ( $scalar =~ tr/ATGCNatgc/ATGCNatgcn/ );
+
+     if( $n < length($scalar) * 0.85 ) {
+       confess("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me");
+     }
+
+     $obj = Bio::PrimarySeq->new(-id => 'internalbioperlseq',-seq => $scalar);
+   }
+
+   return $obj->revcom();
+}
+
+=head2 revcom
+
+ Title   : revcom
+ Usage   : $seqobj = revcom($seq_or_string_scalar)
+
+ Function: reverse complements a string or sequence argument
+           producing a Bio::Seq - if you want a string, you
+           can use reverse_complement_as_string
+
+           This is an alias for reverse_complement
+ Returns : A Bio::Seq object
+
+ Args    : Either a sequence object or a string of
+           just DNA sequence characters
+
+=cut
+
+sub revcom {
+    return &Bio::Perl::reverse_complement(@_);
+}
+
+
+=head2 reverse_complement_as_string
+
+ Title   : reverse_complement_as_string
+ Usage   : $string = reverse_complement_as_string($seq_or_string_scalar)
+
+ Function: reverse complements a string or sequence argument
+           producing a string
+ Returns : A string of DNA letters
+
+ Args    : Either a sequence object or a string of
+           just DNA sequence characters
+
+=cut
+
+sub reverse_complement_as_string {
+   my ($scalar) = shift;
+
+   my $obj = &Bio::Perl::reverse_complement($scalar);
+
+   return $obj->seq;
+}
+
+
+=head2 revcom_as_string
+
+ Title   : revcom_as_string
+ Usage   : $string = revcom_as_string($seq_or_string_scalar)
+
+ Function: reverse complements a string or sequence argument
+           producing a string
+ Returns : A string of DNA letters
+
+ Args    : Either a sequence object or a string of
+           just DNA sequence characters
+
+=cut
+
+sub revcom_as_string {
+   my ($scalar) = shift;
+
+   my $obj = &Bio::Perl::reverse_complement($scalar);
+
+   return $obj->seq;
+}
+
+
 1;
