[Biopython-dev] [patch] NCBIStandalone.py cleanup

Fri Aug 8 06:09:03 EDT 2003

Hi,

Here's a patch bringing Bio.Blast.NCBIStandalone nearer to Python 3 
compatibility :-)

Changelog:
* Don't use the string module
* compare .find() results against -1 (== -1 / != -1) instead of < 0 / >= 0
* use .startswith() and .endswith() instead of slice comparisons

Regards,

yves
-------------- next part --------------
Index: NCBIStandalone.py
===================================================================
RCS file: /home/repository/biopython/biopython/Bio/Blast/NCBIStandalone.py,v
retrieving revision 1.47
diff -u -p -r1.47 NCBIStandalone.py

--- NCBIStandalone.py	2003/06/09 02:12:09	1.47
+++ NCBIStandalone.py	2003/08/08 10:04:35
@@ -34,9 +34,7 @@ blastpgp        Execute blastpgp.
 """
 
 import os
-import string
 import re
-from types import *
 
 from Bio import File
 from Bio.ParserSupport import *
@@ -145,9 +143,9 @@ class _Scanner:
 
         while 1:
             line = safe_peekline(uhandle)
-            if line[:9] != 'Searching' and \
-               re.search(r"Score +E", line) is None and \
-               string.find(line, 'No hits found') < 0:
+            if (not line.startswith('Searching') and
+                re.search(r"Score +E", line) is None and
+                line.find('No hits found') == -1):
                 break
 
             self._scan_descriptions(uhandle, consumer)
@@ -197,7 +195,7 @@ class _Scanner:
         # Check for these error lines and ignore them for now.  Let
         # the BlastErrorParser deal with them.
         line = uhandle.peekline()
-        if line.find("ERROR:") >= 0 or line.startswith("done"):
+        if line.find("ERROR:") != -1 or line.startswith("done"):
             read_and_call_while(uhandle, consumer.noevent, contains="ERROR:")
             read_and_call(uhandle, consumer.noevent, start="done")
 
@@ -256,7 +254,7 @@ class _Scanner:
 
         # Read the descriptions and the following blank lines, making
         # sure that there are descriptions.
-        if uhandle.peekline()[:19] != 'Sequences not found':
+        if not uhandle.peekline().startswith('Sequences not found'):
             read_and_call_until(uhandle, consumer.description, blank=1)
             read_and_call_while(uhandle, consumer.noevent, blank=1)
 
@@ -269,7 +267,7 @@ class _Scanner:
             # Read the descriptions and the following blank lines.
             read_and_call_while(uhandle, consumer.noevent, blank=1)
             l = safe_peekline(uhandle)
-            if l[:9] != 'CONVERGED' and l[0] != '>':
+            if not l.startswith('CONVERGED') and l[0] != '>':
                 read_and_call_until(uhandle, consumer.description, blank=1)
                 read_and_call_while(uhandle, consumer.noevent, blank=1)
 
@@ -281,7 +279,7 @@ class _Scanner:
     def _scan_alignments(self, uhandle, consumer):
         # First, check to see if I'm at the database report.
         line = safe_peekline(uhandle)
-        if line[:10] == '  Database':
+        if line.startswith('  Database'):
             return
         elif line[0] == '>':
             # XXX make a better check here between pairwise and masterslave
@@ -305,7 +303,7 @@ class _Scanner:
         # Scan a bunch of score/alignment pairs.
         while 1:
             line = safe_peekline(uhandle)
-            if line[:6] != ' Score':
+            if not line.startswith(' Score'):
                 break
             self._scan_hsp(uhandle, consumer)
         consumer.end_alignment()
@@ -318,7 +316,7 @@ class _Scanner:
         read_and_call(uhandle, consumer.title, start='>')
         while 1:
             line = safe_readline(uhandle)
-            if string.lstrip(line)[:8] == 'Length =':
+            if line.lstrip().startswith('Length ='):
                 consumer.length(line)
                 break
             elif is_blank_line(line):
@@ -372,7 +370,7 @@ class _Scanner:
             read_and_call_while(uhandle, consumer.noevent, blank=1)
             line = safe_peekline(uhandle)
             # Alignment continues if I see a 'Query' or the spaces for Blastn.
-            if line[:5] != 'Query' and line[:5] != '     ':
+            if not (line.startswith('Query') or line.startswith('     ')):
                 break
  
     def _scan_masterslave_alignment(self, uhandle, consumer):
@@ -382,10 +380,10 @@ class _Scanner:
             # Check to see whether I'm finished reading the alignment.
             # This is indicated by 1) database section, 2) next psi-blast round
             # patch by chapmanb
-            if line[:9] == 'Searching':
+            if line.startswith('Searching'):
                 uhandle.saveline(line)
                 break
-            elif line[:10] == '  Database':
+            elif line.startswith('  Database'):
                 uhandle.saveline(line)
                 break
             elif is_blank_line(line):
@@ -423,7 +421,7 @@ class _Scanner:
 
 	    line = safe_readline(uhandle)
 	    uhandle.saveline(line)
-            if string.find(line, 'Lambda') >= 0:
+            if line.find('Lambda') != -1:
 		break
 
 	read_and_call(uhandle, consumer.noevent, start='Lambda')
@@ -577,22 +575,22 @@ class _HeaderConsumer:
         self._header = Record.Header()
         
     def version(self, line):
-        c = string.split(line)
+        c = line.split()
         self._header.application = c[0]
         self._header.version = c[1]
         self._header.date = c[2][1:-1]
 
     def reference(self, line):
-        if line[:11] == 'Reference: ':
+        if line.startswith('Reference: '):
             self._header.reference = line[11:]
         else:
             self._header.reference = self._header.reference + line
             
     def query_info(self, line):
-        if line[:7] == 'Query= ':
+        if line.startswith('Query= '):
             self._header.query = line[7:]
-        elif line[:7] != '       ':  # continuation of query_info
-            self._header.query = self._header.query + line
+        elif not line.startswith('       '):  # continuation of query_info
+            self._header.query = "%s%s" % (self._header.query, line)
         else:
             letters, = _re_search(
                 r"([0-9,]+) letters", line,
@@ -600,11 +598,11 @@ class _HeaderConsumer:
             self._header.query_letters = _safe_int(letters)
                 
     def database_info(self, line):
-        line = string.rstrip(line)
-        if line[:10] == 'Database: ':
+        line = line.rstrip()
+        if line.startswith('Database: '):
             self._header.database = line[10:]
-	elif not line[-13:] == 'total letters':
-            self._header.database = self._header.database + string.strip(line)
+	elif not line.endswith('total letters'):
+            self._header.database = self._header.database + line.strip()
         else:
             sequences, letters =_re_search(
                 r"([0-9,]+) sequences; ([0-9,]+) total letters", line,
@@ -614,8 +612,8 @@ class _HeaderConsumer:
 
     def end_header(self):
         # Get rid of the trailing newlines
-        self._header.reference = string.rstrip(self._header.reference)
-        self._header.query = string.rstrip(self._header.query)
+        self._header.reference = self._header.reference.rstrip()
+        self._header.query = self._header.query.rstrip()
 
 class _DescriptionConsumer:
     def start_descriptions(self):
@@ -629,8 +627,8 @@ class _DescriptionConsumer:
         self.__has_n = 0   # Does the description line contain an N value?
 
     def description_header(self, line):
-        if line[:19] == 'Sequences producing':
-            cols = string.split(line)
+        if line.startswith('Sequences producing'):
+            cols = line.split()
             if cols[-1] == 'N':
                 self.__has_n = 1
     
@@ -656,9 +654,9 @@ class _DescriptionConsumer:
         pass
 
     def round(self, line):
-        if line[:18] != 'Results from round':
+        if not line.startswith('Results from round'):
             raise SyntaxError, "I didn't understand the round line\n%s" % line
-        self._roundnum = _safe_int(string.strip(line[18:]))
+        self._roundnum = _safe_int(line[18:])
 
     def end_descriptions(self):
         pass
@@ -674,23 +672,23 @@ class _DescriptionConsumer:
         #   - title must be preserved exactly (including whitespaces)
         #   - score could be equal to e-value (not likely, but what if??)
         #   - sometimes there's an "N" score of '1'.
-        cols = string.split(line)
+        cols = line.split()
         if len(cols) < 3:
             raise SyntaxError, \
                   "Line does not appear to contain description:\n%s" % line
         if self.__has_n:
-            i = string.rfind(line, cols[-1])        # find start of N
-            i = string.rfind(line, cols[-2], 0, i)  # find start of p-value
-            i = string.rfind(line, cols[-3], 0, i)  # find start of score
+            i = line.rfind(cols[-1])        # find start of N
+            i = line.rfind(cols[-2], 0, i)  # find start of p-value
+            i = line.rfind(cols[-3], 0, i)  # find start of score
         else:
-            i = string.rfind(line, cols[-1])        # find start of p-value
-            i = string.rfind(line, cols[-2], 0, i)  # find start of score
+            i = line.rfind(cols[-1])        # find start of p-value
+            i = line.rfind(cols[-2], 0, i)  # find start of score
         if self.__has_n:
             dh.title, dh.score, dh.e, dh.num_alignments = \
-                      string.rstrip(line[:i]), cols[-3], cols[-2], cols[-1]
+                      line[:i].rstrip(), cols[-3], cols[-2], cols[-1]
         else:
             dh.title, dh.score, dh.e, dh.num_alignments = \
-                      string.rstrip(line[:i]), cols[-2], cols[-1], 1
+                      line[:i].rstrip(), cols[-2], cols[-1], 1
         dh.num_alignments = _safe_int(dh.num_alignments)
         dh.score = _safe_int(dh.score)
         dh.e = _safe_float(dh.e)
@@ -706,52 +704,52 @@ class _AlignmentConsumer:
         self._multiple_alignment = Record.MultipleAlignment()
 
     def title(self, line):
-        self._alignment.title = self._alignment.title + string.lstrip(line)
+        self._alignment.title = "%s%s" % (self._alignment.title,
+                                           line.lstrip())
 
     def length(self, line):
-        self._alignment.length = string.split(line)[2]
+        self._alignment.length = line.split()[2]
         self._alignment.length = _safe_int(self._alignment.length)
 
     def multalign(self, line):
         # Standalone version uses 'QUERY', while WWW version uses blast_tmp.
-        if line[:5] == 'QUERY' or line[:9] == 'blast_tmp':
+        if line.startswith('QUERY') or line.startswith('blast_tmp'):
             # If this is the first line of the multiple alignment,
             # then I need to figure out how the line is formatted.
             
             # Format of line is:
             # QUERY 1   acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60
             try:
-                name, start, seq, end = string.split(line)
+                name, start, seq, end = line.split()
             except ValueError:
                 raise SyntaxError, "I do not understand the line\n%s" \
                       % line
-            self._start_index = string.index(line, start, len(name))
-            self._seq_index = string.index(line, seq,
-                                           self._start_index+len(start))
+            self._start_index = line.index(start, len(name))
+            self._seq_index = line.index(seq,
+                                         self._start_index+len(start))
             # subtract 1 for the space
             self._name_length = self._start_index - 1
             self._start_length = self._seq_index - self._start_index - 1
-            self._seq_length = string.rfind(line, end) - self._seq_index - 1
+            self._seq_length = line.rfind(end) - self._seq_index - 1
             
-            #self._seq_index = string.index(line, seq)
+            #self._seq_index = line.index(seq)
             ## subtract 1 for the space
-            #self._seq_length = string.rfind(line, end) - self._seq_index - 1
-            #self._start_index = string.index(line, start)
+            #self._seq_length = line.rfind(end) - self._seq_index - 1
+            #self._start_index = line.index(start)
             #self._start_length = self._seq_index - self._start_index - 1
             #self._name_length = self._start_index
 
         # Extract the information from the line
-        name = string.rstrip(line[:self._name_length])
-        start = string.rstrip(
-            line[self._start_index:self._start_index+self._start_length])
+        name = line[:self._name_length]
+        name = name.rstrip()
+        start = line[self._start_index:self._start_index+self._start_length]
+        start = start.rstrip()
         if start:
             start = _safe_int(start)
-        end = string.rstrip(
-            line[self._seq_index+self._seq_length:])
+        end = line[self._seq_index+self._seq_length:].rstrip()
         if end:
             end = _safe_int(end)
-        seq = string.rstrip(
-            line[self._seq_index:self._seq_index+self._seq_length])
+        seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip()
         # right pad the sequence with spaces if necessary
         if len(seq) < self._seq_length:
             seq = seq + ' '*(self._seq_length-len(seq))
@@ -826,7 +824,7 @@ class _AlignmentConsumer:
     def end_alignment(self):
         # Remove trailing newlines
         if self._alignment:
-            self._alignment.title = string.rstrip(self._alignment.title)
+            self._alignment.title = self._alignment.title.rstrip()
 
         # This code is also obsolete.  See note above.
         # If there's a multiple alignment, I will need to make sure
@@ -883,16 +881,16 @@ class _HSPConsumer:
             "I could not find the identities in line\n%s" % line)
         self._hsp.identities = _safe_int(x), _safe_int(y)
 
-        if string.find(line, 'Positives') >= 0:
+        if line.find('Positives') != -1:
             x, y = _re_search(
                 r"Positives = (\d+)\/(\d+)", line,
                 "I could not find the positives in line\n%s" % line)
             self._hsp.positives = _safe_int(x), _safe_int(y)
 
-        if string.find(line, 'Gaps') >= 0:
+        if line.find('Gaps') != -1:
             x, y = _re_search(
                 r"Gaps = (\d+)\/(\d+)", line,
-                "I could not find the positives in line\n%s" % line)
+                "I could not find the gaps in line\n%s" % line)
             self._hsp.gaps = _safe_int(x), _safe_int(y)
 
         
@@ -905,7 +903,7 @@ class _HSPConsumer:
         # Frame can be in formats:
         # Frame = +1
         # Frame = +2 / +2
-        if string.find(line, '/') >= 0:
+        if line.find('/') != -1:
             self._hsp.frame = _re_search(
                 r"Frame = ([-+][123]) / ([-+][123])", line,
                 "I could not find the frame in line\n%s" % line)
@@ -931,7 +929,7 @@ class _HSPConsumer:
         self._query_len = len(seq)
 
     def align(self, line):
-        seq = string.rstrip(line[self._query_start_index:])
+        seq = line[self._query_start_index:].rstrip()
         if len(seq) < self._query_len:
             # Make sure the alignment is the same length as the query
             seq = seq + ' ' * (self._query_len-len(seq))
@@ -948,7 +946,7 @@ class _HSPConsumer:
 	#On occasion, there is a blast hit with no subject match
 	#so far, it only occurs with 1-line short "matches"
 	#I have decided to let these pass as they appear
-	if not string.strip(seq):
+	if not seq.strip():
             seq = ' ' * self._query_len
         self._hsp.sbjct = self._hsp.sbjct + seq
         if self._hsp.sbjct_start is None:
@@ -976,8 +974,8 @@ class _DatabaseReportConsumer:
             self._dr.database_name.append(m.group(1))
         elif self._dr.database_name:
             # This must be a continuation of the previous name.
-            x = self._dr.database_name[-1] + string.strip(line)
-            self._dr.database_name[-1] = x
+            self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1],
+                                                   line.strip())
 
     def posted_date(self, line):
         self._dr.posted_date.append(_re_search(
@@ -995,14 +993,14 @@ class _DatabaseReportConsumer:
         self._dr.num_sequences_in_database.append(_safe_int(sequences))
 
     def ka_params(self, line):
-        x = string.split(line)
+        x = line.split()
         self._dr.ka_params = map(_safe_float, x)
 
     def gapped(self, line):
         self._dr.gapped = 1
 
     def ka_params_gap(self, line):
-        x = string.split(line)
+        x = line.split()
         self._dr.ka_params_gap = map(_safe_float, x)
 
     def end_database_report(self):
@@ -1013,7 +1011,7 @@ class _ParametersConsumer:
         self._params = Record.Parameters()
 
     def matrix(self, line):
-        self._params.matrix = string.rstrip(line[8:])
+        self._params.matrix = line[8:].rstrip()
 
     def gap_penalties(self, line):
         x = _get_cols(
@@ -1021,7 +1019,7 @@ class _ParametersConsumer:
         self._params.gap_penalties = map(_safe_float, x)
 
     def num_hits(self, line):
-        if string.find(line, '1st pass') >= 0:
+        if line.find('1st pass') != -1:
             x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"})
             self._params.num_hits = _safe_int(x)
         else:
@@ -1029,7 +1027,7 @@ class _ParametersConsumer:
             self._params.num_hits = _safe_int(x)
 
     def num_sequences(self, line):
-        if string.find(line, '1st pass') >= 0:
+        if line.find('1st pass') != -1:
             x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"})
             self._params.num_sequences = _safe_int(x)
         else:
@@ -1037,7 +1035,7 @@ class _ParametersConsumer:
             self._params.num_sequences = _safe_int(x)
 
     def num_extends(self, line):
-        if string.find(line, '1st pass') >= 0:
+        if line.find('1st pass') != -1:
             x, = _get_cols(line, (-4,), ncols=9, expected={2:"extensions:"})
             self._params.num_extends = _safe_int(x)
         else:
@@ -1045,7 +1043,7 @@ class _ParametersConsumer:
             self._params.num_extends = _safe_int(x)
 
     def num_good_extends(self, line):
-        if string.find(line, '1st pass') >= 0:
+        if line.find('1st pass') != -1:
             x, = _get_cols(line, (-4,), ncols=10, expected={3:"extensions:"})
             self._params.num_good_extends = _safe_int(x)
         else:
@@ -1297,8 +1295,13 @@ class Iterator:
         If set to None, then the raw contents of the file will be returned.
 
         """
-        if type(handle) is not FileType and type(handle) is not InstanceType:
-            raise ValueError, "I expected a file handle or file-like object"
+        try:
+            dummy = handle.readline
+        except AttributeError:
+            raise ValueError(
+                "I expected a file handle or file-like object, got %s"
+                % type(handle))
+        del dummy
         self._uhandle = File.UndoHandle(handle)
         self._parser = parser
 
@@ -1315,7 +1318,8 @@ class Iterator:
             if not line:
                 break
             # If I've reached the next one, then put the line back and stop.
-            if lines and (line[:5] == 'BLAST' or line[1:6] == 'BLAST'):
+            if lines and (line.startswith('BLAST')
+                          or line.startswith('BLAST', start = 1)):
                 self._uhandle.saveline(line)
                 break
             lines.append(line)
@@ -1323,7 +1327,7 @@ class Iterator:
         if not lines:
             return None
             
-        data = string.join(lines, '')
+        data = ''.join(lines)
         if self._parser is not None:
             return self._parser.parse(File.StringHandle(data))
         return data
@@ -1559,7 +1563,7 @@ def _re_search(regex, line, error_msg):
     return m.groups()
 
 def _get_cols(line, cols_to_get, ncols=None, expected={}):
-    cols = string.split(line)
+    cols = line.split()
 
     # Check to make sure number of columns is correct
     if ncols is not None and len(cols) != ncols:
@@ -1584,13 +1588,14 @@ def _safe_int(str):
     except ValueError:
         # Something went wrong.  Try to clean up the string.
         # Remove all commas from the string
-        str = string.replace(str, ',', '')
+        str = str.replace(',', '')
     try:
         # try again.
         return int(str)
     except ValueError:
         pass
     # If it fails again, maybe it's too long?
+    # XXX why converting to float?
     return long(float(str))
 
 def _safe_float(str):
@@ -1599,13 +1604,13 @@ def _safe_float(str):
     # we need to check the string for this condition.
     
     # Sometimes BLAST leaves of the '1' in front of an exponent.
-    if str[0] in ['E', 'e']:
+    if str and str[0] in ['E', 'e']:
         str = '1' + str
     try:
         return float(str)
     except ValueError:
         # Remove all commas from the string
-        str = string.replace(str, ',', '')
+        str = str.replace(',', '')
     # try again.
     return float(str)
 
@@ -1613,7 +1618,7 @@ class _BlastErrorConsumer(_BlastConsumer
     def __init__(self):
         _BlastConsumer.__init__(self)
     def noevent(self, line):
-        if line.find("Query must be at least wordsize") >= 0:
+        if line.find("Query must be at least wordsize") != -1:
             raise ShortQueryBlastError, "Query must be at least wordsize"
         # Now pass the line back up to the superclass.
         method = getattr(_BlastConsumer, 'noevent',
@@ -1687,7 +1692,7 @@ class BlastErrorParser(AbstractParser):
             # 'Searchingdone' instead of 'Searching......done' seems
             # to indicate a failure to perform the BLAST due to
             # low quality sequence
-            if line[:13] == 'Searchingdone':
+            if line.startswith('Searchingdone'):
                 raise LowQualityBlastError("Blast failure occured on query: ",
                                            data_record.query)
             line = handle.readline()