[Biopython-dev] [patch] NCBIStandalone.py cleanup
Jeffrey Chang
jchang at jeffchang.com
Fri Aug 8 15:31:46 EDT 2003
Great patch! I've applied it to the CVS repository.
Jeff
On Friday, August 8, 2003, at 03:09 AM, Yves Bastide wrote:
> Hi,
>
> Here's a patch bringing Bio.Blast.NCBIStandalone nearer to Python 3
> compatibility :-)
>
> Changelog:
> * Don't use the string module
> * use .find() == -1 (or != -1), not .find() < 0 (or >= 0)
> * use .startswith() and .endswith()
>
> Regards,
>
> yves
> Index: NCBIStandalone.py
> ===================================================================
> RCS file:
> /home/repository/biopython/biopython/Bio/Blast/NCBIStandalone.py,v
> retrieving revision 1.47
> diff -u -p -r1.47 NCBIStandalone.py
> --- NCBIStandalone.py 2003/06/09 02:12:09 1.47
> +++ NCBIStandalone.py 2003/08/08 10:04:35
> @@ -34,9 +34,7 @@ blastpgp Execute blastpgp.
> """
>
> import os
> -import string
> import re
> -from types import *
>
> from Bio import File
> from Bio.ParserSupport import *
> @@ -145,9 +143,9 @@ class _Scanner:
>
> while 1:
> line = safe_peekline(uhandle)
> - if line[:9] != 'Searching' and \
> - re.search(r"Score +E", line) is None and \
> - string.find(line, 'No hits found') < 0:
> + if (not line.startswith('Searching') and
> + re.search(r"Score +E", line) is None and
> + line.find('No hits found') == -1):
> break
>
> self._scan_descriptions(uhandle, consumer)
> @@ -197,7 +195,7 @@ class _Scanner:
> # Check for these error lines and ignore them for now. Let
> # the BlastErrorParser deal with them.
> line = uhandle.peekline()
> - if line.find("ERROR:") >= 0 or line.startswith("done"):
> + if line.find("ERROR:") != -1 or line.startswith("done"):
> read_and_call_while(uhandle, consumer.noevent,
> contains="ERROR:")
> read_and_call(uhandle, consumer.noevent, start="done")
>
> @@ -256,7 +254,7 @@ class _Scanner:
>
> # Read the descriptions and the following blank lines, making
> # sure that there are descriptions.
> - if uhandle.peekline()[:19] != 'Sequences not found':
> + if not uhandle.peekline().startswith('Sequences not found'):
> read_and_call_until(uhandle, consumer.description,
> blank=1)
> read_and_call_while(uhandle, consumer.noevent, blank=1)
>
> @@ -269,7 +267,7 @@ class _Scanner:
> # Read the descriptions and the following blank lines.
> read_and_call_while(uhandle, consumer.noevent, blank=1)
> l = safe_peekline(uhandle)
> - if l[:9] != 'CONVERGED' and l[0] != '>':
> + if not l.startswith('CONVERGED') and l[0] != '>':
> read_and_call_until(uhandle, consumer.description,
> blank=1)
> read_and_call_while(uhandle, consumer.noevent,
> blank=1)
>
> @@ -281,7 +279,7 @@ class _Scanner:
> def _scan_alignments(self, uhandle, consumer):
> # First, check to see if I'm at the database report.
> line = safe_peekline(uhandle)
> - if line[:10] == ' Database':
> + if line.startswith(' Database'):
> return
> elif line[0] == '>':
> # XXX make a better check here between pairwise and
> masterslave
> @@ -305,7 +303,7 @@ class _Scanner:
> # Scan a bunch of score/alignment pairs.
> while 1:
> line = safe_peekline(uhandle)
> - if line[:6] != ' Score':
> + if not line.startswith(' Score'):
> break
> self._scan_hsp(uhandle, consumer)
> consumer.end_alignment()
> @@ -318,7 +316,7 @@ class _Scanner:
> read_and_call(uhandle, consumer.title, start='>')
> while 1:
> line = safe_readline(uhandle)
> - if string.lstrip(line)[:8] == 'Length =':
> + if line.lstrip().startswith('Length ='):
> consumer.length(line)
> break
> elif is_blank_line(line):
> @@ -372,7 +370,7 @@ class _Scanner:
> read_and_call_while(uhandle, consumer.noevent, blank=1)
> line = safe_peekline(uhandle)
> # Alignment continues if I see a 'Query' or the spaces
> for Blastn.
> - if line[:5] != 'Query' and line[:5] != ' ':
> + if not (line.startswith('Query') or line.startswith('
> ')):
> break
>
> def _scan_masterslave_alignment(self, uhandle, consumer):
> @@ -382,10 +380,10 @@ class _Scanner:
> # Check to see whether I'm finished reading the alignment.
> # This is indicated by 1) database section, 2) next
> psi-blast round
> # patch by chapmanb
> - if line[:9] == 'Searching':
> + if line.startswith('Searching'):
> uhandle.saveline(line)
> break
> - elif line[:10] == ' Database':
> + elif line.startswith(' Database'):
> uhandle.saveline(line)
> break
> elif is_blank_line(line):
> @@ -423,7 +421,7 @@ class _Scanner:
>
> line = safe_readline(uhandle)
> uhandle.saveline(line)
> - if string.find(line, 'Lambda') >= 0:
> + if line.find('Lambda') != -1:
> break
>
> read_and_call(uhandle, consumer.noevent, start='Lambda')
> @@ -577,22 +575,22 @@ class _HeaderConsumer:
> self._header = Record.Header()
>
> def version(self, line):
> - c = string.split(line)
> + c = line.split()
> self._header.application = c[0]
> self._header.version = c[1]
> self._header.date = c[2][1:-1]
>
> def reference(self, line):
> - if line[:11] == 'Reference: ':
> + if line.startswith('Reference: '):
> self._header.reference = line[11:]
> else:
> self._header.reference = self._header.reference + line
>
> def query_info(self, line):
> - if line[:7] == 'Query= ':
> + if line.startswith('Query= '):
> self._header.query = line[7:]
> - elif line[:7] != ' ': # continuation of query_info
> - self._header.query = self._header.query + line
> + elif not line.startswith(' '): # continuation of
> query_info
> + self._header.query = "%s%s" % (self._header.query, line)
> else:
> letters, = _re_search(
> r"([0-9,]+) letters", line,
> @@ -600,11 +598,11 @@ class _HeaderConsumer:
> self._header.query_letters = _safe_int(letters)
>
> def database_info(self, line):
> - line = string.rstrip(line)
> - if line[:10] == 'Database: ':
> + line = line.rstrip()
> + if line.startswith('Database: '):
> self._header.database = line[10:]
> - elif not line[-13:] == 'total letters':
> - self._header.database = self._header.database +
> string.strip(line)
> + elif not line.endswith('total letters'):
> + self._header.database = self._header.database +
> line.strip()
> else:
> sequences, letters =_re_search(
> r"([0-9,]+) sequences; ([0-9,]+) total letters", line,
> @@ -614,8 +612,8 @@ class _HeaderConsumer:
>
> def end_header(self):
> # Get rid of the trailing newlines
> - self._header.reference = string.rstrip(self._header.reference)
> - self._header.query = string.rstrip(self._header.query)
> + self._header.reference = self._header.reference.rstrip()
> + self._header.query = self._header.query.rstrip()
>
> class _DescriptionConsumer:
> def start_descriptions(self):
> @@ -629,8 +627,8 @@ class _DescriptionConsumer:
> self.__has_n = 0 # Does the description line contain an N
> value?
>
> def description_header(self, line):
> - if line[:19] == 'Sequences producing':
> - cols = string.split(line)
> + if line.startswith('Sequences producing'):
> + cols = line.split()
> if cols[-1] == 'N':
> self.__has_n = 1
>
> @@ -656,9 +654,9 @@ class _DescriptionConsumer:
> pass
>
> def round(self, line):
> - if line[:18] != 'Results from round':
> + if not line.startswith('Results from round'):
> raise SyntaxError, "I didn't understand the round
> line\n%s" % line
> - self._roundnum = _safe_int(string.strip(line[18:]))
> + self._roundnum = _safe_int(line[18:])
>
> def end_descriptions(self):
> pass
> @@ -674,23 +672,23 @@ class _DescriptionConsumer:
> # - title must be preserved exactly (including whitespaces)
> # - score could be equal to e-value (not likely, but what
> if??)
> # - sometimes there's an "N" score of '1'.
> - cols = string.split(line)
> + cols = line.split()
> if len(cols) < 3:
> raise SyntaxError, \
> "Line does not appear to contain description:\n%s"
> % line
> if self.__has_n:
> - i = string.rfind(line, cols[-1]) # find start of N
> - i = string.rfind(line, cols[-2], 0, i) # find start of
> p-value
> - i = string.rfind(line, cols[-3], 0, i) # find start of
> score
> + i = line.rfind(cols[-1]) # find start of N
> + i = line.rfind(cols[-2], 0, i) # find start of p-value
> + i = line.rfind(cols[-3], 0, i) # find start of score
> else:
> - i = string.rfind(line, cols[-1]) # find start of
> p-value
> - i = string.rfind(line, cols[-2], 0, i) # find start of
> score
> + i = line.rfind(cols[-1]) # find start of p-value
> + i = line.rfind(cols[-2], 0, i) # find start of score
> if self.__has_n:
> dh.title, dh.score, dh.e, dh.num_alignments = \
> - string.rstrip(line[:i]), cols[-3], cols[-2],
> cols[-1]
> + line[:i].rstrip(), cols[-3], cols[-2], cols[-1]
> else:
> dh.title, dh.score, dh.e, dh.num_alignments = \
> - string.rstrip(line[:i]), cols[-2], cols[-1], 1
> + line[:i].rstrip(), cols[-2], cols[-1], 1
> dh.num_alignments = _safe_int(dh.num_alignments)
> dh.score = _safe_int(dh.score)
> dh.e = _safe_float(dh.e)
> @@ -706,52 +704,52 @@ class _AlignmentConsumer:
> self._multiple_alignment = Record.MultipleAlignment()
>
> def title(self, line):
> - self._alignment.title = self._alignment.title +
> string.lstrip(line)
> + self._alignment.title = "%s%s" % (self._alignment.title,
> + line.lstrip())
>
> def length(self, line):
> - self._alignment.length = string.split(line)[2]
> + self._alignment.length = line.split()[2]
> self._alignment.length = _safe_int(self._alignment.length)
>
> def multalign(self, line):
> # Standalone version uses 'QUERY', while WWW version uses
> blast_tmp.
> - if line[:5] == 'QUERY' or line[:9] == 'blast_tmp':
> + if line.startswith('QUERY') or line.startswith('blast_tmp'):
> # If this is the first line of the multiple alignment,
> # then I need to figure out how the line is formatted.
>
> # Format of line is:
> # QUERY 1
> acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60
> try:
> - name, start, seq, end = string.split(line)
> + name, start, seq, end = line.split()
> except ValueError:
> raise SyntaxError, "I do not understand the line\n%s"
> \
> % line
> - self._start_index = string.index(line, start, len(name))
> - self._seq_index = string.index(line, seq,
> -
> self._start_index+len(start))
> + self._start_index = line.index(start, len(name))
> + self._seq_index = line.index(seq,
> + self._start_index+len(start))
> # subtract 1 for the space
> self._name_length = self._start_index - 1
> self._start_length = self._seq_index - self._start_index
> - 1
> - self._seq_length = string.rfind(line, end) -
> self._seq_index - 1
> + self._seq_length = line.rfind(end) - self._seq_index - 1
>
> - #self._seq_index = string.index(line, seq)
> + #self._seq_index = line.index(seq)
> ## subtract 1 for the space
> - #self._seq_length = string.rfind(line, end) -
> self._seq_index - 1
> - #self._start_index = string.index(line, start)
> + #self._seq_length = line.rfind(end) - self._seq_index - 1
> + #self._start_index = line.index(start)
> #self._start_length = self._seq_index - self._start_index
> - 1
> #self._name_length = self._start_index
>
> # Extract the information from the line
> - name = string.rstrip(line[:self._name_length])
> - start = string.rstrip(
> -
> line[self._start_index:self._start_index+self._start_length])
> + name = line[:self._name_length]
> + name = name.rstrip()
> + start =
> line[self._start_index:self._start_index+self._start_length]
> + start = start.rstrip()
> if start:
> start = _safe_int(start)
> - end = string.rstrip(
> - line[self._seq_index+self._seq_length:])
> + end = line[self._seq_index+self._seq_length:].rstrip()
> if end:
> end = _safe_int(end)
> - seq = string.rstrip(
> - line[self._seq_index:self._seq_index+self._seq_length])
> + seq =
> line[self._seq_index:self._seq_index+self._seq_length].rstrip()
> # right pad the sequence with spaces if necessary
> if len(seq) < self._seq_length:
> seq = seq + ' '*(self._seq_length-len(seq))
> @@ -826,7 +824,7 @@ class _AlignmentConsumer:
> def end_alignment(self):
> # Remove trailing newlines
> if self._alignment:
> - self._alignment.title =
> string.rstrip(self._alignment.title)
> + self._alignment.title = self._alignment.title.rstrip()
>
> # This code is also obsolete. See note above.
> # If there's a multiple alignment, I will need to make sure
> @@ -883,16 +881,16 @@ class _HSPConsumer:
> "I could not find the identities in line\n%s" % line)
> self._hsp.identities = _safe_int(x), _safe_int(y)
>
> - if string.find(line, 'Positives') >= 0:
> + if line.find('Positives') != -1:
> x, y = _re_search(
> r"Positives = (\d+)\/(\d+)", line,
> "I could not find the positives in line\n%s" % line)
> self._hsp.positives = _safe_int(x), _safe_int(y)
>
> - if string.find(line, 'Gaps') >= 0:
> + if line.find('Gaps') != -1:
> x, y = _re_search(
> r"Gaps = (\d+)\/(\d+)", line,
> - "I could not find the positives in line\n%s" % line)
> + "I could not find the gaps in line\n%s" % line)
> self._hsp.gaps = _safe_int(x), _safe_int(y)
>
>
> @@ -905,7 +903,7 @@ class _HSPConsumer:
> # Frame can be in formats:
> # Frame = +1
> # Frame = +2 / +2
> - if string.find(line, '/') >= 0:
> + if line.find('/') != -1:
> self._hsp.frame = _re_search(
> r"Frame = ([-+][123]) / ([-+][123])", line,
> "I could not find the frame in line\n%s" % line)
> @@ -931,7 +929,7 @@ class _HSPConsumer:
> self._query_len = len(seq)
>
> def align(self, line):
> - seq = string.rstrip(line[self._query_start_index:])
> + seq = line[self._query_start_index:].rstrip()
> if len(seq) < self._query_len:
> # Make sure the alignment is the same length as the query
> seq = seq + ' ' * (self._query_len-len(seq))
> @@ -948,7 +946,7 @@ class _HSPConsumer:
> #On occasion, there is a blast hit with no subject match
> #so far, it only occurs with 1-line short "matches"
> #I have decided to let these pass as they appear
> - if not string.strip(seq):
> + if not seq.strip():
> seq = ' ' * self._query_len
> self._hsp.sbjct = self._hsp.sbjct + seq
> if self._hsp.sbjct_start is None:
> @@ -976,8 +974,8 @@ class _DatabaseReportConsumer:
> self._dr.database_name.append(m.group(1))
> elif self._dr.database_name:
> # This must be a continuation of the previous name.
> - x = self._dr.database_name[-1] + string.strip(line)
> - self._dr.database_name[-1] = x
> + self._dr.database_name[-1] = "%s%s" %
> (self._dr.database_name[-1],
> + line.strip())
>
> def posted_date(self, line):
> self._dr.posted_date.append(_re_search(
> @@ -995,14 +993,14 @@ class _DatabaseReportConsumer:
>
> self._dr.num_sequences_in_database.append(_safe_int(sequences))
>
> def ka_params(self, line):
> - x = string.split(line)
> + x = line.split()
> self._dr.ka_params = map(_safe_float, x)
>
> def gapped(self, line):
> self._dr.gapped = 1
>
> def ka_params_gap(self, line):
> - x = string.split(line)
> + x = line.split()
> self._dr.ka_params_gap = map(_safe_float, x)
>
> def end_database_report(self):
> @@ -1013,7 +1011,7 @@ class _ParametersConsumer:
> self._params = Record.Parameters()
>
> def matrix(self, line):
> - self._params.matrix = string.rstrip(line[8:])
> + self._params.matrix = line[8:].rstrip()
>
> def gap_penalties(self, line):
> x = _get_cols(
> @@ -1021,7 +1019,7 @@ class _ParametersConsumer:
> self._params.gap_penalties = map(_safe_float, x)
>
> def num_hits(self, line):
> - if string.find(line, '1st pass') >= 0:
> + if line.find('1st pass') != -1:
> x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"})
> self._params.num_hits = _safe_int(x)
> else:
> @@ -1029,7 +1027,7 @@ class _ParametersConsumer:
> self._params.num_hits = _safe_int(x)
>
> def num_sequences(self, line):
> - if string.find(line, '1st pass') >= 0:
> + if line.find('1st pass') != -1:
> x, = _get_cols(line, (-4,), ncols=9,
> expected={2:"Sequences:"})
> self._params.num_sequences = _safe_int(x)
> else:
> @@ -1037,7 +1035,7 @@ class _ParametersConsumer:
> self._params.num_sequences = _safe_int(x)
>
> def num_extends(self, line):
> - if string.find(line, '1st pass') >= 0:
> + if line.find('1st pass') != -1:
> x, = _get_cols(line, (-4,), ncols=9,
> expected={2:"extensions:"})
> self._params.num_extends = _safe_int(x)
> else:
> @@ -1045,7 +1043,7 @@ class _ParametersConsumer:
> self._params.num_extends = _safe_int(x)
>
> def num_good_extends(self, line):
> - if string.find(line, '1st pass') >= 0:
> + if line.find('1st pass') != -1:
> x, = _get_cols(line, (-4,), ncols=10,
> expected={3:"extensions:"})
> self._params.num_good_extends = _safe_int(x)
> else:
> @@ -1297,8 +1295,13 @@ class Iterator:
> If set to None, then the raw contents of the file will be
> returned.
>
> """
> - if type(handle) is not FileType and type(handle) is not
> InstanceType:
> - raise ValueError, "I expected a file handle or file-like
> object"
> + try:
> + dummy = handle.readline
> + except AttributeError:
> + raise ValueError(
> + "I expected a file handle or file-like object, got %s"
> + % type(handle))
> + del dummy
> self._uhandle = File.UndoHandle(handle)
> self._parser = parser
>
> @@ -1315,7 +1318,8 @@ class Iterator:
> if not line:
> break
> # If I've reached the next one, then put the line back
> and stop.
> - if lines and (line[:5] == 'BLAST' or line[1:6] ==
> 'BLAST'):
> + if lines and (line.startswith('BLAST')
> + or line.startswith('BLAST', start = 1)):
> self._uhandle.saveline(line)
> break
> lines.append(line)
> @@ -1323,7 +1327,7 @@ class Iterator:
> if not lines:
> return None
>
> - data = string.join(lines, '')
> + data = ''.join(lines)
> if self._parser is not None:
> return self._parser.parse(File.StringHandle(data))
> return data
> @@ -1559,7 +1563,7 @@ def _re_search(regex, line, error_msg):
> return m.groups()
>
> def _get_cols(line, cols_to_get, ncols=None, expected={}):
> - cols = string.split(line)
> + cols = line.split()
>
> # Check to make sure number of columns is correct
> if ncols is not None and len(cols) != ncols:
> @@ -1584,13 +1588,14 @@ def _safe_int(str):
> except ValueError:
> # Something went wrong. Try to clean up the string.
> # Remove all commas from the string
> - str = string.replace(str, ',', '')
> + str = str.replace(',', '')
> try:
> # try again.
> return int(str)
> except ValueError:
> pass
> # If it fails again, maybe it's too long?
> + # XXX why converting to float?
> return long(float(str))
>
> def _safe_float(str):
> @@ -1599,13 +1604,13 @@ def _safe_float(str):
> # we need to check the string for this condition.
>
> # Sometimes BLAST leaves of the '1' in front of an exponent.
> - if str[0] in ['E', 'e']:
> + if str and str[0] in ['E', 'e']:
> str = '1' + str
> try:
> return float(str)
> except ValueError:
> # Remove all commas from the string
> - str = string.replace(str, ',', '')
> + str = str.replace(',', '')
> # try again.
> return float(str)
>
> @@ -1613,7 +1618,7 @@ class _BlastErrorConsumer(_BlastConsumer
> def __init__(self):
> _BlastConsumer.__init__(self)
> def noevent(self, line):
> - if line.find("Query must be at least wordsize") >= 0:
> + if line.find("Query must be at least wordsize") != -1:
> raise ShortQueryBlastError, "Query must be at least
> wordsize"
> # Now pass the line back up to the superclass.
> method = getattr(_BlastConsumer, 'noevent',
> @@ -1687,7 +1692,7 @@ class BlastErrorParser(AbstractParser):
> # 'Searchingdone' instead of 'Searching......done' seems
> # to indicate a failure to perform the BLAST due to
> # low quality sequence
> - if line[:13] == 'Searchingdone':
> + if line.startswith('Searchingdone'):
> raise LowQualityBlastError("Blast failure occured on
> query: ",
> data_record.query)
> line = handle.readline()
> _______________________________________________
> Biopython-dev mailing list
> Biopython-dev at biopython.org
> http://biopython.org/mailman/listinfo/biopython-dev
More information about the Biopython-dev mailing list