[Biopython-dev] Patch: data validation in Bio.Seq and Bio.MutableSeq
Yves Bastide
Yves.Bastide at irisa.fr
Tue Oct 5 08:34:36 EDT 2004
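The attached patch adds a third, optional parameter, "check", to the
constructors of Seq and MutableSeq. It defaults to 0 (no checking); if set,
the data is checked against the alphabet and a TypeError is raised when it
does not match.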
Regards,
yves
-------------- next part --------------
Index: Bio/Seq.py
===================================================================
RCS file: /home/repository/biopython/biopython/Bio/Seq.py,v
retrieving revision 1.10
diff -u -p -r1.10 Seq.py
--- Bio/Seq.py 26 Aug 2004 13:22:58 -0000 1.10
+++ Bio/Seq.py 5 Oct 2004 12:28:10 -0000
@@ -4,12 +4,36 @@ import Alphabet
from Alphabet import IUPAC
from Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement
+def _check_data(data, alphabet):
+ """_check_data(data, alphabet) -> bool
+
+ Check whether the data corresponds to the alphabet."""
+ if not data or not alphabet.letters:
+ return 1
+ if alphabet.size == 1:
+ import re
+ mo = re.match(r"^[%s]+$" % re.escape(alphabet.letters), data)
+ return mo is not None
+ else:
+ for ch in data:
+ if ch not in alphabet.letters:
+ return 0
+ return 1
+
class Seq:
- def __init__(self, data, alphabet = Alphabet.generic_alphabet):
+ """Seq(data[, alphabet, [check]]) -> Seq object.
+
+ Create a sequence. The alphabet describes how to interpret the
+ data's individual characters. By default, no checking is done on
+ the data's validity.
+ """
+ def __init__(self, data, alphabet=Alphabet.generic_alphabet, check=0):
# Enforce string storage
assert (type(data) == type("") or # must use a string
type(data) == type(u"")) # but can be a unicode string
+ if check and not _check_data(data, alphabet):
+ raise TypeError, "invalid data"
self.data = data # Seq API requirement
self.alphabet = alphabet # Seq API requirement
@@ -44,7 +68,7 @@ class Seq:
elif other.alphabet.contains(self.alphabet):
return self.__class__(self.data + other.data, other.alphabet)
else:
- raise TypeError, ("incompatable alphabets", str(self.alphabet),
+ raise TypeError, ("incompatible alphabets", str(self.alphabet),
str(other.alphabet))
def __radd__(self, other):
if self.alphabet.contains(other.alphabet):
@@ -52,7 +76,7 @@ class Seq:
elif other.alphabet.contains(self.alphabet):
return self.__class__(other.data + self.data, other.alphabet)
else:
- raise TypeError, ("incompatable alphabets", str(self.alphabet),
+ raise TypeError, ("incompatible alphabets", str(self.alphabet),
str(other.alphabet))
@@ -119,11 +143,13 @@ class Seq:
class MutableSeq:
- def __init__(self, data, alphabet = Alphabet.generic_alphabet):
- if type(data) == type(""):
- self.data = array.array("c", data)
+ def __init__(self, data, alphabet=Alphabet.generic_alphabet, check=0):
+ if check and not _check_data(data, alphabet):
+ raise TypeError, "invalid data"
+ if isinstance(data, array.array):
+ self.data = data
else:
- self.data = data # assumes the input is an array
+ self.data = array.array("c", data) # assume convertibility
self.alphabet = alphabet
def __repr__(self):
return "%s(%s, %s)" % (self.__class__.__name__,
@@ -176,7 +202,7 @@ class MutableSeq:
elif other.alphabet.contains(self.alphabet):
return self.__class__(self.data + other.data, other.alphabet)
else:
- raise TypeError, ("incompatable alphabets", str(self.alphabet),
+ raise TypeError, ("incompatible alphabets", str(self.alphabet),
str(other.alphabet))
def __radd__(self, other):
if self.alphabet.contains(other.alphabet):
@@ -184,7 +210,7 @@ class MutableSeq:
elif other.alphabet.contains(self.alphabet):
return self.__class__(other.data + self.data, other.alphabet)
else:
- raise TypeError, ("incompatable alphabets", str(self.alphabet),
+ raise TypeError, ("incompatible alphabets", str(self.alphabet),
str(other.alphabet))
def append(self, c):
More information about the Biopython-dev
mailing list