[Biopython-dev] Patch: data validation in Bio.Seq and Bio.MutableSeq

Yves Bastide Yves.Bastide at irisa.fr
Tue Oct 5 08:34:36 EDT 2004


The attached patch adds a third, optional parameter, "check", to the
constructors of Seq and MutableSeq. If it is set, the data is checked
against the alphabet.

Regards,

yves
-------------- next part --------------
Index: Bio/Seq.py
===================================================================
RCS file: /home/repository/biopython/biopython/Bio/Seq.py,v
retrieving revision 1.10
diff -u -p -r1.10 Seq.py
--- Bio/Seq.py	26 Aug 2004 13:22:58 -0000	1.10
+++ Bio/Seq.py	5 Oct 2004 12:28:10 -0000
@@ -4,12 +4,36 @@ import Alphabet
 from Alphabet import IUPAC
 from Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement
 
+def _check_data(data, alphabet):
+    """_check_data(data, alphabet) -> bool
+    
+    Check whether the data corresponds to the alphabet."""
+    if not data or not alphabet.letters:
+        return 1
+    if alphabet.size == 1:
+        import re
+        mo = re.match(r"^[%s]+$" % re.escape(alphabet.letters), data)
+        return mo is not None
+    else:
+        for ch in data:
+            if ch not in alphabet.letters:
+                return 0
+    return 1
+
 
 class Seq:
-    def __init__(self, data, alphabet = Alphabet.generic_alphabet):
+    """Seq(data[, alphabet[, check]]) -> Seq object.
+
+    Create a sequence. The alphabet describes how to interpret the
+    data's individual characters. By default, no checking is done on
+    the data's validity.
+    """
+    def __init__(self, data, alphabet=Alphabet.generic_alphabet, check=0):
         # Enforce string storage
         assert (type(data) == type("") or # must use a string
                 type(data) == type(u""))  # but can be a unicode string
+        if check and not _check_data(data, alphabet):
+            raise TypeError, "invalid data"
 
         self.data = data                           # Seq API requirement
         self.alphabet = alphabet                   # Seq API requirement
@@ -44,7 +68,7 @@ class Seq:
         elif other.alphabet.contains(self.alphabet):
             return self.__class__(self.data + other.data, other.alphabet)
         else:
-            raise TypeError, ("incompatable alphabets", str(self.alphabet),
+            raise TypeError, ("incompatible alphabets", str(self.alphabet),
                               str(other.alphabet))
     def __radd__(self, other):
         if self.alphabet.contains(other.alphabet):
@@ -52,7 +76,7 @@ class Seq:
         elif other.alphabet.contains(self.alphabet):
             return self.__class__(other.data + self.data, other.alphabet)
         else:
-            raise TypeError, ("incompatable alphabets", str(self.alphabet),
+            raise TypeError, ("incompatible alphabets", str(self.alphabet),
                               str(other.alphabet))
 
 
@@ -119,11 +143,13 @@ class Seq:
 
 
 class MutableSeq:
-    def __init__(self, data, alphabet = Alphabet.generic_alphabet):
-        if type(data) == type(""):
-            self.data = array.array("c", data)
+    def __init__(self, data, alphabet=Alphabet.generic_alphabet, check=0):
+        if check and not _check_data(data, alphabet):
+            raise TypeError, "invalid data"
+        if isinstance(data, array.array):
+            self.data = data
         else:
-            self.data = data   # assumes the input is an array
+            self.data = array.array("c", data) # assume convertibility
         self.alphabet = alphabet
     def __repr__(self):
         return "%s(%s, %s)" % (self.__class__.__name__,
@@ -176,7 +202,7 @@ class MutableSeq:
         elif other.alphabet.contains(self.alphabet):
             return self.__class__(self.data + other.data, other.alphabet)
         else:
-            raise TypeError, ("incompatable alphabets", str(self.alphabet),
+            raise TypeError, ("incompatible alphabets", str(self.alphabet),
                               str(other.alphabet))
     def __radd__(self, other):
         if self.alphabet.contains(other.alphabet):
@@ -184,7 +210,7 @@ class MutableSeq:
         elif other.alphabet.contains(self.alphabet):
             return self.__class__(other.data + self.data, other.alphabet)
         else:
-            raise TypeError, ("incompatable alphabets", str(self.alphabet),
+            raise TypeError, ("incompatible alphabets", str(self.alphabet),
                               str(other.alphabet))
 
     def append(self, c):


More information about the Biopython-dev mailing list