[BioPython] LCC function coded.

Estienne Swart estienne@sanbi.ac.za
Thu, 05 Dec 2002 12:16:01 +0200


This is a multi-part message in MIME format.
--------------080008040609010108000501
Content-Type: multipart/alternative;
 boundary="------------020203060801040105020409"


--------------020203060801040105020409
Content-Type: text/plain; charset=us-ascii; format=flowed
Content-Transfer-Encoding: 7bit

Rune Linding - EMBL Biocomputing Unit wrote:

>hi
>can i have a look at the code,iam planning to do this for
>proteins...
>
I've written a little masking script (attached) that handles protein/dna 
(or whatever other alphabet you choose). It calculates complexity scores 
over sliding windows, and does some crude masking.

I've been meaning to do this for some time now (I wanted to see exactly 
how masking works, and what parameters can be twiddled for 'optimal' 
masking). So, seeing the LCC postings stimulated me to get moving!

I've thrown together this little script, which is by no means lightning 
fast (I resorted to regexp's etc.), but works the way it should (judge 
for yourself). Low-complexity calculations/concepts were nabbed from 
Mount's Bioinformatics (I still have to get the original research papers 
for SEG/DUST, to see how they implement things).

Biopythoners: I'd welcome any insights as to how I can inherit properly 
from Biopython's seq class etc. ;)


>
>:)
>r.
>On Mon, Dec 02, 2002 at 08:25:09AM -0300, Sebastian Bassi wrote:
>
>>Hi Iddo,
>>
>>I've just coded 2 functions to calculate LCC.
>>lcc_mult and lcc_simp. Both calculates LCC (Local composition content),
>>lcc_mult is optimized for speed in several runs. It returns a list of
>>LCC values.
>>lcc_simp is for 1 LCC value (you can get more calling the function using
>>a for loop). If you need more LCC values than the size of the LCC
>>window, is recomended lcc_mult.
>>lcc_mult calculates in forehand all logs to save time. It also calculate
>>the "next" window lcc by comparing the next nucleotide on the window
>>(head) and the nucleotide that leaves the window (tail), if they are the
>>same, the LCC is the same so it is not calculated.
>>
>>lcc_mult(seq,wsize,start,end):
>>seq: String of the sequence.
>>wsize: Windows size
>>start: Where to start "walking".
>>end: Where to end "walking"
>>it returns a list, all elements float values.
>>
>>lcc_simp(seq,start,end):
>>seq: String of the sequence.
>>start: Where to start calculating LCC.
>>end: Where to end calculating LCC.
>>it returns one float value.
>>
>>I did an informal benchmark and, for a large list of LCC values, 
>>lcc_simp took 54 secs. and lcc_mult took just 12.
>>
>>I don't know how to make a module, so I give the code here for you to
>>adapt it according the BioPython standarts. I hopw it make it to the
>>BioPython.
>>
>>-- 
>>Best regards,
>>
>>//=\ Sebastian Bassi - Diplomado en Ciencia y Tecnologia, UNQ   //=\
>>\=// IT Manager Advanta Seeds - Balcarce Research Center -      \=//
>>//=\ Pro secretario ASALUP - www.asalup.org - PGP key available //=\
>>\=// E-mail: sbassi@genesdigitales.com - ICQ UIN: 3356556 -     \=//
>>
>>               Linux para todos: http://Linuxfacil.info
>>
>
>># idem version lcc3f2, pero el sliding windows hace que no haga que contar
>># todas las ocurrencias de las letras, solo sumar las nuevas.
>>
>>from Bio import Fasta
>>from string import count
>>import math
>>import time
>>
>>
>>parser=Fasta.RecordParser()
>>entrada=open("D:\\projects\\tesis\\fna-1small.nn","r")
>>cur_record=1
>>iterator=Fasta.Iterator(entrada,parser)
>>crom=0
>>compone=[0]
>>lccsal=[0]
>>initime=time.time()
>>
>>def lcc_mult(seq,wsize,start,end):
>>    """Return LCC, a complexity measure from a sequence, called seq."""
>>    l2=math.log(2)
>>    tamseq=end-start
>>    global compone
>>    global lccsal
>>    for i in range(wsize):
>>        compone.append(((i+1)/float(wsize))*((math.log((i+1)/float(wsize)))/l2))
>>    window=seq[0:wsize]
>>    cant_a=count(window,'A')
>>    cant_c=count(window,'C')
>>    cant_t=count(window,'T')
>>    cant_g=count(window,'G')
>>    term_a=compone[cant_a]
>>    term_c=compone[cant_c]
>>    term_t=compone[cant_t]
>>    term_g=compone[cant_g]
>>    lccsal[0]=(-(term_a+term_c+term_t+term_g))
>>    tail=seq[0]
>>    for x in range (tamseq-wsize):
>>        window=seq[x+1:wsize+x+1]
>>        if tail==window[-1]:
>>            lccsal.append(lccsal[-1])
>>            #break
>>        else:
>>            if tail=='A':
>>                cant_a=cant_a-1
>>                if window[-1]=='C':
>>                    cant_c=cant_c+1
>>                    term_a=compone[cant_a]
>>                    term_c=compone[cant_c]
>>                    lccsal.append(-(term_a+term_c+term_t+term_g))
>>                elif window[-1]=='T':
>>                    cant_t=cant_t+1
>>                    term_a=compone[cant_a]
>>                    term_t=compone[cant_t]
>>                    lccsal.append(-(term_a+term_c+term_t+term_g))
>>                elif window[-1]=='G':
>>                    cant_g=cant_g+1
>>                    term_a=compone[cant_a]
>>                    term_g=compone[cant_g]
>>                    lccsal.append(-(term_a+term_c+term_t+term_g))
>>            else:
>>                if tail=='C':
>>                    cant_c=cant_c-1
>>                    if window[-1]=='A':
>>                        cant_a=cant_a+1
>>                        term_a=compone[cant_a]
>>                        term_c=compone[cant_c]
>>                        lccsal.append(-(term_a+term_c+term_t+term_g))
>>                    elif window[-1]=='T':
>>                        cant_t=cant_t+1
>>                        term_c=compone[cant_c]
>>                        term_t=compone[cant_t]
>>                        lccsal.append(-(term_a+term_c+term_t+term_g))
>>                    elif window[-1]=='G':
>>                        cant_g=cant_g+1
>>                        term_c=compone[cant_c]
>>                        term_g=compone[cant_g]
>>                        lccsal.append(-(term_a+term_c+term_t+term_g))
>>                
>>                else:
>>                    if tail=='T':
>>                        cant_t=cant_t-1
>>                        if window[-1]=='A':
>>                            cant_a=cant_a+1
>>                            term_a=compone[cant_a]
>>                            term_t=compone[cant_t]
>>                            lccsal.append(-(term_a+term_c+term_t+term_g))
>>                        elif window[-1]=='C':
>>                            cant_c=cant_c+1
>>                            term_c=compone[cant_c]
>>                            term_t=compone[cant_t]
>>                            lccsal.append(-(term_a+term_c+term_t+term_g))
>>                        elif window[-1]=='G':
>>                            cant_g=cant_g+1
>>                            term_t=compone[cant_t]
>>                            term_g=compone[cant_g]
>>                            lccsal.append(-(term_a+term_c+term_t+term_g))
>>                    else:
>>                        if tail=='G':
>>                            cant_g=cant_g-1
>>                            if window[-1]=='A':
>>                                cant_a=cant_a+1
>>                                term_a=compone[cant_a]
>>                                term_g=compone[cant_g]
>>                                lccsal.append(-(term_a+term_c+term_t+term_g))
>>                            elif window[-1]=='C':
>>                                cant_c=cant_c+1
>>                                term_c=compone[cant_c]
>>                                term_g=compone[cant_g]
>>                                lccsal.append(-(term_a+term_c+term_t+term_g))
>>                            elif window[-1]=='T':
>>                                cant_t=cant_t+1
>>                                term_t=compone[cant_t]
>>                                term_g=compone[cant_g]
>>                                lccsal.append(-(term_a+term_c+term_t+term_g))
>>        tail=window[0]
>>    return lccsal
>>
>>def lcc_simp(seq,start,end):
>>    """Return LCC, a complexity measure from a sequence (seq.)"""
>>    wsize=end-start
>>    l2=math.log(2)
>>    window=seq[start:end]
>>    term_a=((count(window,'A'))/float(wsize))*((math.log((count(window,'A'))/float(wsize)))/l2)
>>    term_c=((count(window,'C'))/float(wsize))*((math.log((count(window,'C'))/float(wsize)))/l2)
>>    term_t=((count(window,'T'))/float(wsize))*((math.log((count(window,'T'))/float(wsize)))/l2)
>>    term_a=((count(window,'G'))/float(wsize))*((math.log((count(window,'G'))/float(wsize)))/l2)
>>    lccsal=-(term_a+term_c+term_t+term_g)
>>    return lccsal
>>
>>while cur_record:
>>    cur_record=iterator.next()
>>    crom=crom+1
>>    if cur_record is None:
>>        break
>>    salida=open("C:\\bioinfo-adv\\blast\\data\\test\\smallcsw-"+str(crom)+".txt","w")
>>    
>>    lcc=lcc_mult(cur_record.sequence,17,0,len(cur_record.sequence))
>>    for x in range(len(lcc)):
>>        salida.write(str("%.5f" %lcc[x])+'\n')
>>        
>>    salida.close()
>>          
>>entrada.close()
>>
>
>--
>Rune Linding				room	v101
>EMBL - Biocomputing - Gibson Team	lab	+49 6221 387451
>Meyerhofstrasse 1			fax	+49 6221 387517
>D-69117 Heidelberg			cell	+49 1712 128951
>Germany					http://www.EMBL-Heidelberg.DE/~linding
>Of all the animals, the boy is the most unmanageable.
>		-- Plato
>_______________________________________________
>BioPython mailing list  -  BioPython@biopython.org
>http://biopython.org/mailman/listinfo/biopython
>


-- 
Estienne Swart
SANBI, UWC Private Bag X17, Bellville 7535
estienne@sanbi.ac.za
tel work: +27 21 959 3908
tel home: +27 21 448 8118
fax work: +27 21 959 2512



--------------020203060801040105020409
Content-Type: text/html; charset=us-ascii
Content-Transfer-Encoding: 7bit

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
  <title></title>
</head>
<body>
Rune Linding - EMBL Biocomputing Unit wrote:<br>
<blockquote type="cite"
 cite="mid20021202190736.GC40592@EMBL-Heidelberg.DE">
  <pre wrap="">hi<br>can i have a look at the code,iam planning to do this for<br>proteins...<br></pre>
</blockquote>
I've written a little masking script (attached) that handles protein/dna
(or whatever other alphabet you choose). It calculates complexity scores
over sliding windows, and does some crude masking.<br>
<br>
I've been meaning to do this for some time now (I wanted to see exactly how
masking works, and what parameters can be twiddled for 'optimal' masking).
So, seeing the LCC postings stimulated me to get moving!<br>
<br>
I've thrown together this little script, which is by no means lightning fast
(I resorted to regexp's etc.), but works the way it should (judge for yourself).
Low-complexity calculations/concepts were nabbed from Mount's Bioinformatics
(I still have to get the original research papers for SEG/DUST, to see how
they implement things).<br>
<br>
Biopythoners: I'd welcome any insights as to how I can inherit properly from
Biopython's seq class etc. ;)<br>
<br>
<br>
<blockquote type="cite"
 cite="mid20021202190736.GC40592@EMBL-Heidelberg.DE">
  <pre wrap=""><br>:)<br>r.<br>On Mon, Dec 02, 2002 at 08:25:09AM -0300, Sebastian Bassi wrote:<br></pre>
  <blockquote type="cite">
    <pre wrap="">Hi Iddo,<br><br>I've just coded 2 functions to calculate LCC.<br>lcc_mult and lcc_simp. Both calculates LCC (Local composition content),<br>lcc_mult is optimized for speed in several runs. It returns a list of<br>LCC values.<br>lcc_simp is for 1 LCC value (you can get more calling the function using<br>a for loop). If you need more LCC values than the size of the LCC<br>window, is recomended lcc_mult.<br>lcc_mult calculates in forehand all logs to save time. It also calculate<br>the "next" window lcc by comparing the next nucleotide on the window<br>(head) and the nucleotide that leaves the window (tail), if they are the<br>same, the LCC is the same so it is not calculated.<br><br>lcc_mult(seq,wsize,start,end):<br>seq: String of the sequence.<br>wsize: Windows size<br>start: Where to start "walking".<br>end: Where to end "walking"<br>it returns a list, all elements float values.<br><br>lcc_simp(seq,start,end):<br>seq: String of the sequence.<br>start: Where t
o start calculating LCC.<br>end: Where to end calculating LCC.<br>it returns one float value.<br><br>I did an informal benchmark and, for a large list of LCC values, <br>lcc_simp took 54 secs. and lcc_mult took just 12.<br><br>I don't know how to make a module, so I give the code here for you to<br>adapt it according the BioPython standarts. I hopw it make it to the<br>BioPython.<br><br>-- <br>Best regards,<br><br>//=\ Sebastian Bassi - Diplomado en Ciencia y Tecnologia, UNQ   //=\<br>\=// IT Manager Advanta Seeds - Balcarce Research Center -      \=//<br>//=\ Pro secretario ASALUP - <a class="moz-txt-link-abbreviated" href="http://www.asalup.org">www.asalup.org</a> - PGP key available //=\<br>\=// E-mail: <a class="moz-txt-link-abbreviated" href="mailto:sbassi@genesdigitales.com">sbassi@genesdigitales.com</a> - ICQ UIN: 3356556 -     \=//<br><br>               Linux para todos: <a class="moz-txt-link-freetext" href="http://Linuxfacil.info">http://Linuxfacil.info</a><br><br><
/pre>
  </blockquote>
  <pre wrap=""><!----><br></pre>
  <blockquote type="cite">
    <pre wrap=""># idem version lcc3f2, pero el sliding windows hace que no haga que contar<br># todas las ocurrencias de las letras, solo sumar las nuevas.<br><br>from Bio import Fasta<br>from string import count<br>import math<br>import time<br><br><br>parser=Fasta.RecordParser()<br>entrada=open("D:\\projects\\tesis\\fna-1small.nn","r")<br>cur_record=1<br>iterator=Fasta.Iterator(entrada,parser)<br>crom=0<br>compone=[0]<br>lccsal=[0]<br>initime=time.time()<br><br>def lcc_mult(seq,wsize,start,end):<br>    """Return LCC, a complexity measure from a sequence, called seq."""<br>    l2=math.log(2)<br>    tamseq=end-start<br>    global compone<br>    global lccsal<br>    for i in range(wsize):<br>        compone.append(((i+1)/float(wsize))*((math.log((i+1)/float(wsize)))/l2))<br>    window=seq[0:wsize]<br>    cant_a=count(window,'A')<br>    cant_c=count(window,'C')<br>    cant_t=count(window,'T')<br>    cant_g=count(window,'G')<br>    term_a=compone[cant_a]<br>    term_c=compone[c
ant_c]<br>    term_t=compone[cant_t]<br>    term_g=compone[cant_g]<br>    lccsal[0]=(-(term_a+term_c+term_t+term_g))<br>    tail=seq[0]<br>    for x in range (tamseq-wsize):<br>        window=seq[x+1:wsize+x+1]<br>        if tail==window[-1]:<br>            lccsal.append(lccsal[-1])<br>            #break<br>        else:<br>            if tail=='A':<br>                cant_a=cant_a-1<br>                if window[-1]=='C':<br>                    cant_c=cant_c+1<br>                    term_a=compone[cant_a]<br>                    term_c=compone[cant_c]<br>                    lccsal.append(-(term_a+term_c+term_t+term_g))<br>                elif window[-1]=='T':<br>                    cant_t=cant_t+1<br>                    term_a=compone[cant_a]<br>                    term_t=compone[cant_t]<br>                    lccsal.append(-(term_a+term_c+term_t+term_g))<br>                elif window[-1]=='G':<br>                    cant_g=cant_g+1<br>                    term_a=compone[cant_
a]<br>                    term_g=compone[cant_g]<br>                    lccsal.append(-(term_a+term_c+term_t+term_g))<br>            else:<br>                if tail=='C':<br>                    cant_c=cant_c-1<br>                    if window[-1]=='A':<br>                        cant_a=cant_a+1<br>                        term_a=compone[cant_a]<br>                        term_c=compone[cant_c]<br>                        lccsal.append(-(term_a+term_c+term_t+term_g))<br>                    elif window[-1]=='T':<br>                        cant_t=cant_t+1<br>                        term_c=compone[cant_c]<br>                        term_t=compone[cant_t]<br>                        lccsal.append(-(term_a+term_c+term_t+term_g))<br>                    elif window[-1]=='G':<br>                        cant_g=cant_g+1<br>                        term_c=compone[cant_c]<br>                        term_g=compone[cant_g]<br>                        lccsal.append(-(term_a+term_c+term_t+term_g)
)<br>                <br>                else:<br>                    if tail=='T':<br>                        cant_t=cant_t-1<br>                        if window[-1]=='A':<br>                            cant_a=cant_a+1<br>                            term_a=compone[cant_a]<br>                            term_t=compone[cant_t]<br>                            lccsal.append(-(term_a+term_c+term_t+term_g))<br>                        elif window[-1]=='C':<br>                            cant_c=cant_c+1<br>                            term_c=compone[cant_c]<br>                            term_t=compone[cant_t]<br>                            lccsal.append(-(term_a+term_c+term_t+term_g))<br>                        elif window[-1]=='G':<br>                            cant_g=cant_g+1<br>                            term_t=compone[cant_t]<br>                            term_g=compone[cant_g]<br>                            lccsal.append(-(term_a+term_c+term_t+term_g))<br>                   
 else:<br>                        if tail=='G':<br>                            cant_g=cant_g-1<br>                            if window[-1]=='A':<br>                                cant_a=cant_a+1<br>                                term_a=compone[cant_a]<br>                                term_g=compone[cant_g]<br>                                lccsal.append(-(term_a+term_c+term_t+term_g))<br>                            elif window[-1]=='C':<br>                                cant_c=cant_c+1<br>                                term_c=compone[cant_c]<br>                                term_g=compone[cant_g]<br>                                lccsal.append(-(term_a+term_c+term_t+term_g))<br>                            elif window[-1]=='T':<br>                                cant_t=cant_t+1<br>                                term_t=compone[cant_t]<br>                                term_g=compone[cant_g]<br>                                lccsal.append(-(term_a+term_c+term_t+ter
m_g))<br>        tail=window[0]<br>    return lccsal<br><br>def lcc_simp(seq,start,end):<br>    """Return LCC, a complexity measure from a sequence (seq.)"""<br>    wsize=end-start<br>    l2=math.log(2)<br>    window=seq[start:end]<br>    term_a=((count(window,'A'))/float(wsize))*((math.log((count(window,'A'))/float(wsize)))/l2)<br>    term_c=((count(window,'C'))/float(wsize))*((math.log((count(window,'C'))/float(wsize)))/l2)<br>    term_t=((count(window,'T'))/float(wsize))*((math.log((count(window,'T'))/float(wsize)))/l2)<br>    term_a=((count(window,'G'))/float(wsize))*((math.log((count(window,'G'))/float(wsize)))/l2)<br>    lccsal=-(term_a+term_c+term_t+term_g)<br>    return lccsal<br><br>while cur_record:<br>    cur_record=iterator.next()<br>    crom=crom+1<br>    if cur_record is None:<br>        break<br>    salida=open("C:\\bioinfo-adv\\blast\\data\\test\\smallcsw-"+str(crom)+".txt","w")<br>    <br>    lcc=lcc_mult(cur_record.sequence,17,0,len(cur_record.sequence))<br>
    for x in range(len(lcc)):<br>        salida.write(str("%.5f" %lcc[x])+'\n')<br>        <br>    salida.close()<br>          <br>entrada.close()<br><br></pre>
  </blockquote>
  <pre wrap=""><!----><br>--<br>Rune Linding				room	v101<br>EMBL - Biocomputing - Gibson Team	lab	+49 6221 387451<br>Meyerhofstrasse 1			fax	+49 6221 387517<br>D-69117 Heidelberg			cell	+49 1712 128951<br>Germany					<a class="moz-txt-link-freetext" href="http://www.EMBL-Heidelberg.DE/~linding">http://www.EMBL-Heidelberg.DE/~linding</a><br>Of all the animals, the boy is the most unmanageable.<br>		-- Plato<br>_______________________________________________<br>BioPython mailing list  -  <a class="moz-txt-link-abbreviated" href="mailto:BioPython@biopython.org">BioPython@biopython.org</a><br><a class="moz-txt-link-freetext" href="http://biopython.org/mailman/listinfo/biopython">http://biopython.org/mailman/listinfo/biopython</a><br></pre>
</blockquote>
<br>
<br>
<pre class="moz-signature" cols="$mailwrapcol">-- <br>Estienne Swart<br>SANBI, UWC Private Bag X17, Bellville 7535<br><a class="moz-txt-link-abbreviated" href="mailto:estienne@sanbi.ac.za">estienne@sanbi.ac.za</a><br>tel work: +27 21 959 3908<br>tel home: +27 21 448 8118<br>fax work: +27 21 959 2512<br></pre>
<br>
</body>
</html>

--------------020203060801040105020409--

--------------080008040609010108000501
Content-Type: text/plain;
 name="lcc.py"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="lcc.py"

#!/bin/env python2.2
__author__  = """$Author: estienne $"""
__date__    = """$Date: 2002/12/05 04:00:53 $"""
__version__ = """$Revision: 1.2 $"""
import sys, getopt, string
from math import log
from Bio import Seq, Fasta
from Bio.Alphabet import IUPAC
import re

#caveats: if there are ambiguous residues present in the sequence, eg. in poly-A-tails, AAAANAAA, 
#this method will calculate artificially high complexity scores. It may be possible to get more
#accurate estimates, by ignoring these positions entirely.
#This prog does not merge low-complexity regions like seg/dust supposedly do

class Complexity:
    """Class to calculate complexity given a sequence and window
    Ideally would like to inherit from Bio.Seq, TODO"""

    def __init__(self, seq, **kwds):
	"""could make individual kwds more explicit here,
	Currently the class is initiated as follows:
	Complexity(seq = record, window = arguments["window"], alphabet = IUPAC.IUPACUnambiguousDNA())"""

	self.__dict__.update(kwds)
	self.Seq = seq
	self.sequence = self.Seq.sequence.upper()
	self._calc_complexity_across_sequence()

    def _calc(self, seqportion):
	"""Calculate the complexity score here"""
	def factorial(n):
	    """A simple iterative factorial function
	    No range checking here yet."""
	    if n > 0:
		factorial = 1
		for i in range(1, n +1): factorial *= i
		return factorial
	    else: return 1
	    
	L = self.window
	L_factorial = factorial(L)
	multi_ni = 1
	for residue in self.alphabet.letters: multi_ni *= factorial(seqportion.count(residue))
	
	return (float(1) / L) * (log(float(L_factorial) / multi_ni)) / log(len(self.alphabet.letters))
    
    def _calc_complexity_across_sequence(self):
	"""Calculate complexity of all sequence residues using a sliding window"""
	self.complexity_list = []
	for i in range(0, len(self.sequence) - self.window + 1):
	    self.complexity_list.append(self._calc(self.sequence[i:i+self.window]))

	endbit = [self._calc(self.sequence[-self.window:])]
	for i in range(-1, -self.window + 1, -1):
	    endbit.append(self._calc(self.sequence[i-self.window:i]))
	endbit.reverse()
	
	self.complexity_list += endbit#need to calculate complexity scores for last few residues in the opposite direction
			#then add them on

    def mask_sequence(self, threshold = None, mask = None, joingap = None):
	"""returns a low-complexity masked sequence
	masked residues are those that fall below the threshold score
	Masking is applied by a constant function (would probably be better if
	a non-constant one was chosen)"""
	if not threshold:
	    if self.__dict__.has_key("threshold"): threshold = self.threshold
	if not mask:
	    if self.__dict__.has_key("mask"): mask = self.mask
	if not joingap:
	    if self.__dict__.has_key("joingap"): joingap = self.joingap

	self._calc_complexity_across_sequence()
	bits = []
	for i in range(0, len(self.sequence)):
	    if self.complexity_list[i] < threshold: bits.append(mask)
	    else: bits.append(self.sequence[i])
		
	self.maskedsequence = string.join(bits, "") #could convert this into a proper sequence object
	temp = []
	
	return self.maskedsequence

    def overmask_sequence(self, mask=None, joingap = None):
	"""this joins masked regions, so that there are no 'straggler' residues between the masked residues"""
	if not mask:
	    if self.__dict__.has_key("mask"): mask = self.mask
	if not joingap:
	    if self.__dict__.has_key("joingap"): joingap = self.joingap
	if not self.__dict__.has_key("maskedsequence"): self.mask_sequence(self.threshold, mask)

	p = re.compile("%s+.{1,%s}%s" % (mask, joingap, mask))
	#maybe this would be faster (when iterating through multiple fasta records)
	#if the pattern is compiled outside the class  ?!?
	
	self.overmaskedsequence = self.maskedsequence
	for i in range(0, 3):
	    #python regexp operators can't find overlapping patterns, so
	    #overmasking needs to be repeated a few times
	    m = p.findall(self.overmaskedsequence)
	    for match in m: self.overmaskedsequence = self.overmaskedsequence.replace(match, mask*len(match))
	
	return self.overmaskedsequence

    def __str__(self):
	if not self.__dict__.has_key("maskedsequence"): self.mask_sequence()
	if not self.__dict__.has_key("overmaskedsequence"): self.overmask_sequence()
	
	#Unmask line below to get printout of complexity scores for each seq position
	#astr = str("""complexity (at each position) for a window of %(window)s:\n%(complexity_list)s 
	astr = str("""
	\nSequence:\n%(sequence)s
	\nMasked with '%(mask)s' at threshold %(threshold)s:\n%(maskedsequence)s\nOvermasked\n%(overmaskedsequence)s""" 
	% (self.__dict__))
	return astr
	
def main():
    def usage():
	ustr =  """
	lcc.py -w <WINDOWSIZE> -i <INPUTFILE>	
	\nOPTIONAL arguments: -m <MASK> -t <THRESHOLD FOR MASKING in range 0 - 1> -a <ALPHABET, protein or dna(default)>
	-j <JOIN GAPS BETWEEN MASKED RESIDUES, SHORTER THAN x - default 12>
	e.g. lcc.py -a protein -i tester.fasta -w 12 -m X -t 0.4 -j 12
	"""
	print ustr
	sys.exit(2)

    args = sys.argv[1:]
    try:
	shortopts, longopts = getopt.getopt(args, 'a:j:m:i:w:t:vh', ["version", "stdin", "help"]) 
    except getopt.GetoptError:
	usage()
	
    arguments = {}    
    for opt, arg in shortopts:
	if opt == '-i': arguments["file"] = arg
	if opt == '-w': arguments["window"] = string.atoi(arg)
	if opt == '-a': arguments["type"] = arg
	if opt == '-m': arguments["mask"] = arg
	if opt == '-t': arguments["threshold"] = string.atof(arg)
	if opt == '-j': arguments["joingap"] = arg
	if opt == '--stdin': arguments["stdin"] = True #overrides readin from files for piping purposes
	if opt == '--help' or opt == '-h': usage()
	if opt == '-v' or opt == "--version": 
	    print __version__
#	    sys.exit()

    if not(arguments.has_key("file") or arguments.has_key("stdin")) or not arguments.has_key("window"): usage()

    if arguments.has_key("type"):
	if arguments["type"] == 'protein': alpha = IUPAC.IUPACProtein()
	elif arguments["type"] == 'dna': alpha = IUPAC.IUPACUnambiguousDNA()
    else: alpha = IUPAC.IUPACUnambiguousDNA()

    if arguments.has_key("threshold"): threshold = arguments["threshold"]
    else: threshold = 0.5

    if arguments.has_key("stdin"): fh = sys.stdin
    else: fh = open(arguments["file"])
    
    fiterator = Fasta.Iterator(fh, Fasta.RecordParser())

    while 1:
	record = fiterator.next()
	if record is None: break
    #    comp = Complexity(seq = record, window = arguments["window"], alphabet = IUPAC.IUPACUnambiguousDNA())
    #    x = comp._calc_complexity_across_sequence()
    #    print comp.complexity_list
    #    print comp.mask_sequence(0.2, 'N')
	c = Complexity(
		seq = record, 
		window = arguments["window"], 
		alphabet = alpha,
		threshold = threshold,
		mask = arguments["mask"],
		joingap = arguments["joingap"],
		)
	print c
    
    fh.close()
    
if __name__ == "__main__":
    main()

--------------080008040609010108000501--