[BioPython] LCC function coded.

Sebastian Bassi sbassi@asalup.org
Mon, 02 Dec 2002 08:25:09 -0300


This is a multi-part message in MIME format.
--------------090906020902080406010104
Content-Type: text/plain; charset=us-ascii; format=flowed
Content-Transfer-Encoding: 7bit

Hi Iddo,

I've just coded 2 functions to calculate LCC.
lcc_mult and lcc_simp. Both calculates LCC (Local composition content),
lcc_mult is optimized for speed in several runs. It returns a list of
LCC values.
lcc_simp is for 1 LCC value (you can get more calling the function using
a for loop). If you need more LCC values than the size of the LCC
window, is recomended lcc_mult.
lcc_mult calculates in forehand all logs to save time. It also calculate
the "next" window lcc by comparing the next nucleotide on the window
(head) and the nucleotide that leaves the window (tail), if they are the
same, the LCC is the same so it is not calculated.

lcc_mult(seq,wsize,start,end):
seq: String of the sequence.
wsize: Windows size
start: Where to start "walking".
end: Where to end "walking"
it returns a list, all elements float values.

lcc_simp(seq,start,end):
seq: String of the sequence.
start: Where to start calculating LCC.
end: Where to end calculating LCC.
it returns one float value.

I did an informal benchmark and, for a large list of LCC values, 
lcc_simp took 54 secs. and lcc_mult took just 12.

I don't know how to make a module, so I give the code here for you to
adapt it according the BioPython standarts. I hopw it make it to the
BioPython.

-- 
Best regards,

//=\ Sebastian Bassi - Diplomado en Ciencia y Tecnologia, UNQ   //=\
\=// IT Manager Advanta Seeds - Balcarce Research Center -      \=//
//=\ Pro secretario ASALUP - www.asalup.org - PGP key available //=\
\=// E-mail: sbassi@genesdigitales.com - ICQ UIN: 3356556 -     \=//

                Linux para todos: http://Linuxfacil.info


--------------090906020902080406010104
Content-Type: text/plain;
 name="lcc3f2sw.py"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="lcc3f2sw.py"

# idem version lcc3f2, pero el sliding windows hace que no haga que contar
# todas las ocurrencias de las letras, solo sumar las nuevas.

from Bio import Fasta
from string import count
import math
import time


parser=Fasta.RecordParser()
entrada=open("D:\\projects\\tesis\\fna-1small.nn","r")
cur_record=1
iterator=Fasta.Iterator(entrada,parser)
crom=0
compone=[0]
lccsal=[0]
initime=time.time()

def lcc_mult(seq,wsize,start,end):
    """Return LCC, a complexity measure from a sequence, called seq."""
    l2=math.log(2)
    tamseq=end-start
    global compone
    global lccsal
    for i in range(wsize):
        compone.append(((i+1)/float(wsize))*((math.log((i+1)/float(wsize)))/l2))
    window=seq[0:wsize]
    cant_a=count(window,'A')
    cant_c=count(window,'C')
    cant_t=count(window,'T')
    cant_g=count(window,'G')
    term_a=compone[cant_a]
    term_c=compone[cant_c]
    term_t=compone[cant_t]
    term_g=compone[cant_g]
    lccsal[0]=(-(term_a+term_c+term_t+term_g))
    tail=seq[0]
    for x in range (tamseq-wsize):
        window=seq[x+1:wsize+x+1]
        if tail==window[-1]:
            lccsal.append(lccsal[-1])
            #break
        else:
            if tail=='A':
                cant_a=cant_a-1
                if window[-1]=='C':
                    cant_c=cant_c+1
                    term_a=compone[cant_a]
                    term_c=compone[cant_c]
                    lccsal.append(-(term_a+term_c+term_t+term_g))
                elif window[-1]=='T':
                    cant_t=cant_t+1
                    term_a=compone[cant_a]
                    term_t=compone[cant_t]
                    lccsal.append(-(term_a+term_c+term_t+term_g))
                elif window[-1]=='G':
                    cant_g=cant_g+1
                    term_a=compone[cant_a]
                    term_g=compone[cant_g]
                    lccsal.append(-(term_a+term_c+term_t+term_g))
            else:
                if tail=='C':
                    cant_c=cant_c-1
                    if window[-1]=='A':
                        cant_a=cant_a+1
                        term_a=compone[cant_a]
                        term_c=compone[cant_c]
                        lccsal.append(-(term_a+term_c+term_t+term_g))
                    elif window[-1]=='T':
                        cant_t=cant_t+1
                        term_c=compone[cant_c]
                        term_t=compone[cant_t]
                        lccsal.append(-(term_a+term_c+term_t+term_g))
                    elif window[-1]=='G':
                        cant_g=cant_g+1
                        term_c=compone[cant_c]
                        term_g=compone[cant_g]
                        lccsal.append(-(term_a+term_c+term_t+term_g))
                
                else:
                    if tail=='T':
                        cant_t=cant_t-1
                        if window[-1]=='A':
                            cant_a=cant_a+1
                            term_a=compone[cant_a]
                            term_t=compone[cant_t]
                            lccsal.append(-(term_a+term_c+term_t+term_g))
                        elif window[-1]=='C':
                            cant_c=cant_c+1
                            term_c=compone[cant_c]
                            term_t=compone[cant_t]
                            lccsal.append(-(term_a+term_c+term_t+term_g))
                        elif window[-1]=='G':
                            cant_g=cant_g+1
                            term_t=compone[cant_t]
                            term_g=compone[cant_g]
                            lccsal.append(-(term_a+term_c+term_t+term_g))
                    else:
                        if tail=='G':
                            cant_g=cant_g-1
                            if window[-1]=='A':
                                cant_a=cant_a+1
                                term_a=compone[cant_a]
                                term_g=compone[cant_g]
                                lccsal.append(-(term_a+term_c+term_t+term_g))
                            elif window[-1]=='C':
                                cant_c=cant_c+1
                                term_c=compone[cant_c]
                                term_g=compone[cant_g]
                                lccsal.append(-(term_a+term_c+term_t+term_g))
                            elif window[-1]=='T':
                                cant_t=cant_t+1
                                term_t=compone[cant_t]
                                term_g=compone[cant_g]
                                lccsal.append(-(term_a+term_c+term_t+term_g))
        tail=window[0]
    return lccsal

def lcc_simp(seq,start,end):
    """Return LCC, a complexity measure from a sequence (seq.)"""
    wsize=end-start
    l2=math.log(2)
    window=seq[start:end]
    term_a=((count(window,'A'))/float(wsize))*((math.log((count(window,'A'))/float(wsize)))/l2)
    term_c=((count(window,'C'))/float(wsize))*((math.log((count(window,'C'))/float(wsize)))/l2)
    term_t=((count(window,'T'))/float(wsize))*((math.log((count(window,'T'))/float(wsize)))/l2)
    term_a=((count(window,'G'))/float(wsize))*((math.log((count(window,'G'))/float(wsize)))/l2)
    lccsal=-(term_a+term_c+term_t+term_g)
    return lccsal

while cur_record:
    cur_record=iterator.next()
    crom=crom+1
    if cur_record is None:
        break
    salida=open("C:\\bioinfo-adv\\blast\\data\\test\\smallcsw-"+str(crom)+".txt","w")
    
    lcc=lcc_mult(cur_record.sequence,17,0,len(cur_record.sequence))
    for x in range(len(lcc)):
        salida.write(str("%.5f" %lcc[x])+'\n')
        
    salida.close()
          
entrada.close()


--------------090906020902080406010104--