[BioPython] LCC function coded.

Rune Linding - EMBL Biocomputing Unit linding@EMBL-Heidelberg.DE
Mon, 2 Dec 2002 20:07:36 +0100


hi
can i have a look at the code,iam planning to do this for
proteins...

:)
r.
On Mon, Dec 02, 2002 at 08:25:09AM -0300, Sebastian Bassi wrote:
> Hi Iddo,
> 
> I've just coded 2 functions to calculate LCC.
> lcc_mult and lcc_simp. Both calculates LCC (Local composition content),
> lcc_mult is optimized for speed in several runs. It returns a list of
> LCC values.
> lcc_simp is for 1 LCC value (you can get more calling the function using
> a for loop). If you need more LCC values than the size of the LCC
> window, is recomended lcc_mult.
> lcc_mult calculates in forehand all logs to save time. It also calculate
> the "next" window lcc by comparing the next nucleotide on the window
> (head) and the nucleotide that leaves the window (tail), if they are the
> same, the LCC is the same so it is not calculated.
> 
> lcc_mult(seq,wsize,start,end):
> seq: String of the sequence.
> wsize: Windows size
> start: Where to start "walking".
> end: Where to end "walking"
> it returns a list, all elements float values.
> 
> lcc_simp(seq,start,end):
> seq: String of the sequence.
> start: Where to start calculating LCC.
> end: Where to end calculating LCC.
> it returns one float value.
> 
> I did an informal benchmark and, for a large list of LCC values, 
> lcc_simp took 54 secs. and lcc_mult took just 12.
> 
> I don't know how to make a module, so I give the code here for you to
> adapt it according the BioPython standarts. I hopw it make it to the
> BioPython.
> 
> -- 
> Best regards,
> 
> //=\ Sebastian Bassi - Diplomado en Ciencia y Tecnologia, UNQ   //=\
> \=// IT Manager Advanta Seeds - Balcarce Research Center -      \=//
> //=\ Pro secretario ASALUP - www.asalup.org - PGP key available //=\
> \=// E-mail: sbassi@genesdigitales.com - ICQ UIN: 3356556 -     \=//
> 
>                Linux para todos: http://Linuxfacil.info
> 

> # idem version lcc3f2, pero el sliding windows hace que no haga que contar
> # todas las ocurrencias de las letras, solo sumar las nuevas.
> 
> from Bio import Fasta
> from string import count
> import math
> import time
> 
> 
> parser=Fasta.RecordParser()
> entrada=open("D:\\projects\\tesis\\fna-1small.nn","r")
> cur_record=1
> iterator=Fasta.Iterator(entrada,parser)
> crom=0
> compone=[0]
> lccsal=[0]
> initime=time.time()
> 
> def lcc_mult(seq,wsize,start,end):
>     """Return LCC, a complexity measure from a sequence, called seq."""
>     l2=math.log(2)
>     tamseq=end-start
>     global compone
>     global lccsal
>     for i in range(wsize):
>         compone.append(((i+1)/float(wsize))*((math.log((i+1)/float(wsize)))/l2))
>     window=seq[0:wsize]
>     cant_a=count(window,'A')
>     cant_c=count(window,'C')
>     cant_t=count(window,'T')
>     cant_g=count(window,'G')
>     term_a=compone[cant_a]
>     term_c=compone[cant_c]
>     term_t=compone[cant_t]
>     term_g=compone[cant_g]
>     lccsal[0]=(-(term_a+term_c+term_t+term_g))
>     tail=seq[0]
>     for x in range (tamseq-wsize):
>         window=seq[x+1:wsize+x+1]
>         if tail==window[-1]:
>             lccsal.append(lccsal[-1])
>             #break
>         else:
>             if tail=='A':
>                 cant_a=cant_a-1
>                 if window[-1]=='C':
>                     cant_c=cant_c+1
>                     term_a=compone[cant_a]
>                     term_c=compone[cant_c]
>                     lccsal.append(-(term_a+term_c+term_t+term_g))
>                 elif window[-1]=='T':
>                     cant_t=cant_t+1
>                     term_a=compone[cant_a]
>                     term_t=compone[cant_t]
>                     lccsal.append(-(term_a+term_c+term_t+term_g))
>                 elif window[-1]=='G':
>                     cant_g=cant_g+1
>                     term_a=compone[cant_a]
>                     term_g=compone[cant_g]
>                     lccsal.append(-(term_a+term_c+term_t+term_g))
>             else:
>                 if tail=='C':
>                     cant_c=cant_c-1
>                     if window[-1]=='A':
>                         cant_a=cant_a+1
>                         term_a=compone[cant_a]
>                         term_c=compone[cant_c]
>                         lccsal.append(-(term_a+term_c+term_t+term_g))
>                     elif window[-1]=='T':
>                         cant_t=cant_t+1
>                         term_c=compone[cant_c]
>                         term_t=compone[cant_t]
>                         lccsal.append(-(term_a+term_c+term_t+term_g))
>                     elif window[-1]=='G':
>                         cant_g=cant_g+1
>                         term_c=compone[cant_c]
>                         term_g=compone[cant_g]
>                         lccsal.append(-(term_a+term_c+term_t+term_g))
>                 
>                 else:
>                     if tail=='T':
>                         cant_t=cant_t-1
>                         if window[-1]=='A':
>                             cant_a=cant_a+1
>                             term_a=compone[cant_a]
>                             term_t=compone[cant_t]
>                             lccsal.append(-(term_a+term_c+term_t+term_g))
>                         elif window[-1]=='C':
>                             cant_c=cant_c+1
>                             term_c=compone[cant_c]
>                             term_t=compone[cant_t]
>                             lccsal.append(-(term_a+term_c+term_t+term_g))
>                         elif window[-1]=='G':
>                             cant_g=cant_g+1
>                             term_t=compone[cant_t]
>                             term_g=compone[cant_g]
>                             lccsal.append(-(term_a+term_c+term_t+term_g))
>                     else:
>                         if tail=='G':
>                             cant_g=cant_g-1
>                             if window[-1]=='A':
>                                 cant_a=cant_a+1
>                                 term_a=compone[cant_a]
>                                 term_g=compone[cant_g]
>                                 lccsal.append(-(term_a+term_c+term_t+term_g))
>                             elif window[-1]=='C':
>                                 cant_c=cant_c+1
>                                 term_c=compone[cant_c]
>                                 term_g=compone[cant_g]
>                                 lccsal.append(-(term_a+term_c+term_t+term_g))
>                             elif window[-1]=='T':
>                                 cant_t=cant_t+1
>                                 term_t=compone[cant_t]
>                                 term_g=compone[cant_g]
>                                 lccsal.append(-(term_a+term_c+term_t+term_g))
>         tail=window[0]
>     return lccsal
> 
> def lcc_simp(seq,start,end):
>     """Return LCC, a complexity measure from a sequence (seq.)"""
>     wsize=end-start
>     l2=math.log(2)
>     window=seq[start:end]
>     term_a=((count(window,'A'))/float(wsize))*((math.log((count(window,'A'))/float(wsize)))/l2)
>     term_c=((count(window,'C'))/float(wsize))*((math.log((count(window,'C'))/float(wsize)))/l2)
>     term_t=((count(window,'T'))/float(wsize))*((math.log((count(window,'T'))/float(wsize)))/l2)
>     term_a=((count(window,'G'))/float(wsize))*((math.log((count(window,'G'))/float(wsize)))/l2)
>     lccsal=-(term_a+term_c+term_t+term_g)
>     return lccsal
> 
> while cur_record:
>     cur_record=iterator.next()
>     crom=crom+1
>     if cur_record is None:
>         break
>     salida=open("C:\\bioinfo-adv\\blast\\data\\test\\smallcsw-"+str(crom)+".txt","w")
>     
>     lcc=lcc_mult(cur_record.sequence,17,0,len(cur_record.sequence))
>     for x in range(len(lcc)):
>         salida.write(str("%.5f" %lcc[x])+'\n')
>         
>     salida.close()
>           
> entrada.close()
> 

--
Rune Linding				room	v101
EMBL - Biocomputing - Gibson Team	lab	+49 6221 387451
Meyerhofstrasse 1			fax	+49 6221 387517
D-69117 Heidelberg			cell	+49 1712 128951
Germany					http://www.EMBL-Heidelberg.DE/~linding
Of all the animals, the boy is the most unmanageable.
		-- Plato