[BioPython] LCC function coded.
Rune Linding - EMBL Biocomputing Unit
linding@EMBL-Heidelberg.DE
Mon, 2 Dec 2002 20:07:36 +0100
hi
can i have a look at the code,iam planning to do this for
proteins...
:)
r.
On Mon, Dec 02, 2002 at 08:25:09AM -0300, Sebastian Bassi wrote:
> Hi Iddo,
>
> I've just coded 2 functions to calculate LCC.
> lcc_mult and lcc_simp. Both calculates LCC (Local composition content),
> lcc_mult is optimized for speed in several runs. It returns a list of
> LCC values.
> lcc_simp is for 1 LCC value (you can get more calling the function using
> a for loop). If you need more LCC values than the size of the LCC
> window, is recomended lcc_mult.
> lcc_mult calculates in forehand all logs to save time. It also calculate
> the "next" window lcc by comparing the next nucleotide on the window
> (head) and the nucleotide that leaves the window (tail), if they are the
> same, the LCC is the same so it is not calculated.
>
> lcc_mult(seq,wsize,start,end):
> seq: String of the sequence.
> wsize: Windows size
> start: Where to start "walking".
> end: Where to end "walking"
> it returns a list, all elements float values.
>
> lcc_simp(seq,start,end):
> seq: String of the sequence.
> start: Where to start calculating LCC.
> end: Where to end calculating LCC.
> it returns one float value.
>
> I did an informal benchmark and, for a large list of LCC values,
> lcc_simp took 54 secs. and lcc_mult took just 12.
>
> I don't know how to make a module, so I give the code here for you to
> adapt it according the BioPython standarts. I hopw it make it to the
> BioPython.
>
> --
> Best regards,
>
> //=\ Sebastian Bassi - Diplomado en Ciencia y Tecnologia, UNQ //=\
> \=// IT Manager Advanta Seeds - Balcarce Research Center - \=//
> //=\ Pro secretario ASALUP - www.asalup.org - PGP key available //=\
> \=// E-mail: sbassi@genesdigitales.com - ICQ UIN: 3356556 - \=//
>
> Linux para todos: http://Linuxfacil.info
>
> # idem version lcc3f2, pero el sliding windows hace que no haga que contar
> # todas las ocurrencias de las letras, solo sumar las nuevas.
>
> from Bio import Fasta
> from string import count
> import math
> import time
>
>
> parser=Fasta.RecordParser()
> entrada=open("D:\\projects\\tesis\\fna-1small.nn","r")
> cur_record=1
> iterator=Fasta.Iterator(entrada,parser)
> crom=0
> compone=[0]
> lccsal=[0]
> initime=time.time()
>
> def lcc_mult(seq,wsize,start,end):
> """Return LCC, a complexity measure from a sequence, called seq."""
> l2=math.log(2)
> tamseq=end-start
> global compone
> global lccsal
> for i in range(wsize):
> compone.append(((i+1)/float(wsize))*((math.log((i+1)/float(wsize)))/l2))
> window=seq[0:wsize]
> cant_a=count(window,'A')
> cant_c=count(window,'C')
> cant_t=count(window,'T')
> cant_g=count(window,'G')
> term_a=compone[cant_a]
> term_c=compone[cant_c]
> term_t=compone[cant_t]
> term_g=compone[cant_g]
> lccsal[0]=(-(term_a+term_c+term_t+term_g))
> tail=seq[0]
> for x in range (tamseq-wsize):
> window=seq[x+1:wsize+x+1]
> if tail==window[-1]:
> lccsal.append(lccsal[-1])
> #break
> else:
> if tail=='A':
> cant_a=cant_a-1
> if window[-1]=='C':
> cant_c=cant_c+1
> term_a=compone[cant_a]
> term_c=compone[cant_c]
> lccsal.append(-(term_a+term_c+term_t+term_g))
> elif window[-1]=='T':
> cant_t=cant_t+1
> term_a=compone[cant_a]
> term_t=compone[cant_t]
> lccsal.append(-(term_a+term_c+term_t+term_g))
> elif window[-1]=='G':
> cant_g=cant_g+1
> term_a=compone[cant_a]
> term_g=compone[cant_g]
> lccsal.append(-(term_a+term_c+term_t+term_g))
> else:
> if tail=='C':
> cant_c=cant_c-1
> if window[-1]=='A':
> cant_a=cant_a+1
> term_a=compone[cant_a]
> term_c=compone[cant_c]
> lccsal.append(-(term_a+term_c+term_t+term_g))
> elif window[-1]=='T':
> cant_t=cant_t+1
> term_c=compone[cant_c]
> term_t=compone[cant_t]
> lccsal.append(-(term_a+term_c+term_t+term_g))
> elif window[-1]=='G':
> cant_g=cant_g+1
> term_c=compone[cant_c]
> term_g=compone[cant_g]
> lccsal.append(-(term_a+term_c+term_t+term_g))
>
> else:
> if tail=='T':
> cant_t=cant_t-1
> if window[-1]=='A':
> cant_a=cant_a+1
> term_a=compone[cant_a]
> term_t=compone[cant_t]
> lccsal.append(-(term_a+term_c+term_t+term_g))
> elif window[-1]=='C':
> cant_c=cant_c+1
> term_c=compone[cant_c]
> term_t=compone[cant_t]
> lccsal.append(-(term_a+term_c+term_t+term_g))
> elif window[-1]=='G':
> cant_g=cant_g+1
> term_t=compone[cant_t]
> term_g=compone[cant_g]
> lccsal.append(-(term_a+term_c+term_t+term_g))
> else:
> if tail=='G':
> cant_g=cant_g-1
> if window[-1]=='A':
> cant_a=cant_a+1
> term_a=compone[cant_a]
> term_g=compone[cant_g]
> lccsal.append(-(term_a+term_c+term_t+term_g))
> elif window[-1]=='C':
> cant_c=cant_c+1
> term_c=compone[cant_c]
> term_g=compone[cant_g]
> lccsal.append(-(term_a+term_c+term_t+term_g))
> elif window[-1]=='T':
> cant_t=cant_t+1
> term_t=compone[cant_t]
> term_g=compone[cant_g]
> lccsal.append(-(term_a+term_c+term_t+term_g))
> tail=window[0]
> return lccsal
>
> def lcc_simp(seq,start,end):
> """Return LCC, a complexity measure from a sequence (seq.)"""
> wsize=end-start
> l2=math.log(2)
> window=seq[start:end]
> term_a=((count(window,'A'))/float(wsize))*((math.log((count(window,'A'))/float(wsize)))/l2)
> term_c=((count(window,'C'))/float(wsize))*((math.log((count(window,'C'))/float(wsize)))/l2)
> term_t=((count(window,'T'))/float(wsize))*((math.log((count(window,'T'))/float(wsize)))/l2)
> term_a=((count(window,'G'))/float(wsize))*((math.log((count(window,'G'))/float(wsize)))/l2)
> lccsal=-(term_a+term_c+term_t+term_g)
> return lccsal
>
> while cur_record:
> cur_record=iterator.next()
> crom=crom+1
> if cur_record is None:
> break
> salida=open("C:\\bioinfo-adv\\blast\\data\\test\\smallcsw-"+str(crom)+".txt","w")
>
> lcc=lcc_mult(cur_record.sequence,17,0,len(cur_record.sequence))
> for x in range(len(lcc)):
> salida.write(str("%.5f" %lcc[x])+'\n')
>
> salida.close()
>
> entrada.close()
>
--
Rune Linding room v101
EMBL - Biocomputing - Gibson Team lab +49 6221 387451
Meyerhofstrasse 1 fax +49 6221 387517
D-69117 Heidelberg cell +49 1712 128951
Germany http://www.EMBL-Heidelberg.DE/~linding
Of all the animals, the boy is the most unmanageable.
-- Plato