[BioPython] Comparing short sequences against a large fasta file
Bzy Bee
nomy2020 at yahoo.com
Wed Oct 27 01:23:05 EDT 2004
Hi
I have made some advancement in writing up a script
for searching short sequences against a fasta format
file 9with protein coding genes).
Basically the program takes an input file and the
length of the sequence to be searched (say 10mer or
so) from the command line, and then takes the first
10mer of the first sequence in the file and searches
the entire file for that 10mer, then moves one base
forward and searches the next 10 mer and so on.
I have used python dictionaries to keep the sequence
names and the locations of oligos (a call to keys
(sequence names) prints out the values (location of
oligo) of the keys). The problem I am facing is that
the program works fine on a smaller version of file if
the length of the sequence to be searched i 10 mer,
but fails on the entire file 9with 3000 or so
sequences). It does work on larger file too but I have
to reduce the length of the seqeunce to only 5 bases.
I think it is either the issue of memory allocation or
may be the maximum length of dictionary. I would have
thought that the memory to the dictionary is
dynamically allocated. your suggestions will be very
much appreciated.
here's the code:
=============================
from string import *
import os, sys, string
from Bio.SeqIO import SeqIO, Seq, Dictionaries, FASTA
from Bio import Fasta
def findOlig(dna, prim):
loc = []
count = 0
site = dna.find(prim)
while site != -1:
loc.append(site)
site = dna.find(prim, site + 1)
count += 1
return loc #prim, count
# print 'primer sequence: ', prim, 'no of times
present:', count, 'oligo site:', loc
#to get the reverse of a given sequence
def revSeq(dna):
myseq = list(dna)
myseq.reverse()
return join(myseq, '')
#to get the reverse complement of a sequence
def revComp(dna):
comp = dna.translate(maketrans("AGCTagct",
"TCGAtcga"))
lcomp = list(comp)
lcomp.reverse()
return join(lcomp, '')
def gcContent(dna):
'''This method finds gc percent of a given
sequence '''
length = len(dna)
count_gc = count(dna, 'g') + count(dna, 'G') +
count(dna, 'c') + count(dna, 'C')
gc = float(count_gc)/length *100
return gc
def main():
"""
Reads sequences from a file in fasta format,
calculates the GC content of the sequences
in the file. Finds sliding nmers (reads nmers from
the stndin and compares them with the
sequences in the file.
"""
infile = sys.argv[1]
nmer = atoi(sys.argv[2])
handle = open(infile)
it = FASTA.FastaReader(handle)
seqs = []
for seq in it:
seqs.append(seq)
handle.close()
# Calculates GC contents of the sequences in the
file
c = 0
seq_gc = 0.0
for seq in seqs:
seq_gc = seq_gc +
(gcContent(seq.seq.tostring()))
c = c+1
total_gc = (seq_gc)/c
# print total_gc
num_seqs = 0
for seq in seqs:
#find the nmers starting from first sequence
in the file
for i in range(len(seq.seq.tostring())):
prim = seq.seq.tostring()[i:i+nmer]
name_seq = ''
location = []
prim_p = []
olig_p = 0.0
percent_p = 0.0
s1_dict = {}
s2_dict = {}
seq_dict = {}
# stops when the oligo sequence is smaller
than the input nmer
if len(prim) < nmer:
break
# search for the defined oligo in all the
sequences in the file one by one
for myseq in seqs:
f_olig =
findOlig(myseq.seq.tostring(),prim)
# work only with sequences where the
oligo is found
if f_olig != []:
olig_p+=1
name_seq = myseq.name[0]
location = f_olig
# stores the names and locations
of the seqeunces and oligo being searched
# in a dictionary
s1_dict = {name_seq:location}
seq_dict.update(s1_dict)
# calculates the % of sequences with the
oligo being searched
percent_p = olig_p/len(seqs)*100
# and reports only those oligos present in
>=90% of sequences
if percent_p >= 90.0:
print prim#, ",", i,"GC content:",
gcContent(prim)
# print "seq name and primer location
are: ", seq_dict
# print "primer ", i, "present in ",
olig_p, "seqeunces"
# print "primer ", i, "present in ",
percent_p, "percent seqs"
# prints out the keys and values of
the dictionary
s2_dict = seq_dict
s_keys = []
s_keys = s2_dict.keys()
# print s_keys
for j in range(len(s_keys)):
id_j = ''
try:
while s_keys!=-1:
id_j = s_keys.pop(0)
print id_j, ",",
s2_dict[id_j]
except IndexError:
pass
if __name__ == '__main__':
main()
=============================
Thanks
JA
__________________________________
Do you Yahoo!?
Yahoo! Mail Address AutoComplete - You start. We finish.
http://promotions.yahoo.com/new_mail
More information about the BioPython
mailing list