[BioPython] Comparing short sequences against a large fasta file

Wed Oct 27 01:23:05 EDT 2004

Hi 

I have made some advancement in writing up a script
for searching short sequences against a fasta format
file 9with protein coding genes).

Basically the program takes an input file and the
length of the sequence to be searched (say 10mer or
so) from the command line, and then takes the first
10mer of the first sequence in the file and searches
the entire file for that 10mer, then moves one base
forward and searches the next 10 mer and so on.

I have used python dictionaries to keep the sequence
names and the locations of oligos (a call to keys
(sequence names) prints out the values (location of
oligo) of the keys). The problem I am facing is that
the program works fine on a smaller version of file if
the length of the sequence to be searched i 10 mer,
but fails on the entire file 9with 3000 or so
sequences). It does work on larger file too but I have
to reduce the length of the seqeunce to only 5 bases.

I think it is either the issue of memory allocation or
may be the maximum length of dictionary. I would have
thought that the memory to the dictionary is
dynamically allocated. your suggestions will be very
much appreciated.

here's the code:

=============================
from string import *
import os, sys, string
from Bio.SeqIO import SeqIO, Seq, Dictionaries, FASTA
from Bio import Fasta

def findOlig(dna, prim):
    loc = []
    count = 0

    site = dna.find(prim)
    while site != -1:
        loc.append(site)
        site = dna.find(prim, site + 1)
        count += 1
    return loc #prim, count 

#    print 'primer sequence: ', prim, 'no of times
present:', count, 'oligo site:', loc

#to get the reverse of a given sequence
def revSeq(dna):
    myseq = list(dna)
    myseq.reverse()

    return join(myseq, '')

#to get the reverse complement of a sequence
def revComp(dna):
    comp = dna.translate(maketrans("AGCTagct",
"TCGAtcga"))
    lcomp = list(comp)
    lcomp.reverse()

    return join(lcomp, '')

def gcContent(dna):
    '''This method finds gc percent of a given
sequence '''
    length = len(dna)
    count_gc = count(dna, 'g') + count(dna, 'G') +
count(dna, 'c') + count(dna, 'C')
    gc = float(count_gc)/length *100
    return gc

def main():
    """
    Reads sequences from a file in fasta format,
calculates the GC content of the sequences
    in the file. Finds sliding nmers (reads nmers from
the stndin and compares them with the
    sequences in the file.
    """

    infile = sys.argv[1]
    nmer = atoi(sys.argv[2])

    handle = open(infile)
    it = FASTA.FastaReader(handle)

    seqs = []
    for seq in it:
        seqs.append(seq)
    handle.close()

    # Calculates GC contents of the sequences in the
file
    c = 0
    seq_gc = 0.0
    for seq in seqs:
        seq_gc = seq_gc +
(gcContent(seq.seq.tostring()))
        c = c+1

    total_gc = (seq_gc)/c
#    print total_gc

    num_seqs = 0

    for seq in seqs:
        #find the nmers starting from first sequence
in the file
        for i in range(len(seq.seq.tostring())):
            prim = seq.seq.tostring()[i:i+nmer]

            name_seq = ''
            location = []
            prim_p = []
            olig_p = 0.0
            percent_p = 0.0
            s1_dict = {}
            s2_dict = {}
            seq_dict = {}

            # stops when the oligo sequence is smaller
than the input nmer
            if len(prim) < nmer:
                break

            # search for the defined oligo in all the
sequences in the file one by one
            for myseq in seqs:
                f_olig =
findOlig(myseq.seq.tostring(),prim)

                # work only with sequences where the
oligo is found
                if f_olig != []:
                    olig_p+=1

                    name_seq = myseq.name[0]
                    location = f_olig

                    # stores the names and locations
of the seqeunces and oligo being searched
                    # in a dictionary
                    s1_dict = {name_seq:location}
                    seq_dict.update(s1_dict)

            # calculates the % of sequences with the
oligo being searched
            percent_p = olig_p/len(seqs)*100

            # and reports only those oligos present in
>=90% of sequences
            if percent_p >= 90.0:
                print prim#, ",", i,"GC content:",
gcContent(prim)
#                print "seq name and primer location
are: ", seq_dict
#                print "primer ", i, "present in ",
olig_p, "seqeunces"
#                print "primer ", i, "present in ",
percent_p, "percent seqs"

                # prints out the keys and values of
the dictionary
                s2_dict = seq_dict
                s_keys = []
                s_keys = s2_dict.keys()
#                print s_keys

                for j in range(len(s_keys)):
                    id_j = ''
                    try:
                        while s_keys!=-1:
                            id_j = s_keys.pop(0)
                            print id_j, ",",
s2_dict[id_j]

                    except IndexError:
                        pass

if __name__ == '__main__':
    main()

=============================

Thanks

JA

__________________________________
Do you Yahoo!?
Yahoo! Mail Address AutoComplete - You start. We finish.
http://promotions.yahoo.com/new_mail