###########
'''
Copyright (c) 2008, Bruce Southey
All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
    * Neither the name of the author nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
'''
###########


import re

# (GCT|GCC|GCA|GCG)(TTT|TTC)(TTA|TTG|CTT|CTC|CTA|CTG)(TTT|TTC)(CAA|CAG)(CCT|CCC|CCA|CCG)(CAA|CAG)(CGT|CGC|CGA|CGG|AGA|AGG)(TTT|TTC)(GGT|GGC|GGA|GGG)(CGT|CGC|CGA|CGG|AGA|AGG)
def get_dna_list(reg_seq):
	'''Returns a list of all possible sequences from a regular expression encoding using the ActiveState recipe 496807 at http://code.activestate.com/recipes/496807/'''
	#First convert the regular expression to a list of lists
	reg_list=[]
	start=reg_seq.find('(')
	end=reg_seq.find(')', start)
	while start != -1:
		match=reg_seq[start+1:end]
		reg_list.append(match.split('|'))
		start=reg_seq.find('(', end)
		end=reg_seq.find(')', start)
	#Now perform recipe, note comment about using list comprehension is wrong
	list_seq=[[]]
	for x in reg_list:
		tmp_list = []
		for y in x:
			for i in list_seq:
				tmp_list.append(i+[y])
		list_seq = tmp_list
	#Now convert sub-lists into a list of sequences
	final_seq=[]
	for seq in list_seq:
		big_seq=''.join(seq)
		final_seq.append(big_seq)
	return final_seq

def rt_re_find(express, sequences):
	'''Perform a regular expression search of a list of fasta sequences for supplied regular expression encoding and returns the matching DNA sequence'''
	regexp = re.compile(express, re.I)
	fastadb=sequences.split('>') #quick and dirty approach
	MatchList=[]
	MatchName=[]
	for entry in fastadb[1:]:
		#Get fasta title and sequence
		elist=entry.split('\n')
		title=elist[0]
		sequence=''.join(elist[1:])
		
		foundlist=regexp.findall(sequence)
		nfound=len(foundlist)
		if nfound > 0:
			for nf in range(nfound):
				tpf=foundlist[nf]
				smatch=''
				for nt in range(len(tpf)):
					smatch=smatch+tpf[nt]
				MatchList.append(smatch)
				MatchName.append(title)
	return MatchName, MatchList

#Create a dictionary for translation
dna_aa={
'GCT':'A', 'GCC':'A', 'GCA':'A', 'GCG':'A', 'TTA':'L', 'TTG':'L', 'CTT':'L', 'CTC':'L', 'CTA':'L', 'CTG':'L',
'CGT':'R', 'CGC':'R' ,'CGA':'R', 'CGG':'R', 'AGA':'R', 'AGG':'R', 'AAA':'K', 'AAG':'K', 'AAT':'N', 'AAC':'N',
'ATG':'M', 'GAT':'D', 'GAC':'D', 'TTT':'F', 'TTC':'F', 'TGT':'C', 'TGC':'C', 'CCT':'P', 'CCC':'P', 'CCA':'P', 
'CCG':'P', 'CAA':'Q', 'CAG':'Q', 'TCT':'S', 'TCC':'S', 'TCA':'S', 'TCG':'S', 'AGT':'S', 'AGC':'S', 'GAA':'E',
'GAG':'E', 'ACT':'T', 'ACC':'T', 'ACA':'T', 'ACG':'T', 'GGT':'G', 'GGC':'G', 'GGA':'G', 'GGG':'G', 'TGG':'W',
'CAT':'H', 'CAC':'H', 'TAT':'Y', 'TAC':'Y', 'ATT':'I', 'ATC':'I', 'ATA':'I', 'GTT':'V', 'GTC':'V', 'GTA':'V',
'GTG':'V' 
}
#Create a dictionary for back translation of the amino acid into a regular expression
rt_dna={'A':'(GCT|GCC|GCA|GCG)', 'L':'(TTA|TTG|CTT|CTC|CTA|CTG)', 'R':'(CGT|CGC|CGA|CGG|AGA|AGG)',
        'K':'(AAA|AAG)', 'N':'(AAT|AAC)', 'M':'(ATG)', 'D':'(GAT|GAC)', 'F':'(TTT|TTC)', 'C':'(TGT|TGC)',
        'P':'(CCT|CCC|CCA|CCG)', 'Q':'(CAA|CAG)', 'S':'(TCT|TCC|TCA|TCG|AGT|AGC)', 'E':'(GAA|GAG)',
        'T':'(ACT|ACC|ACA|ACG)', 'G':'(GGT|GGC|GGA|GGG)', 'W':'(TGG)', 'H':'(CAT|CAC)', 'Y':'(TAT|TAC)',
        'I':'(ATT|ATC|ATA)', 'V':'(GTT|GTC|GTA|GTG)'
        }
##Create a dictionary for back translation of the amino acid into ambiguous codons
rt_ambig={'A' : 'GCN', 'L' : 'YTN', 'R' : 'MGV', 'K' : 'AAR', 'N' : 'AAY', 'M' : 'ATG', 'D' : 'GAY',
          'F' : 'TTY', 'C' : 'TGY', 'P' : 'CCN', 'Q' : 'CAR', 'S' : 'WSN', 'E' : 'GAR', 'T' : 'ACN',
          'G' : 'GGN', 'W' : 'TGG', 'H' : 'CAY', 'Y' : 'TAY', 'I' : 'ATH', 'V' : 'GTN'}
#create reverse dictionaries 
dna_rt={}
for aa, rdna in rt_dna.iteritems():
	dna_rt.update({rdna:aa})
ambig_rt={}
for aa, rdna in rt_ambig.iteritems():
	ambig_rt.update({rdna:aa})

def aa2dna(express):
	#print express
	dna=''
	amdna=''
	for aminoacid in express:
		#print aminoacid
		dval=rt_dna.get(aminoacid, '(NNN)')
		dna=dna + dval
		mval=rt_ambig.get(aminoacid, 'NNN')
		amdna=amdna + mval
	return dna, amdna

def dna2aa(express):
	start=express.find('(')
	end=express.find(')', start)
	translation=''
	while start != -1:
		match=express[start:end+1]
		taa=dna_rt.get(match, 'X')
		translation= translation + taa
		start=express.find('(', end)
		end=express.find(')', start)
	return translation
		
#Function for standard translation of dna sequence to an amino acid sequence
def dna2aa(express):
	'''Converts a DNA sequence given as a string into an amino acid sequence given as a string. No ambiguous codons used.'''
	start=0
	end=start+3
	translation=''
	for i in range(0,len(express),3):
		match=express[i:i+3]
		taa=dna_aa.get(match, 'X')
		translation= translation + taa
	return translation
		
def ambig2aa(express):
	'''Converts a ambiguous DNA sequence given as a string into an amino acid sequence given as a string. No ambiguous codons used.'''
	start=0
	end=start+3
	translation=''
	for i in range(0,len(express),3):
		match=express[i:i+3]
		taa=ambig_rt.get(match, 'X')
		translation= translation + taa
	return translation
		
#Just some protein and DNA sequences to explore
aa_sequences='''>sp|O15130|NPFF_HUMAN FMRFamide-related peptides OS=Homo sapiens GN=NPFF PE=2 SV=1
MDSRQAAALLVLLLLIDGGCAEGPGGQQEDQLSAEEDSEPLPPQDAQTSGSLLHYLLQAMERPGRSQAFLFQPQRFGRNTQGSWRNEWLSPRAGEGLNSQFWSLAAPQRFGKK
>sp|Q9WVA9|NPFF_RAT FMRFamide-related peptides OS=Rattus norvegicus GN=Npff PE=3 SV=1
MDSKWAAVLLLLLLLRNWGHAEEAGSWGEDQVFAEEDKGPHPSQYAHTPDRIQTPGSLMRVLLQAMERPRRNPAFLFQPQRFGRNAWGPWSKEQLSPQAREFWSLAAPQRFGKK
>sp|Q9TUX7|NPFF_BOVIN FMRFamide-related peptides OS=Bos taurus GN=NPFF PE=3 SV=1
MDARQAAALLLVLLLVTDWSHAEGPGGRDGGDQIFMEEDSGAHPAQDAQTPRSLLRSLLQAMQRPGRSPAFLFQPQRFGRNTRGSWSNKRLSPRAGEGLSSPFWSLAAPQRFGKK
>sp|Q9WVA8|NPFF_MOUSE FMRFamide-related peptides OS=Mus musculus GN=Npff PE=2 SV=1
MDSKWAALLLLLLLLLNWGHTEEAGSWGEDQVFAGEDKGPHPPQYAHIPDRIQTPGSLFRVLLQAMDTPRRSPAFLFQPQRFGRSAWGSWSKEQLNPQARQFWSLAAPQRFGKK
'''
dna_sequences='''
>embl|AF005271|AF005271 Homo sapiens FMRFamide-related prepropeptide mRNA, complete cds. ...
catgaagtcctgggggcgccat
gggaggagatcccaggtggctc
ctaatgagccctgcatttcatttgcctgctctagattcccctaaggctactgtgaggctgggggtgggggaacagcaggtataagaggttggggtggctgtaggagggtaggtggcagcatggattctaggcaggctgctgcactgctggtgctgctgctgttaatagacgggggctgtgctgaagggccaggaggccagcaggaagaccagctctccgcggaggaagacagcgaacccctcccaccacaggatgcccagacctctgggtcactgttgcactacctgctccaggcaatggagagacctggccggagccaagccttcctgtttcagccccagaggtttggcagaaatacccagggatcctggaggaatgaatggctgagtccccgggctggagaggggctgaattcccagttctggagcctggctgcccctcaacgctttgggaagaagtgacatgtcatcccttgatatgtctgcatgcaaggtccacacccaaaagtgtcaatgtttgccccccaaataaaattgtctggcttctg
>embl|AF148700|AF148700 Rattus norvegicus NPFF precursor (NPFF) mRNA, complete cds. 
ggcagatggcagcatggattccaagtgggctgctgtgctgctgctactgctgctgctgaggaactggggccacgctgaagaggcagggagctggggtgaagaccaagtctttgcagaagaagataagggaccccacccatcacagtatgcccacactccagacaggatccagactcctgggtccctcatgcgtgttctgctccaggccatggagagacctagaaggaacccagccttcttgtttcagccccagaggtttggcagaaatgcttgggggccctggagcaaggagcagctaagtccccaggctagagagttctggagcctggccgctcctcagcgctttgggaagaagtaacatcatctgctgtgatatgtctgcatgcaaaatcacttccccacattccatgtgtgcccccaaataaagtggtctggcttctgcaaaaaaaaaa
>embl|AF148699|AF148699 Bos taurus NPFF precursor (NPFF) mRNA, complete cds. 
gagatggcagcatggacgccaggcaggcagcagcgctgctgctggtgctgctgttagtaacagactggagccacgctgaaggaccagggggccgggatggaggagaccagatcttcatggaggaagacagcggcgcccacccagcacaggatgctcagacccctaggtcgctcctgcgctccctgctccaggccatgcagagacccggccggagtccagccttcctgtttcagccccagaggtttggcagaaacacccggggctcctggagcaacaaaagactgagtcctcgagctggagaggggctgagttcccccttttggagcctggctgcccctcagcgctttgggaagaagtgacatgtcatcctctgaggtctgcatgcaaggcccacgccccaaactgccacgtgtcccctgcaaataaagctgtctggcttctgaaaaaaaaaa
>embl|AF148701|AF148701 Mus musculus NPFF precursor (Npff) mRNA, complete cds. 
ggcagatggcagcatggactccaagtgggctgctctgctgctgctgctactgctgctgctgaattggggccacactgaagaggcagggagctggggtgaagaccaagtctttgcaggagaagataagggaccccacccaccacagtatgcccacattccagacaggatccagactcctgggtccctctttcgtgttctgctccaggccatggacacacctagaaggagcccagccttcctgtttcagccccagaggtttggcagaagtgcatgggggtcctggagcaaggaacagctaaatccgcaggccagacagttctggagcctggccgctcctcagcgctttgggaagaagtagcatcgtcagctgtgatgcctgcatgcaaaaccacttccccatgttccctgtgtgcccccaaataaaaatggtccggctggcttcagaaaaaaa
'''
print dna_sequences

express='AFLFQPQRFGR' #test amino acid sequence
rexpress=express[::-1] #reverse of test amino acid sequence so no bother of a reverse complement
dnaexpress, ambigxp = aa2dna(express) # back translate the amino acid sequence into regular expression and ambiguous coding
print dnaexpress
print ambigxp
aaexpress = dna2aa(dnaexpress) # check that the back translation worked for the regular expression 
print express
print aaexpress
print ambig_rt
aaexpress = ambig2aa(ambigxp) # check that the back translation worked for the ambiguous coding
print express
print aaexpress
AllName, AllMatch=rt_re_find(dnaexpress, dna_sequences) # Use the regular expression coding to search on a list of sequences
print AllName
print AllMatch
all_seqs=get_dna_list(dnaexpress) # Find all possible DNA sequences based on the regular expression coding
print 'Number of combinations:', len(all_seqs)
# remove the triple quotes to print all possible DNA sequences
# Warning for the above sequence the number of combinations= 442368
'''
for iseq in all_seqs:
	paa=dna2aa(iseq)
	print iseq, paa, (paa==express) #check that it worked
'''

