[BioPython] Calculating synonymous sites
Brad Chapman
chapmanb at uga.edu
Fri Oct 17 12:01:56 EDT 2003
Sean:
> > Hello, I was hoping that someone may have run across an existing method
> > for calculating the number of synonymous and non-synonymous sites.
Jason:
> Do you really need to code it yourself if you are new to programming or
> can you rely on other programs?
>
> HY-PHY and PAML have a ML calculation of Ka and Ks and can be scripted
> with some work.
If you want to go the route of using an existing program, I've
attached a script I wrote for one of my lab-mates which takes files
of duplicated protein pairs and duplicated nucleotide pairs, aligns
them using mrtrans, and then calculates and extracts Ks and Ka
values using PAML.
This has some case-specific stuff in it (like parsing the info I
need from ugly FASTA titles), but could be a useful starting point
nevertheless.
Brad
-------------- next part --------------
#!/usr/bin/env python
"""Calculate synonymous mutation rates for duplicated gene pairs.
This does the following:
1. Fetches a duplicated protein pair.
2. Aligns the protein pair with clustalw
3. Convert the output to Fasta format.
4. Use this alignment info to align gene sequences using mrtrans
5. Convert mrtrans output to sequential input looking like phy
6. Run yn00 to calculate synonymous mutation rates.
This requires that mrtrans:
ftp://ftp.virginia.edu/pub/fasta/other/mrtrans.shar
and PAML (specificially yn00):
http://abacus.gene.ucl.ac.uk/software/paml.html
be installed (to do the nucleotide alignments and synynmous calculations,
respectively).
Usage:
dup_synonymous.py <dup protein file> <dup nucleotide file> <output file>
<dup protein file> is a file containing duplicated protein pairs two at a
time.
<dup nucleotide file> is a file containing duplicated nucleotide pairs
corresponding to the protein pairs in <dup protein file>. These will be
aligned based on alignments of the corresponding proteins.
<output file> is the name of a file to write the results to. The results are
output as a simple comma separated value file of pair name, synomymous rate,
non-synonymous rate.
"""
CLEAN_UP = 0
import sys
import os
from Bio import Application
from Bio import Fasta
from Bio import Clustalw
from Bio.Align.FormatConvert import FormatConverter
def main(protein_file, dna_file, output_file):
output_h = open(output_file, "w")
output_h.write("name,dS,dN\n")
work_dir = os.path.join(os.getcwd(), "syn_analysis")
if not(os.path.exists(work_dir)):
os.makedirs(work_dir)
prot_iterator = Fasta.Iterator(open(protein_file), Fasta.RecordParser())
dna_iterator = Fasta.Iterator(open(dna_file), Fasta.RecordParser())
while 1:
p_rec_1 = prot_iterator.next()
if p_rec_1 is None:
break
p_rec_2 = prot_iterator.next()
if p_rec_2 is None:
break
n_rec_1 = dna_iterator.next()
if n_rec_1 is None:
break
n_rec_2 = dna_iterator.next()
if n_rec_2 is None:
break
print "--------", p_rec_1.title, p_rec_2.title
align_fasta = clustal_align_protein(p_rec_1, p_rec_2, work_dir)
mrtrans_fasta = run_mrtrans(align_fasta, n_rec_1, n_rec_2,
work_dir)
if mrtrans_fasta:
fix_mrtrans = fix_file(mrtrans_fasta)
ds_subs, dn_subs = find_synonymous(fix_mrtrans, work_dir)
if ds_subs is not None:
pair_name = "%s;%s" % (p_rec_1.title, p_rec_2.title)
output_h.write("%s,%s,%s\n" % (pair_name, ds_subs, dn_subs))
output_h.flush()
def find_synonymous(input_file, work_dir):
"""Run yn00 to find the synonymous subsitution rate for the alignment.
"""
# create the .ctl file
ctl_file = os.path.join(work_dir, "yn-input.ctl")
output_file = os.path.join(work_dir, "nuc-subs.yn")
ctl_h = open(ctl_file, "w")
ctl_h.write("seqfile = %s\noutfile = %s\nverbose = 0\n" %
(input_file, output_file))
ctl_h.write("icode = 0\nweighting = 0\ncommonf3x4 = 0\n")
ctl_h.close()
cl = YnCommandline(ctl_file)
print "\tyn00:", cl
result, r, e = Application.generic_run(cl)
# errors = e.read()
ds_value = None
dn_value = None
#while 1:
# line = r.readline()
# if not(line):
# break
# if line.find("kappa") == 0:
# line = line.strip()
# name, subs_string = line.split(" = ")
# sub_value = float(subs_string)
output_h = open(output_file)
for line in output_h.xreadlines():
if line.find("+-") >= 0 and line.find("dS") == -1:
parts = line.split(" +-")
ds_value = extract_subs_value(parts[1])
dn_value = extract_subs_value(parts[0])
if ds_value is None or dn_value is None:
h = open(output_file)
print "yn00 didn't work: \n%s" % h.read()
return ds_value, dn_value
def extract_subs_value(text):
"""Extract a subsitution value from a line of text.
This is just a friendly function to grab a float value for Ks and Kn
values from the junk I get from the last line of the yn00 file.
Line:
2 1 52.7 193.3 2.0452 0.8979 0.0193 0.0573 +- 0.0177
2.9732 +- 3.2002
Parts:
[' 2 1 52.7 193.3 2.0452 0.8979 0.0193 0.0573',
' 0.0177 2.9732', ' 3.2002\n']
So we want 0.0573 for Kn and 2.9732 for Ks.
"""
parts = text.split()
value = float(parts[-1])
return value
class YnCommandline:
"""Little commandline for yn00.
"""
def __init__(self, ctl_file):
self.ctl_file = ctl_file
self.parameters = []
def __str__(self):
return "yn00 %s" % self.ctl_file
def run_mrtrans(align_fasta, rec_1, rec_2, work_dir):
"""Align two nucleotide sequences with mrtrans and the protein alignment.
"""
try:
align_file = os.path.join(work_dir, "prot-align.fasta")
nuc_file = os.path.join(work_dir, "nuc.fasta")
output_file = os.path.join(work_dir, "nuc-align.mrtrans")
# make the protein alignment file
align_h = open(align_file, "w")
align_h.write(str(align_fasta))
align_h.close()
# make the nucleotide file
nuc_h = open(nuc_file, "w")
nuc_h.write(str(rec_1) + "\n")
nuc_h.write(str(rec_2) + "\n")
nuc_h.close()
# run the program
cl = MrTransCommandline(align_file, nuc_file, output_file)
result, r, e = Application.generic_run(cl)
errors = e.read()
if errors.find("could not translate") >= 0:
print "***mrtrans could not translate"
return None
else:
print "\tmrtrans:", cl
return output_file
finally:
if CLEAN_UP:
if os.path.exists(nuc_file):
os.remove(nuc_file)
if os.path.exists(align_file):
os.remove(align_file)
class MrTransCommandline:
"""Simple commandline faker.
"""
def __init__(self, prot_align_file, nuc_file, output_file):
self.prot_align_file = prot_align_file
self.nuc_file = nuc_file
self.output_file = output_file
self.parameters = []
def __str__(self):
return "mrtrans %s %s > %s" % (self.prot_align_file,
self.nuc_file,
self.output_file)
def clustal_align_protein(rec_1, rec_2, work_dir):
"""Align the two given proteins with clustalw.
"""
try:
fasta_file = os.path.join(work_dir, "prot-start.fasta")
align_file = os.path.join(work_dir, "prot.aln")
fasta_h = open(fasta_file, "w")
fasta_h.write(str(rec_1) + "\n")
fasta_h.write(str(rec_2) + "\n")
fasta_h.close()
clustal_cl = Clustalw.MultipleAlignCL(fasta_file)
clustal_cl.set_output(align_file, #output_type = 'CLUSTAL',
output_order = 'INPUT')
clustal_cl.set_type('PROTEIN')
alignment = Clustalw.do_alignment(clustal_cl)
print "\tDoing clustalw alignment: %s" % clustal_cl
converter = FormatConverter(alignment)
return converter.to_fasta()
finally:
if CLEAN_UP:
if os.path.exists(align_file):
os.remove(align_file)
if os.path.exists(fasta_file):
os.remove(fasta_file)
def process_john_title(title):
"""Process the crappily encoded fasta titles into info we need from them.
"""
# A01N001a&At1g01010
john_part, real_part = title.split("&")
seg_name, dup_name = john_part.split("N")
dup_num = dup_name[:-1] # get rid of the letter
return seg_name, dup_num, real_part
def fix_file(filename):
new_file = filename + ".fix"
input_h = open(filename)
output_h = open(new_file, "w")
it = Fasta.Iterator(input_h, Fasta.RecordParser())
fixed_rec_1 = fix_record(it.next())
fixed_rec_2 = fix_record(it.next())
assert len(fixed_rec_1.sequence) == len(fixed_rec_2.sequence)
output_h.write("\t2 %s\n" % (len(fixed_rec_1.sequence)))
output_h.write(str(fixed_rec_1) + "\n")
output_h.write(str(fixed_rec_2) + "\n")
return new_file
def fix_record(rec):
rec.sequence = rec.sequence.strip()
# get rid of all '.'s in the sequence -- we want '-'s
rec.sequence = rec.sequence.replace(".", "-")
assert len(rec.sequence) % 3.0 == 0
return rec
if __name__ == "__main__":
if len(sys.argv) != 4:
print "Incorrect arguments"
print __doc__
sys.exit()
main(sys.argv[1], sys.argv[2], sys.argv[3])
More information about the BioPython
mailing list