[BioPython] Calculating synonymous sites

Fri Oct 17 12:01:56 EDT 2003

Sean:
> > Hello, I was hoping that someone may have run across an existing method
> > for calculating the number of synonymous and non-synonymous sites.

Jason:
> Do you really need to code it yourself if you are new to programming or
> can you rely on other programs?
> 
> HY-PHY and PAML have a ML calculation of Ka and Ks and can be scripted
> with some work.

If you want to go the route of using an existing program, I've
attached a script I wrote for one of my lab-mates which takes files
of duplicated protein pairs and duplicated nucleotide pairs, aligns
them using mrtrans, and then calculates and extracts Ks and Ka
values using PAML.

This has some case-specific stuff in it (like parsing the info I
need from ugly FASTA titles), but could be a useful starting point
nevertheless.

Brad
-------------- next part --------------
#!/usr/bin/env python
"""Calculate synonymous mutation rates for duplicated gene pairs.

This does the following:
    1. Fetches a duplicated protein pair.
    2. Aligns the protein pair with clustalw
    3. Convert the output to Fasta format.
    4. Use this alignment info to align gene sequences using mrtrans
    5. Convert mrtrans output to sequential input looking like phy
    6. Run yn00 to calculate synonymous mutation rates.

This requires that mrtrans:
    ftp://ftp.virginia.edu/pub/fasta/other/mrtrans.shar
and PAML (specificially yn00):
    http://abacus.gene.ucl.ac.uk/software/paml.html
be installed (to do the nucleotide alignments and synynmous calculations,
respectively).

Usage:
    dup_synonymous.py <dup protein file> <dup nucleotide file> <output file>

<dup protein file> is a file containing duplicated protein pairs two at a
time.

<dup nucleotide file> is a file containing duplicated nucleotide pairs
corresponding to the protein pairs in <dup protein file>. These will be
aligned based on alignments of the corresponding proteins.

<output file> is the name of a file to write the results to. The results are
output as a simple comma separated value file of pair name, synomymous rate,
non-synonymous rate.
"""
CLEAN_UP = 0

import sys
import os
from Bio import Application
from Bio import Fasta
from Bio import Clustalw
from Bio.Align.FormatConvert import FormatConverter

def main(protein_file, dna_file, output_file):
    output_h = open(output_file, "w")
    output_h.write("name,dS,dN\n")
    work_dir = os.path.join(os.getcwd(), "syn_analysis")
    if not(os.path.exists(work_dir)):
        os.makedirs(work_dir)
    prot_iterator = Fasta.Iterator(open(protein_file), Fasta.RecordParser())
    dna_iterator = Fasta.Iterator(open(dna_file), Fasta.RecordParser())
    while 1:
        p_rec_1 = prot_iterator.next()
        if p_rec_1 is None:
            break
        p_rec_2 = prot_iterator.next()
	if p_rec_2 is None:
            break
        n_rec_1 = dna_iterator.next()
	if n_rec_1 is None:
            break
        n_rec_2 = dna_iterator.next()
	if n_rec_2 is None:
            break

        print "--------", p_rec_1.title, p_rec_2.title
        align_fasta = clustal_align_protein(p_rec_1, p_rec_2, work_dir)
        mrtrans_fasta = run_mrtrans(align_fasta, n_rec_1, n_rec_2,
                                    work_dir)
        if mrtrans_fasta:
            fix_mrtrans = fix_file(mrtrans_fasta)
            ds_subs, dn_subs = find_synonymous(fix_mrtrans, work_dir)
            if ds_subs is not None:
                pair_name = "%s;%s" % (p_rec_1.title, p_rec_2.title)
                output_h.write("%s,%s,%s\n" % (pair_name, ds_subs, dn_subs))
                output_h.flush()

def find_synonymous(input_file, work_dir):
    """Run yn00 to find the synonymous subsitution rate for the alignment.
    """
    # create the .ctl file
    ctl_file = os.path.join(work_dir, "yn-input.ctl")
    output_file = os.path.join(work_dir, "nuc-subs.yn")
    ctl_h = open(ctl_file, "w")
    ctl_h.write("seqfile = %s\noutfile = %s\nverbose = 0\n" % 
                (input_file, output_file))
    ctl_h.write("icode = 0\nweighting = 0\ncommonf3x4 = 0\n")
    ctl_h.close()

    cl = YnCommandline(ctl_file)
    print "\tyn00:", cl
    result, r, e = Application.generic_run(cl)
    # errors = e.read()
    ds_value = None
    dn_value = None
    #while 1:
    #    line = r.readline()
    #    if not(line):
    #        break
    #    if line.find("kappa") == 0:
    #        line = line.strip()
    #        name, subs_string = line.split(" = ")
    #        sub_value = float(subs_string)
    output_h = open(output_file)
    for line in output_h.xreadlines():
        if line.find("+-") >= 0 and line.find("dS") == -1:
            parts = line.split(" +-")
            ds_value = extract_subs_value(parts[1])
            dn_value = extract_subs_value(parts[0])

    if ds_value is None or dn_value is None:
	h = open(output_file)
	print "yn00 didn't work: \n%s" % h.read()

    return ds_value, dn_value

def extract_subs_value(text):
    """Extract a subsitution value from a line of text.

    This is just a friendly function to grab a float value for Ks and Kn 
    values from the junk I get from the last line of the yn00 file.

    Line:
    2    1    52.7   193.3   2.0452  0.8979  0.0193 0.0573 +- 0.0177 
    2.9732 +- 3.2002

    Parts:
        ['   2    1    52.7   193.3   2.0452  0.8979  0.0193 0.0573', 
         ' 0.0177  2.9732', ' 3.2002\n']

    So we want 0.0573 for Kn and 2.9732 for Ks.
    """
    parts = text.split()
    value = float(parts[-1])

    return value

class YnCommandline:
    """Little commandline for yn00.
    """
    def __init__(self, ctl_file):
        self.ctl_file = ctl_file
        self.parameters = []

    def __str__(self):
        return "yn00 %s" % self.ctl_file

def run_mrtrans(align_fasta, rec_1, rec_2, work_dir):
    """Align two nucleotide sequences with mrtrans and the protein alignment.
    """
    try:
        align_file = os.path.join(work_dir, "prot-align.fasta")
        nuc_file = os.path.join(work_dir, "nuc.fasta")
        output_file = os.path.join(work_dir, "nuc-align.mrtrans")

        # make the protein alignment file
        align_h = open(align_file, "w")
        align_h.write(str(align_fasta))
        align_h.close()
        # make the nucleotide file
        nuc_h = open(nuc_file, "w")
        nuc_h.write(str(rec_1) + "\n")
        nuc_h.write(str(rec_2) + "\n")
        nuc_h.close()

        # run the program
        cl = MrTransCommandline(align_file, nuc_file, output_file)
        result, r, e = Application.generic_run(cl)
        errors = e.read()
        if errors.find("could not translate") >= 0:
	    print "***mrtrans could not translate"
            return None
        else:
            print "\tmrtrans:", cl
            return output_file
    finally:
	if CLEAN_UP:
            if os.path.exists(nuc_file):
                os.remove(nuc_file)
            if os.path.exists(align_file):
                os.remove(align_file)

class MrTransCommandline:
    """Simple commandline faker.
    """
    def __init__(self, prot_align_file, nuc_file, output_file):
        self.prot_align_file = prot_align_file
        self.nuc_file = nuc_file
        self.output_file = output_file

        self.parameters = []

    def __str__(self):
        return "mrtrans %s %s > %s" % (self.prot_align_file,
                                       self.nuc_file,
                                       self.output_file)

def clustal_align_protein(rec_1, rec_2, work_dir):
    """Align the two given proteins with clustalw.
    """
    try:
        fasta_file = os.path.join(work_dir, "prot-start.fasta")
        align_file = os.path.join(work_dir, "prot.aln")
        fasta_h = open(fasta_file, "w")
        fasta_h.write(str(rec_1) + "\n")
        fasta_h.write(str(rec_2) + "\n")
        fasta_h.close()
        clustal_cl = Clustalw.MultipleAlignCL(fasta_file)
        clustal_cl.set_output(align_file, #output_type = 'CLUSTAL',
                              output_order = 'INPUT')
        clustal_cl.set_type('PROTEIN')
        alignment = Clustalw.do_alignment(clustal_cl)
        print "\tDoing clustalw alignment: %s" % clustal_cl
        converter = FormatConverter(alignment)
        return converter.to_fasta()
    finally:
	if CLEAN_UP:
            if os.path.exists(align_file):
                os.remove(align_file)
            if os.path.exists(fasta_file):
                os.remove(fasta_file)

def process_john_title(title):
    """Process the crappily encoded fasta titles into info we need from them.
    """
    # A01N001a&At1g01010
    john_part, real_part = title.split("&")
    seg_name, dup_name = john_part.split("N")
    dup_num = dup_name[:-1] # get rid of the letter

    return seg_name, dup_num, real_part

def fix_file(filename):
    new_file = filename + ".fix"
    input_h = open(filename)
    output_h = open(new_file, "w")

    it = Fasta.Iterator(input_h, Fasta.RecordParser())
    fixed_rec_1 = fix_record(it.next())
    fixed_rec_2 = fix_record(it.next())
    assert len(fixed_rec_1.sequence) == len(fixed_rec_2.sequence)
    output_h.write("\t2 %s\n" % (len(fixed_rec_1.sequence)))
    output_h.write(str(fixed_rec_1) + "\n")
    output_h.write(str(fixed_rec_2) + "\n")

    return new_file

def fix_record(rec):
    rec.sequence = rec.sequence.strip()
    # get rid of all '.'s in the sequence -- we want '-'s
    rec.sequence = rec.sequence.replace(".", "-")
    assert len(rec.sequence) % 3.0 == 0
    return rec

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print "Incorrect arguments"
        print __doc__
        sys.exit()
    main(sys.argv[1], sys.argv[2], sys.argv[3])