#!/usr/bin/perl -w # protal2dna # version 2.0 # Katja Schuerer, schuerer@pasteur.fr (v 2.0) # Catherine Letondal, letondal@pasteur.fr (v 1.0) use strict; use Getopt::Std; use Bio::Seq; use Bio::SeqIO; use Bio::AlignIO; use Bio::SimpleAlign; use Bio::Tools::CodonTable; ### global variables and default values ### my $verbose = 0; # debug mode on/off my $order = 1; # identify protein and DNA sequences by order (=0 by ids) my $prot_file; # protein alignment file name my $dna_file; # DNA sequences file my $in_prot_aln; # input port to $prot_file my $in_dna_seqs; # input port to $dna_file my $prot_file_format = 'clustalw'; # $prot_file format my $dna_file_format = 'fasta'; # $dna_file format my $output_format = 'clustalw'; # format of the output alignment my @align_format_ok = ('fasta', 'pfam', 'clustalw', 'msf'); # accepted alignment formats my @seq_format_ok = ('fasta', 'embl', 'genbank', 'swiss', 'pir', 'gcg'); # accepted DNA sequence formats my $code = 1; # default code to use to translate DNA to protein sequences my $code_file; # file indicating code for sequences with different genetic codes # than default $code (sequences are identified by the identifier # use in the alignment my %code_special = (); # hash with special codes read from $code_file my $gap_char = '[\.\-]'; # accepted characters for gaps ### end variable declarations ### &parse_opt(); ### parse command line options if ($#ARGV < 0) { print STDERR "DNA sequence file missing\n"; &usage; exit 1; } if ($#ARGV == 0) { $dna_file = shift @ARGV; } else { $prot_file = shift @ARGV; $dna_file = shift @ARGV; } &get_codes if defined $code_file; ### initialisation genetic codes ### open input ports ### $in_dna_seqs = Bio::SeqIO->newFh (-file => $dna_file, -format => $dna_file_format); if ($prot_file) { $in_prot_aln = Bio::AlignIO->newFh (-file => $prot_file, -format => $prot_file_format); } else { $in_prot_aln = Bio::AlignIO->newFh (-fh => \*STDIN, -format => $prot_file_format); } ### handle alignment ### my $aln = <$in_prot_aln>; if (! (defined $aln)) { print STDERR "no alignment available in '", $prot_file_format, "' format from file '", $prot_file, "'\n"; exit 1; } if ($order) { &do_by_order(); } else { &do_by_ids(); } ### print DNA alignment ### my $out = Bio::AlignIO->newFh ( -fh => \*STDOUT, -format => $output_format); print $out $aln; ########################################################################### # main subfunction # ########################################################################### ### do work by identify protein and DNA sequences by order in files ### sub do_by_order { foreach my $pseq ($aln->eachSeq()) { my $dnaseq = <$in_dna_seqs>; if (! (defined $dnaseq)) { print STDERR "no DNA sequence available in '", $dna_file, "' for protein with id '", $pseq->id(), "'\n"; exit 1; } my $use_code = $code; if (exists $code_special{$pseq->id()}) { $use_code = $code_special{$pseq->id()}; } &handle_seq($pseq, $dnaseq, $use_code); } } ### do work by identify protein and DNA sequences by ID ### sub do_by_ids { my %dna_seqs = (); $dna_seqs{$_->id()} = $_ while (<$in_dna_seqs>); foreach my $pseq ($aln->eachSeq()) { my $dnaseq = $dna_seqs{$pseq->id()}; if (! (defined $dnaseq)) { print STDERR "no DNA sequence available in '", $dna_file, "' for protein with id '", $pseq->id(), "'\n"; exit 1; } my $use_code = $code; if (exists $code_special{$pseq->id()}) { $use_code = $code_special{$pseq->id()}; } &handle_seq($pseq, $dnaseq, $use_code); } } ### do work for one protein and corresponding DNA sequence ### sub handle_seq { my $pseq = shift @_; # protein sequence object my $dnaseq = shift @_; # corresponding dna sequence object my $code = shift @_; # number of genetic code to use for translation # search coding sequence in all 6 frames my $pseq_nogap = $pseq->seq(); $pseq_nogap =~ s/$gap_char//g; my $cds = &search_cds($dnaseq, $pseq_nogap, $code); if ($cds eq "") { print STDERR "no coding region found for ", $pseq->id(), " in ", $dnaseq->id(), "\n"; exit 1; } # replace protein sequence by gaped DNA sequence in the alignment $pseq->seq(&realign($pseq->seq(), $cds)); } ########################################################################### # subfunction to do true work # ########################################################################### ### search cds for protein $prot in DNA sequence $dna ### ### in all 6 frames with code $code ### ### return : extracted cds coding for $prot ### sub search_cds { my $dna = shift @_; # DNA sequence object my $prot = shift @_; # protein sequence as string to search for (without gaps) my $code = shift @_; # number of genetic code to use for translation # search on + strand my $offset = &search_on_strand($dna, $prot, $code); # search on - strand if not found on + strand if ($offset < 0) { $dna = $dna->revcom(); $offset = &search_on_strand($dna, $prot, $code); } if ($offset < 0) { return ""; } my $end = $offset+length($prot)*3-1; # hack if the last base of the last codon is missing # but the codon is yet determined my $add = ""; if ($end = $dna->length() - 1) { $end = $dna->length(); $add = "-"; $dna->warn($dna->id() . ": last codon incomplete -- filled with gaps"); } return $dna->subseq($offset, $end) . $add; } ### search cds for protein $prot in DNA sequence $dna on + strand ### ### return : start position in DNA sequence (starting by 1) sub search_on_strand { my $dna = shift @_; # DNA sequence object my $prot = shift @_; # protein sequence as string to search for (without gaps) my $code = shift @_; # number of genetic code to use for translation my $frame; my $offset = -1; my $cds_found = 0; print "prot: ", $prot, "\n" if $verbose; for ($frame = 0; (! $cds_found) && $frame < 3; $frame++) { # translate DNA sequence in frame my $trans = $dna->translate(undef, undef, $frame, $code); # search protein sequence in this frame print "trans frame $frame: ", $trans->seq(), "\n", "prot :", $prot, "\n" if $verbose; $offset = index($trans->seq(), $prot); $cds_found = 1 if $offset >= 0; } $frame--; $offset = $offset*3+$frame+1 if $offset >= 0; return $offset; } ### align cds $cds according to his translation $prot ### ### return : gaped cds ### sub realign { my $prot = shift @_; # protein sequence containing gaps as string my $cds = shift @_; # cds sequence without gap as string my $cds_gap = ""; my $i = 0; foreach my $aa (split "",$prot) { if ($aa =~ $gap_char) { $cds_gap .= $aa . $aa . $aa; # if different gapchars are used (Pfam) } else { $cds_gap .= substr($cds, $i, 3); $i += 3; } } return $cds_gap; } ########################################################################### # input/output function + command line parsing # ########################################################################### ### usage ### sub usage { my $program = `basename $0`; chop($program); print " $program tries to align the DNA sequences corresponding to the aligned protein (in the appropriate frame). $program [ -i ] [ -f output_format ] [ -p prot_align_format] [ -n dna_seq_format ] [ -G default_genetic_code ] [ -g genetic_code_file ] prot_align_file dna_seqs_file -v verbose -h this message -p f protein alignment file format ('fasta', 'pfam', 'clustalw', 'msf') default = 'clustalw' -n f format of dna sequences file corresponding to the aligned protein sequences ('fasta', 'embl', 'genbank', 'swiss', 'pir', 'gcg') default = 'fasta' -i same sequences identified by their ids default = same sequences identified by order in the file -f f output format of the DNA alignment (same as -p f) default = 'clustalw' -g n genetic code table 1 : Standard 2 : Vertebrate Mitochondrial 3 : Yeast Mitochondrial 4 : Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma 5 : Invertebrate Mitochondrial 6 : Ciliate Macronuclear and Dasycladacean 9 : Echinoderm Mitochondrial 10 : Euplotid Nuclear 11 : Bacterial 12 : Alternative Yeast Nuclear 13 : Ascidian Mitochondrial 14 : Flatworm Mitochondrial 15 : Blepharisma Macronuclear default = 1 -G f file containing genetic code numbers of sequences that differs from the default code use ( see -g n ) file format: one sequence per line with prot-id used in the alignemnt followed by code number to use "; } ### function to parse command line options ### sub parse_opt { my %opts = (); getopts('p:n:f:g:G:ichv', \%opts); if ($opts{'h'}) { &usage; exit(0); } if ($opts{'p'}) { $prot_file_format = $opts{'p'}; if (!(grep(/^$prot_file_format$/, @align_format_ok))) { print STDERR "unknown format '", $prot_file_format, "' for protein alignment file\naccepted formats: '", join("', '", @align_format_ok), "'\n"; exit 1; } } if ($opts{'n'}) { $dna_file_format = $opts{'n'}; if (!(grep(/^$dna_file_format$/, @seq_format_ok))) { print STDERR "unknown format '", $dna_file_format, "' for DNA sequence file\naccepted formats: '", join("', '", @seq_format_ok), "'\n"; exit 1; } } if ($opts{'f'}) { $output_format = $opts{'f'}; if (!(grep(/^$output_format$/, @align_format_ok))) { print STDERR "unknown format '", $output_format, "' for protein alignment file\naccepted formates: '", join("', '", @align_format_ok), "'\n"; exit 1; } } if ($opts{'g'}) { $code = $opts{'g'}; if ($code !~ /\d+/) { print STDERR "integer value as genetic code expected -- '", $code, "' provided\n"; exit 1; } if ($code < 1 || $code > 15) { print STDERR "invalid number of genetic code -- accepted 1 .. 15\n"; &usage, exit 1; } } if ($opts{'G'}) { $code_file = $opts{'G'}; } if ($opts{'i'}) { $order = 0; } if ($opts{'v'}) { $verbose = 1; } } ### function to set genetic codes ### sub get_codes { open(CODE, $code_file) or die $code_file, " -- $!\n"; while () { my @fields = split(/\s+/, $_); $code_special{$fields[0]} = $fields[1]; } close CODE; }