#!/usr/bin/perl -w


# protal2dna
# version 2.0
 
# Katja Schuerer, schuerer@pasteur.fr (v 2.0)
# Catherine Letondal, letondal@pasteur.fr (v 1.0)

use strict;

use Getopt::Std;

use Bio::Seq;
use Bio::SeqIO;
use Bio::AlignIO;
use Bio::SimpleAlign;
use Bio::Tools::CodonTable;

### global variables and default values ###

my $verbose = 0; # debug mode on/off
my $order = 1;   # identify protein and DNA sequences by order (=0 by ids) 

my $prot_file;   # protein alignment file name
my $dna_file;    # DNA sequences file
my $in_prot_aln; # input port to $prot_file
my $in_dna_seqs; # input port to $dna_file

my $prot_file_format = 'clustalw'; # $prot_file format
my $dna_file_format = 'fasta';     # $dna_file format
my $output_format = 'clustalw';    # format of the output alignment

my @align_format_ok = ('fasta', 'pfam', 'clustalw', 'msf'); # accepted alignment formats
my @seq_format_ok = ('fasta', 'embl', 'genbank', 'swiss', 'pir', 'gcg'); # accepted DNA sequence formats

my $code = 1;      # default code to use to translate DNA to protein sequences
my $code_file;     # file indicating code for sequences with different genetic codes
                   # than default $code (sequences are identified by the identifier
                   # use in the alignment
my %code_special = (); # hash with special codes read from $code_file

my $gap_char = '[\.\-]'; # accepted characters for gaps

### end variable declarations ###

&parse_opt(); ### parse command line options

if ($#ARGV < 0) { 
    print STDERR "DNA sequence file missing\n";
    &usage; exit 1;
}

if ($#ARGV == 0) { $dna_file = shift @ARGV; }
else { 
    $prot_file = shift @ARGV;
    $dna_file  = shift @ARGV;
}

&get_codes if defined $code_file; ### initialisation genetic codes

### open input ports ###

$in_dna_seqs = Bio::SeqIO->newFh (-file => $dna_file, 
				  -format => $dna_file_format);

if ($prot_file) {
    $in_prot_aln = Bio::AlignIO->newFh (-file => $prot_file, 
					-format => $prot_file_format);
}
else {
    $in_prot_aln = Bio::AlignIO->newFh (-fh => \*STDIN,
					-format => $prot_file_format);
}

### handle alignment ### 

my $aln = <$in_prot_aln>; 
if (! (defined $aln)) {
    print STDERR "no alignment available in '", $prot_file_format, 
                 "' format from file '", $prot_file, "'\n";
    exit 1;
}

if ($order) { &do_by_order(); }
else { &do_by_ids(); }

### print DNA alignment  ###

my $out = Bio::AlignIO->newFh ( -fh => \*STDOUT, -format => $output_format);
print $out $aln;


###########################################################################
# main subfunction                                                        #
###########################################################################

### do work by identify protein and DNA sequences by order in files ###
sub do_by_order {

    foreach my $pseq ($aln->eachSeq()) {
	my $dnaseq = <$in_dna_seqs>;
	if (! (defined $dnaseq)) {
	    print STDERR "no DNA sequence available in '", $dna_file, 
	    "' for protein with id '", $pseq->id(), "'\n";
	    exit 1;
	}
	my $use_code = $code;
	if (exists $code_special{$pseq->id()}) {
	    $use_code = $code_special{$pseq->id()};
	}

	&handle_seq($pseq, $dnaseq, $use_code);
    }
    
}

### do work by identify protein and DNA sequences by ID ###
sub do_by_ids {

    my %dna_seqs = ();
    $dna_seqs{$_->id()} = $_ while (<$in_dna_seqs>);

    foreach my $pseq ($aln->eachSeq()) {
	 my $dnaseq = $dna_seqs{$pseq->id()};

    if (! (defined $dnaseq)) {
	     print STDERR "no DNA sequence available in '", $dna_file, 
	     "' for protein with id '", $pseq->id(), "'\n";
	     exit 1;
	 }
	 my $use_code = $code;
	 if (exists $code_special{$pseq->id()}) {
	     $use_code = $code_special{$pseq->id()};
	 }

	 &handle_seq($pseq, $dnaseq, $use_code);
     }
    
}

### do work for one protein and corresponding DNA sequence ###
sub handle_seq {
    
    my $pseq = shift @_;   # protein sequence object
    my $dnaseq = shift @_; # corresponding dna sequence object
    my $code = shift @_; # number of genetic code to use for translation
    

    # search coding sequence in all 6 frames
    my $pseq_nogap = $pseq->seq();
    $pseq_nogap =~ s/$gap_char//g;
    
    my $cds = &search_cds($dnaseq, $pseq_nogap, $code);
    if ($cds eq "") {
	print STDERR "no coding region found for ", $pseq->id(), 
	" in ", $dnaseq->id(), "\n";
	exit 1;
    }

    # replace protein sequence by gaped DNA sequence in the alignment
    $pseq->seq(&realign($pseq->seq(), $cds));

}

###########################################################################
# subfunction to do true work                                             #
###########################################################################

### search cds for protein $prot in DNA sequence $dna ###
### in all 6 frames with code $code ###
### return : extracted cds coding for $prot ###
sub search_cds {
    
    my $dna  = shift @_; # DNA sequence object
    my $prot = shift @_; # protein sequence as string to search for (without gaps)
    my $code = shift @_; # number of genetic code to use for translation
    
    # search on + strand
    my $offset = &search_on_strand($dna, $prot, $code);
    
    # search on - strand if not found on + strand
    if ($offset < 0) {
	$dna = $dna->revcom(); 
	$offset = &search_on_strand($dna, $prot, $code);
    }
    
    if ($offset < 0) { return ""; }

    my $end = $offset+length($prot)*3-1;

    # hack if the last base of the last codon is missing
    # but the codon is yet determined
    my $add = "";
    if ($end =  $dna->length() - 1) {
	$end = $dna->length(); 
	$add = "-";
	$dna->warn($dna->id() . ": last codon incomplete -- filled with gaps");
    }

    return $dna->subseq($offset, $end) . $add;
}

### search cds for protein $prot in DNA sequence $dna on + strand ###
### return : start position in DNA sequence (starting by 1)
sub search_on_strand {

    my $dna  = shift @_; # DNA sequence object
    my $prot = shift @_; # protein sequence as string to search for (without gaps)
    my $code = shift @_; # number of genetic code to use for translation

    my $frame; 
    my $offset = -1;
    my $cds_found = 0;
    
    print "prot: ", $prot, "\n" if $verbose;

    for ($frame = 0; (! $cds_found) && $frame < 3; $frame++) {
	
	# translate DNA sequence in frame 
	my $trans = $dna->translate(undef, undef, $frame, $code);
	# search protein sequence in this frame
	print "trans frame $frame: ", $trans->seq(), "\n", "prot :", $prot, "\n" if $verbose;
	$offset = index($trans->seq(), $prot);

	$cds_found = 1 if $offset >= 0;
    }
    $frame--;

    $offset = $offset*3+$frame+1 if $offset >= 0;
    return $offset;
}

### align cds $cds according to his translation $prot ###
### return : gaped cds ###
sub realign {

    my $prot = shift @_; # protein sequence containing gaps as string
    my $cds  = shift @_; # cds sequence without gap as string

    my $cds_gap = "";
    my $i = 0;
    foreach my $aa (split "",$prot) {
	if ($aa =~ $gap_char) {
	    $cds_gap .= $aa . $aa . $aa; # if different gapchars are used (Pfam)
	}
	else {
	    $cds_gap .= substr($cds, $i, 3);
	    $i += 3;
	}
    }

    return $cds_gap;
}

###########################################################################
#  input/output function + command line parsing                           #
###########################################################################

### usage ###
sub usage {

  my $program = `basename $0`;
  chop($program);

  print "

    $program tries to align the DNA sequences corresponding to the aligned 
    protein (in the appropriate frame).

    $program [ -i ] 
             [ -f output_format ] [ -p prot_align_format] [ -n dna_seq_format ] 
             [ -G default_genetic_code ] [ -g genetic_code_file ] 
             prot_align_file dna_seqs_file

    -v     verbose

    -h     this message

    -p f   protein alignment file format ('fasta', 'pfam', 'clustalw', 'msf')
           default = 'clustalw'

    -n f   format of dna sequences file corresponding to the aligned protein 
           sequences ('fasta', 'embl', 'genbank', 'swiss', 'pir', 'gcg')
           default = 'fasta'
           
    -i     same sequences identified by their ids 
           default =  same sequences identified by order in the file

    -f f   output format of the DNA alignment (same as -p f)
           default = 'clustalw'

    -g n   genetic code table

            1 : Standard
            2 : Vertebrate Mitochondrial
            3 : Yeast Mitochondrial
            4 : Mold, Protozoan, Coelenterate Mitochondrial and 
                Mycoplasma/Spiroplasma
            5 : Invertebrate Mitochondrial
            6 : Ciliate Macronuclear and Dasycladacean
            9 : Echinoderm Mitochondrial
           10 : Euplotid Nuclear
           11 : Bacterial
           12 : Alternative Yeast Nuclear
           13 : Ascidian Mitochondrial
           14 : Flatworm Mitochondrial
           15 : Blepharisma Macronuclear
         
         default = 1

    -G f file containing genetic code numbers of sequences that differs
         from the default code use ( see -g n )
         file format: one sequence per line with prot-id used in the alignemnt
                      followed by code number to use
";
}

###  function to parse command line options ### 
sub parse_opt {

    my %opts = ();

    getopts('p:n:f:g:G:ichv', \%opts);
    
    if ($opts{'h'}) { &usage; exit(0); }

    if ($opts{'p'}) { 
	$prot_file_format = $opts{'p'}; 
	if (!(grep(/^$prot_file_format$/, @align_format_ok))) {
	    print STDERR "unknown format '", $prot_file_format, 
	                 "' for protein alignment file\naccepted formats: '",
	                 join("', '", @align_format_ok), "'\n";
	    exit 1;
	}
    }
    if ($opts{'n'}) { 
	$dna_file_format = $opts{'n'}; 
	if (!(grep(/^$dna_file_format$/, @seq_format_ok))) {
	    print STDERR "unknown format '", $dna_file_format, 
	                 "' for DNA sequence file\naccepted formats: '",
	                 join("', '", @seq_format_ok), "'\n";
	    exit 1;
	}
    }
    if ($opts{'f'}) { 
	$output_format = $opts{'f'}; 
	if (!(grep(/^$output_format$/, @align_format_ok))) {
	    print STDERR "unknown format '", $output_format, 
	                 "' for protein alignment file\naccepted formates: '",
	                 join("', '", @align_format_ok), "'\n";
	    exit 1;
	}
    } 

    if ($opts{'g'}) { 
	$code = $opts{'g'}; 
	if ($code !~ /\d+/) {
	    print STDERR "integer value as genetic code expected -- '", 
                          $code, "' provided\n";
	    exit 1;
	}
	if ($code < 1 || $code > 15) {
	    print STDERR "invalid number of genetic code -- accepted 1 .. 15\n";
	    &usage, exit 1;
	}
    }
    if ($opts{'G'}) { $code_file = $opts{'G'}; }

    if ($opts{'i'}) { $order = 0; } 

    if ($opts{'v'}) { $verbose = 1; } 

}

### function to set genetic codes ###
sub get_codes {

    open(CODE, $code_file) or die $code_file, " -- $!\n";
    while (<CODE>) {
	my @fields = split(/\s+/, $_);
	$code_special{$fields[0]} = $fields[1];
    }
    close CODE;
}