#!/usr/bin/perl -w

use strict;
use warnings;

use Bio::Perl;
use Bio::Seq;
use Bio::SeqIO;
use Getopt::Long;

# Command-line options, filled in by GetOptions:
#   output_alignment (-o)  path of the fasta file to write (required)
#   join_string      (-j)  optional text inserted between concatenated pieces
my $options_ref = {};

GetOptions(
    $options_ref,
    'output_alignment|o=s',
    'join_string|j=s',
);

# Whatever is left in @ARGV after option parsing is treated as an input
# alignment file.
my @input_alignment_filenames = @ARGV;

# Both an output file and at least one input alignment are required;
# otherwise show the usage text and stop.
if ( !$options_ref->{output_alignment} || !@input_alignment_filenames ) {
    print "concatenate_alignments.pl -o <output_alignment> <input_alignment_1> <... input_alignment_n>\n";
    print "Options: -j <join_string>\n";
    exit;
}

# Accumulated results shared by the subroutines below:
#   seqs      => { sequence id => concatenated sequence string }
#   seq_names => [ sequence ids, in order of first appearance ]
my $results_ref = {
    seqs      => {},
    seq_names => [],
};

get_seqs( \@input_alignment_filenames );
print_alignment();

exit;

#
# Subroutines
#

# get_seqs( \@input_alignment_filenames )
#
# Reads every fasta file named in the array reference, in the order given,
# and accumulates the sequences in $results_ref.  The first time an id is
# seen it is appended to seq_names and its sequence is stored; every later
# occurrence of the same id is appended to the stored sequence, preceded by
# the -j join string when one was supplied.
sub get_seqs {
    my ( $input_alignment_filenames_ref ) = @_;

    # Test with defined() so that a join string which is false in boolean
    # context (for example '0') is still inserted.
    my $join_string = defined $options_ref->{join_string}
        ? $options_ref->{join_string}
        : '';

    foreach my $input_alignment ( @{ $input_alignment_filenames_ref } ) {
        my $in_seqio = Bio::SeqIO->new(
            -file   => $input_alignment,
            -format => 'fasta',
        );

        while ( my $seqobj = $in_seqio->next_seq ) {
            my $seq_id = $seqobj->id();
            my $seq    = $seqobj->seq();

            # Treat a missing sequence as empty so concatenation below
            # never operates on an undefined value.
            $seq = '' unless defined $seq;

            # Use exists() rather than truthiness: an id whose stored
            # sequence is empty (or '0') has still been seen, and must not
            # be registered in seq_names a second time.
            if ( exists $results_ref->{seqs}->{$seq_id} ) {
                $results_ref->{seqs}->{$seq_id} .= $join_string . $seq;
            }
            else {
                push @{ $results_ref->{seq_names} }, $seq_id;
                $results_ref->{seqs}->{$seq_id} = $seq;
            }
        }
    }

    return;
}

# print_alignment()
#
# Writes one fasta record per id in $results_ref->{seq_names}, in that
# order, to the file named by the -o option.
sub print_alignment {
    my $output_alignment_filename = $options_ref->{output_alignment};

    # The leading '>' in the -file value requests write mode.
    my $out_seqio = Bio::SeqIO->new(
        -file   => ">$output_alignment_filename",
        -format => 'fasta',
    );

    foreach my $seq_id ( @{ $results_ref->{seq_names} } ) {
        my $out_seq_obj = Bio::Seq->new(
            -id  => $seq_id,
            -seq => $results_ref->{seqs}->{$seq_id},
        );
        $out_seqio->write_seq( $out_seq_obj );
    }

    return;
}

=pod

=head1 NAME

concatenate_alignments.pl - concatenate sequences in separate alignment fasta files

=head1 SYNOPSIS

  # Concatenate with no sequence inserted between alignments
  concatenate_alignments.pl -o <output_alignment> <input_alignment_1> <... input_alignment_n>

  # Concatenate with --- inserted between alignments
  concatenate_alignments.pl -o <output_alignment> -j '---' <input_alignment_1> <... input_alignment_n>

=head1 DESCRIPTION

Concatenates sequences from alignments in multiple fasta files.
The sequences must have the same sequence id in each alignment file;
otherwise, they'll be assigned to a new sequence. Sequences are output in
the same order as they appear in the first alignment file; ids that first
appear in a later file follow, in order of first appearance. Any number of
alignments can be included, providing the above conditions are met.
However, all sequences are loaded into memory, so long sequences, or large
numbers of sequences, may result in memory issues.

=head1 OPTIONS

  -o <output_alignment>  Fasta file to write the concatenated alignment to (required)

  -j <join_string>       String to insert between concatenated sequences, e.g.: -j '---'

=head1 CONTRIBUTORS

Greg Baillie

=cut