[Bioperl-l] retrieving 19 vertebrate PECAN alignment using Compara (modified PracticeCompara.pl)

charlotte payne charlotte.payne at gmail.com
Mon Jan 16 11:36:51 UTC 2012


Hello, I'm quite new to BioPerl and new to this list, and would really
appreciate some help as I've been stuck on this for some time now! Thanks
so much, here's the problem..

I'm trying to download a sequence alignment in fasta format, for the 19
amniotic vertebrates The Compara website suggests that the term "amniotes"
can be used for the species_set_name in this case, but I  get the following
error message:

amniotes is not a valid species name for this instance

-------------------- EXCEPTION --------------------
MSG: Registry configuration file has no data for connecting to <amniotes&
STACK toplevel PracticeCompara3.pl:98
Ensembl API version = 64
---------------------------------------------------

Below is the script I'm using:

use Bio::Perl;
use DBD::mysql;
use lib '/net/filestore/home/liberty/ensembl/modules';

use lib '/net/filestore/home/liberty/ensembl-compara/modules';

use lib '/net/filestore/home/liberty/ensembl-functgenomics/modules';

use lib '/net/filestore/home/liberty/ensembl-variation/modules';
use strict;
use Bio::EnsEMBL::Registry;
#use Bio::EnsEMBL::Compara;
use Bio::EnsEMBL::Utils::Exception qw(throw);
use Bio::SimpleAlign;
use Bio::AlignIO;
use Bio::LocatableSeq;
use Getopt::Long;

my $usage = qq{
perl DumpMultiAlign.pl
  Getting help:
    [--help]

  For the query slice:
    [--species species]
        Query species. Default is "human"
    [--coord_system coordinates_name]
        Query coordinate system. Default is "chromosome"
    --seq_region region_name
        Query region name, i.e. the chromosome name
    --seq_region_start start
    --seq_region_end end

  For the alignments:
    [--alignment_type method_link_name]
        The type of alignment. Default is "BLASTZ_NET"
    [--set_of_species species1:species2:species3:...]
        The list of species used to get those alignments. Default is
        "human:mouse". The names should correspond to the name of the
        core database in the registry_configuration_file or any of its
        aliases

  Ouput:
    [--output_format clustalw|fasta|...]
        The type of output you want. "clustalw" is the default.
    [--output_file filename]
        The name of the output file. By default the output is the
        standard output
};

my $species = "human";
my $coord_system = "chromosome";
my $seq_region = "6";
my $seq_region_start = 50145643;
my $seq_region_end   = 51259674;
my $method_link_type = "PECAN";
my $species_set_name = "amniotes";
my $output_file = "lama2.fasta";
my $output_format = "fasta";
my $help;

GetOptions(
    "help" => \$help,
    "species=s" => \$species,
    "coord_system=s" => \$coord_system,
    "seq_region=s" => \$seq_region,
    "seq_region_start=i" => \$seq_region_start,
    "seq_region_end=i" => \$seq_region_end,
    "method_link_type=s" => \$method_link_type,
    "species_set_name=s" => \$species_set_name,
    "output_format=s" => \$output_format,
    "output_file=s" => \$output_file);

# Print Help and exit
if ($help) {
    print $usage;
    exit(0);
}

if ($output_file) {
    open(STDOUT, ">$output_file") or die("Cannot open $output_file");
}

Bio::EnsEMBL::Registry->load_registry_from_db(
    -host => 'ensembldb.ensembl.org', -user => 'anonymous');

# Getting all the Bio::EnsEMBL::Compara::GenomeDB objects
my $genome_dbs;
my $genome_db_adaptor = Bio::EnsEMBL::Registry->get_adaptor(
    'Multi', 'compara', 'GenomeDB');

throw("Cannot connect to Compara") if (!$genome_db_adaptor);

foreach my $this_species (split(":", $species_set_name)) {
    my $this_meta_container_adaptor = Bio::EnsEMBL::Registry->get_adaptor(
        $this_species, 'core', 'MetaContainer');

    throw("Registry configuration file has no data for connecting to
<$this_species&")
        if (!$this_meta_container_adaptor);

    my $this_production_name =
$this_meta_container_adaptor->get_production_name;

    # Fetch Bio::EnsEMBL::Compara::GenomeDB object
    my $genome_db =
$genome_db_adaptor->fetch_by_name_assembly($this_production_name);

    # Add Bio::EnsEMBL::Compara::GenomeDB object to the list
    push(@$genome_dbs, $genome_db);
}

# Getting Bio::EnsEMBL::Compara::MethodLinkSpeciesSet object
my $method_link_species_set_adaptor = Bio::EnsEMBL::Registry->get_adaptor(
    'Multi', 'compara', 'MethodLinkSpeciesSet');

my $method_link_species_set =
    $method_link_species_set_adaptor->fetch_by_method_link_type_GenomeDBs(
      $method_link_type,
      $genome_dbs);

throw("The database do not contain any $method_link_type data for
$species_set_name!")
    if (!$method_link_species_set);

# Fetching the query Slice:
my $slice_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species, 'core',
'Slice');

throw("Registry configuration file has no data for connecting to
<$species>")
    if (!$slice_adaptor);

my $query_slice = $slice_adaptor->fetch_by_region('toplevel', $seq_region,
$seq_region_start, $seq_region_end);

throw("No Slice can be created with coordinates
$seq_region:$seq_region_start-".
    "$seq_region_end") if (!$query_slice);

# Fetching all the GenomicAlignBlock corresponding to this Slice:
my $genomic_align_block_adaptor = Bio::EnsEMBL::Registry->get_adaptor(
    'Multi', 'compara', 'GenomicAlignBlock');

my $genomic_align_blocks =
    $genomic_align_block_adaptor->fetch_all_by_MethodLinkSpeciesSet_Slice(
      $method_link_species_set,
      $query_slice);

my $all_aligns;

# Get a Bio::SimpleAlign object from every GenomicAlignBlock
foreach my $this_genomic_align_block (@$genomic_align_blocks) {
    my $simple_align = $this_genomic_align_block->get_SimpleAlign;
    push(@$all_aligns, $simple_align);
}

# print all the genomic alignments using a Bio::AlignIO object
my $alignIO = Bio::AlignIO->newFh(
    -interleaved => 0,
    -fh => \*STDOUT,
    -format => $output_format,
    -idlength => 10
);

foreach my $this_align (@$all_aligns) {
    print $alignIO $this_align;
}

exit;

my $outfile = "lama2.fasta";

open (OUTFILE, $outfile) or die "can't read file : $!";
my $firstline=<OUTFILE>;
print $firstline;

close (IN);



More information about the Bioperl-l mailing list