[Bioperl-l] bioperl and kegg(out of memory problem )

Chris Fields cjfields at uiuc.edu
Mon Apr 2 15:32:59 UTC 2007


Ambrose,

Data is persisting in your hashes (in particular DBLink objects),  
which is eating away at your memory.  If I take a sample KEGG gene  
file and simply parse it:

while (my $seq = $io->next_seq) {
     print $seq->accession,"\n";
}

there are no memory issues, but if I store the data in hashes  
declared outside the loop:

my(%dblink_KO,%dblink_Pfam,%dblink_PROSITE,%dblink_NCBIGI,% 
dblink_NCBIGENEID,%dblink_UniProt);
my(%pathway_name,%pathway_id,%ecnumbers,%crc64,%ntseq,%aaseq);

while (my $seq = $io->next_seq) {
     # store Bio::Seq data in hashes
}

I see problems with only one genome file with KEGG records.  You'll  
definitely run into memory issues if you are parsing many genome  
files, which you appear to be:

my(%dblink_KO,%dblink_Pfam,%dblink_PROSITE,%dblink_NCBIGI,% 
dblink_NCBIGENEID,%dblink_UniProt);
my(%pathway_name,%pathway_id,%ecnumbers,%crc64,%ntseq,%aaseq);

for my $genomefile (@genomelist) {
     while (my $seq = $io->next_seq) {
         # store Bio::Seq data in hashes
     }
}

Localizing the hashes to the genome or sequence loops should prevent  
the memory problem.

Note that the DBLink Annotation objects are overloaded so they act  
like a string ($ele =~ /^KO:/) but are actually  
Bio::Annotation::DBLink objects, something we will likely get rid of  
in the near future.

chris

On Apr 2, 2007, at 8:56 AM, Ambrose wrote:

>
>
> Hello ALL,
>
> I have the code below,which parses my kegg files.A host of the  
> files are parsed and the information is inserted into my databases  
> but unfortunate after the program runs for some hours it stops  
> showing the message out of memory.I assume that this happens  
> because the bioperl object is too big.Please just check the code below
>
> best regards Ambrose
>
>
> #!/usr/local/ActivePerl/bin/perl
> #
> #
>
> use strict;
> use Bio::SeqIO;
> use Bio::FASTASequence;
> use DBI;
> use Benchmark  qw(:all) ;
>
> my($ko,$prosite,$ncbigi,$ncbigeneid,$pfam,$uniprot,$ecn1, 
> $pathway_id1,$pathway_name1,$ec_num);
> my(%dblink_KO,%dblink_Pfam,%dblink_PROSITE,%dblink_NCBIGI,% 
> dblink_NCBIGENEID,%dblink_UniProt);
> my(%pathway_name,%pathway_id,%ecnumbers,%crc64,%ntseq,%aaseq);
> my( @kg_id);
> my $db="gbdb";
> my $host="localhost";
> my $userid="root";
> my $passwd="ubuntu";
> my $connectionInfo="dbi:mysql:$db;"."mysql_socket=/var/run/mysqld/ 
> mysqld.sock";
> my ($t1,$t2);
> my $dbh = DBI->connect($connectionInfo,$userid,$passwd);
> my $time_used;
>
>
>
>  eval { $dbh->do("DROP TABLE kegginfo") };
>  print "Dropping kegginfo failed: $@\n" if $@;
>  $dbh->do("CREATE TABLE kegginfo (kg_id BIGINT NOT NULL  
> AUTO_INCREMENT,
>                                    up_id INT UNSIGNED REFERENCES  
> uniprotinfo(up_id),
>                                                                    
> filename VARCHAR(50),
>                                                     kegg_id VARCHAR 
> (50),
>                                    keggaccn VARCHAR(50),
>                                                                    
> description VARCHAR(250),
>                                    ec_numbers VARCHAR(250),
>                                               pathway_id VARCHAR(250),
>                                               pathway_name VARCHAR 
> (250),
>                                               crc64 VARCHAR(50),
>                                    ko_id VARCHAR(50),
>                                    pfam_id VARCHAR(50),
>                                    ncbigi_id VARCHAR(50),
>                                    ncbigeneid_id VARCHAR(50),
>                                    uniprot_id VARCHAR(50),
>                                    prosite_id VARCHAR(50),
>                                    PRIMARY KEY (kg_id)
>                                  )");
>
>
> eval { $dbh->do("DROP TABLE keggntsequence") };
> print "Dropping keggntsequence failed: $@\n" if $@;
> $dbh->do("CREATE TABLE keggntsequence (kg_id BIGINT(15) UNSIGNED  
> REFERENCES uniprotinfo(kg_id),
>                                                     keggaccn VARCHAR 
> (50),
>                                   nucleotidesequence text
>                                  )");
>
> eval { $dbh->do("DROP TABLE keggaasequence") };
> print "Dropping keggaasequence failed: $@\n" if $@;
> $dbh->do("CREATE TABLE keggaasequence (kg_id BIGINT(15) UNSIGNED  
> REFERENCES uniprotinfo(kg_id),
>                                                     keggaccn VARCHAR 
> (50),
>                                                     crc64 VARCHAR(50),
>                                   aminoacidsequence text
>                                  )");
> eval { $dbh->do("DROP TABLE timestable") };
> print "Dropping timestable failed: $@\n" if $@;
> $dbh->do("CREATE TABLE timestable (aut_id BIGINT(15) UNSIGNED NOT  
> NULL AUTO_INCREMENT,
>                                    genome VARCHAR(100),
>                                     totaltime_seconds int(100),
>                                                                    
> PRIMARY KEY(aut_id))");
>
>
>
> open (LIST, "genomes.list") || die "Cannot open input kegg genomes  
> file genomes.list\n $! \n";
> $t1=new Benchmark;
> my @genomelist = ();
> while (my $line=<LIST>) {
>     #ignore comment lines
>     if ($line !~ /^#/) {
>         chomp $line;
>
>         push (@genomelist, $line); #store the filename
>     }
> }
>
> close LIST;
> my $count=0;
> foreach my $genomefile (@genomelist) {
>
>     #in case the user fails to remove some strange files from
>     #the genomes.list file.. check for the KEGG format
>     my $check=checkKeggFormat($genomefile);
>     if ($check==0) {
>         #if file is not kegg, start with the next one...
>         print "ERROR: $genomefile doesn't look like a KEGG file to  
> me! \n";
>         #<stdin>;
>         next;
>     }
> #print $genomefile,"\n";
>     my $stream = Bio::SeqIO->new(-file => $genomefile, -format =>  
> 'KEGG');
>
>     while ( my $seq = $stream->next_seq() ) {
>
>         my $primary_id = $seq->primary_id;
>         my $display_id = $seq->display_id; #name
>         my $keggaccn   = $seq->accession; #accn
>         my @description = $seq->annotation->get_Annotations 
> ('description');
>
>         my @dblinks     = $seq->annotation->get_Annotations('dblink');
>         my @orthologs   = $seq->annotation->get_Annotations 
> ('ortholog');
>         my @orthologs   = grep {$_->database eq 'KO'} $seq- 
> >annotation->get_Annotations('dblink');
>         my @class       = $seq->annotation->get_Annotations 
> ('pathway');
>          $ntseq{$keggaccn} = $seq->seq;
>          $aaseq{$keggaccn} = $seq->translate->seq;
>          $aaseq{$keggaccn} =~s /\*$//;
>                  my $fasta = ">".$count."\n".$aaseq{$keggaccn};
>          my $newseq = Bio::FASTASequence->new($fasta);
>          $crc64{$keggaccn}=$newseq->getCrc64();
> #print $keggaccn,"crc64:$crc64{$keggaccn}\n";
>
>         $count++;
>         if ($keggaccn eq "") { print "PRIMARY KEY NOT FOUND no  
> keggaccn\n";
>         next;}
>
>         if(@dblinks)
>         {
>                 my @dblink_KO=();
>                 my @dblink_Pfam=();
>                 my @dblink_PROSITE=();
>                 my @dblink_NCBIGI=();
>                 my @dblink_NCBIGENEID=();
>                 my @dblink_UniProt=();
>
>                 foreach my $ele (@dblinks) {
>                     if ($ele =~ /^KO:/){
>                         $ele=~s/KO://;
>                         push (@dblink_KO,$ele);
>                         $dblink_KO{$keggaccn}=$ele;
>                         next;
>                     }
>                         #parse Pfam: dblink
>                     if ($ele =~ /^Pfam:/){
>                         $ele=~s/Pfam://;
>                         push (@dblink_Pfam,$ele);
>                         $dblink_Pfam{$keggaccn}=$ele;
>                         next;
>                     }
>                         #parse PROSITE: dblink
>                     if ($ele =~ /^PROSITE:/){
>                         $ele=~s/PROSITE://;
>                         push (@dblink_PROSITE,$ele);
>                         $dblink_PROSITE{$keggaccn}=$ele;
>                         next;
>                     }
>                         #parse NCBI-GI: dblink
>                     if ($ele =~ /^NCBI-GI:/){
>                         $ele=~s/NCBI-GI://;
>                         push (@dblink_NCBIGI,$ele);
>                         $dblink_NCBIGI{$keggaccn}=$ele;
>                         next;
>                     }
>                         #parse NCBI-GeneID: dblink
>                     if ($ele =~ /^NCBI-GeneID:/){
>                         $ele=~s/NCBI-GeneID://;
>                         push (@dblink_NCBIGENEID,$ele);
>                         $dblink_NCBIGENEID{$keggaccn}=$ele;
>                         next;
>                         }
>                         #parse UniProt: dblink
>                     if ($ele =~ /^UniProt:/){
>                         $ele=~s/UniProt://;
>                         push (@dblink_UniProt,$ele);
>                         $dblink_UniProt{$keggaccn}=$ele;
>                         next;
>                     }
>
>                 }#end foreach     #finished parsing all dblinks
>         }#end if @dblinks
>         if(@class)
>         {
>             foreach my $pathway (@class) {
>
>                 $pathway=~s/^\s+|\s+$//;
>                 my @arr = split (/\s+/,$pathway);
>                 my $pathway_id = $arr[0];
>                 shift @arr;
>                 my $pathway_name = join(" ", at arr);
>                 $pathway_name{$keggaccn}=$pathway_name;
>                 $pathway_id{$keggaccn}=$pathway_id;
>                 #print $pathway_id{$keggaccn},"\t",$pathway_name 
> {$keggaccn},"\n";
>
>             }
>
>         }
>
>         my @ecnumbers=();
>         @ecnumbers = extractECnumbers(@description);
>         if(@ecnumbers)
>         {
>                 if (@ecnumbers!=0)
>                 {
>                     foreach my $ecn (@ecnumbers)
>                     {
>                        $ecnumbers{$keggaccn}=$ecn;
>                     }#end foreach
>                 }
>                 else {
>                     #print "ECnumbers:\n";
>                      }
>         }
>
>
> #         print $keggaccn,"\t",$dblink_UniProt{$keggaccn},"\t", 
> $dblink_NCBIGENEID{$keggaccn},
> #                 "\t",$dblink_NCBIGI{$keggaccn},"\t","ec:$ecnumbers 
> {$keggaccn}","\t",
> #                  "p1:$pathway_id{$keggaccn}","\t","p2: 
> $pathway_name{$keggaccn}","\n";
> #
>                 $dbh->do("INSERT INTO kegginfo VALUES  
> (?,?, ?, ?, ?, ?,?,?,?,?,?,?,?,?,?,?)",
>          undef,"NULL","NULL",$genomefile,$display_id, 
> $keggaccn, at description,$ecnumbers{$keggaccn},
>                   $pathway_id{$keggaccn},$pathway_name{$keggaccn}, 
> $crc64{$keggaccn},$dblink_KO{$keggaccn},
>                  $dblink_Pfam{$keggaccn},$dblink_NCBIGI{$keggaccn}, 
> $dblink_NCBIGENEID{$keggaccn},
>                  $dblink_UniProt{$keggaccn},$dblink_PROSITE 
> {$keggaccn});
>
>
>         $dbh->do("INSERT INTO keggaasequence VALUES (?,?,?,?)",
>             undef,"",$keggaccn,$crc64{$keggaccn},$aaseq{$keggaccn});
>
>
>         $dbh->do("INSERT INTO keggntsequence VALUES (?,?,?)",
>             undef,"",$keggaccn,$ntseq{$keggaccn});
>
>
>     }
>      $t2=new Benchmark;
>     $time_used=timeThis($t1,$t2,"Finished parsing file $genomefile");
>     $dbh->do("INSERT INTO timestable VALUES (?,?,?)",
>     undef,"NULL",$genomefile,$time_used);
>
> }
>
>
> $dbh->do("CREATE INDEX keggIindex ON kegginfo (kg_id,keggaccn)");
> print "Index created on kegginfo\n";
>
> $dbh->do("CREATE INDEX keggaasequence1 ON keggaasequence  
> (kg_id,keggaccn)");
> print "Index created on keggaasequence\n";
>
> $dbh->do("CREATE INDEX keggntsequence1 ON keggntsequence  
> (kg_id,keggaccn)");
> print "Index created on keggntsequence\n";
>
>
> print"Updating the tables................\n";
>
>
> $dbh->do("update kegginfo,keggaasequence set  
> keggaasequence.kg_id=kegginfo.kg_id
>          where
>                 kegginfo.keggaccn=keggaasequence.keggaccn");
>         print " keggaasequence kg_id\n";
>
> $dbh->do("update kegginfo,keggntsequence set  
> keggntsequence.kg_id=kegginfo.kg_id
>          where
>                 kegginfo.keggaccn=keggntsequence.keggaccn");
>         print " keggaasequence kg_id\n";
>
>
>
> sub extractECnumbers ($) {
>     #sample description lines
>      #riboflavin kinase / FMN adenylyltransferase [EC:2.7.1.26  
> 2.7.7.2]
>     #ATP synthase F0 subunit c [EC:3.6.3.14]
>
>     my @desc=shift;
>     my $description = join ("", at desc);
>     my @ecnumbers=();
>     #print "parsing ec for $description..\n";
>     #check if EC number exists
>     if ($description=~/\[EC:/) {
>
>         my @array = split (/\[EC:/,$description);
>         $array[1]=~s/]//g;
>         shift @array; #remove the annotation , only EC numbers remain
>         foreach my $ele (@array) {
>             $ele=~s/^\s+|\s+$//g;
>             $ele= "EC:".$ele;
>             push (@ecnumbers,$ele);
>         }
>         return @ecnumbers;
>     }
>     else {
>         #return an empty value
>         return ;
>
>     }
>
> }
>
>
>
>
>
>
>
> sub checkKeggFormat ($) {
> =head2
>
> checkKeggFormat
>
> make sure that the file is a valid KEGG file
> function checks the first two lines,
> 1st must start with ENTRY
> 2nd must start with DEFINITION
>
> returns 0 or 1
>
> =cut
>     my $genomefile=shift;
>
>     open (TEST,$genomefile) || die "Cannot open file $genomefile  
> for reading \n";
>     my $testline=<TEST>;
> #print "$testline\n";
>     if ($testline=~/^ENTRY/) {
>         #continue
>         #$testline=<TEST>;#double check
>         #if ($testline=~/^NAME/) {
>             #this looks like a valid kegg file
>             return 1;
>         #}
>         #else {
>         #    close TEST;
>         #    return 0;
>         #}
>     }
>     else {
>         close TEST;
>         return 0;
>     }
>
> }
>
> sub timeThis ($$$)
> {
>     my ($start,$end,$message) = @_;
>     my $td = timediff($end, $start);
>     my $t = timestr($td);
>         print "$message : ",$t,"\n";
>         my @array = split (/\s+/,$t);
> #20 wallclock secs (14.23 usr +  0.84 sys = 15.07 CPU)
>         return $array[0]; #return the no. of seconds.
> }
>
>
>
>
> ---------------------------------
> Looking for earth-friendly autos?
>  Browse Top Cars by "Green Rating" at Yahoo! Autos' Green Center.
> _______________________________________________
> Bioperl-l mailing list
> Bioperl-l at lists.open-bio.org
> http://lists.open-bio.org/mailman/listinfo/bioperl-l

Christopher Fields
Postdoctoral Researcher
Lab of Dr. Robert Switzer
Dept of Biochemistry
University of Illinois Urbana-Champaign






More information about the Bioperl-l mailing list