[BioRuby-cvs] bioruby/lib/bio/appl/blast rpsblast.rb,NONE,1.1

Naohisa Goto ngoto at dev.open-bio.org
Tue Apr 15 13:54:41 UTC 2008


Update of /home/repository/bioruby/bioruby/lib/bio/appl/blast
In directory dev.open-bio.org:/tmp/cvs-serv32038/lib/bio/appl/blast

Added Files:
	rpsblast.rb 
Log Message:
Newly added RPS-Blast default (-m 0) output parser.


--- NEW FILE: rpsblast.rb ---
#
# = bio/appl/blast/rpsblast.rb - NCBI RPS Blast default output parser
# 
# Copyright::  Copyright (C) 2008 Naohisa Goto <ng at bioruby.org>
# License::    The Ruby License
#
# $Id: rpsblast.rb,v 1.1 2008/04/15 13:54:39 ngoto Exp $
#
# == Description
#
# NCBI RPS Blast (Reversed Position Specific Blast) default
# (-m 0 option) output parser class, Bio::Blast::RPSBlast::Report
# and related classes/modules.
#
# == References
#
# * Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaffer,
#   Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997),
#   "Gapped BLAST and PSI-BLAST: a new generation of protein database search
#   programs", Nucleic Acids Res. 25:3389-3402.
# * ftp://ftp.ncbi.nih.gov/blast/documents/rpsblast.html
# * http://www.ncbi.nlm.nih.gov/Structure/cdd/cdd_help.shtml
#

require 'bio/appl/blast/format0'

module Bio
class Blast

  # NCBI RPS Blast (Reversed Position Specific Blast) namespace.
  # Currently, this module exists only to provide a separate namespace.
  # To parse RPSBlast results, see Bio::Blast::RPSBlast::Report documents.
  module RPSBlast

    # NCBI RPS Blast (Reversed Position Specific Blast)
    # default output parser.
    #
    # It supports default (-m 0 option) output of the "rpsblast" command.
    #
    # Because this class inherits Bio::Blast::Default::Report,
    # almost all methods are equal to Bio::Blast::Default::Report.
    # Only DELIMITER (and RS) and few methods are different.
    #
    # Note for multi-fasta result: When parsing output of rpsblast command
    # with multi-fasta sequences as input data,
    # each query's result is stored as an "iteration" of PSI-Blast,
    # because rpsblast's output with multi-fasta input is hard to split
    # by query.
    # This behavior may be changed in the future.
    #
    # Note for nucleotide results: This class is not tested with
    # nucleotide query and/or nucleotide databases.
    #
    class Report < Bio::Blast::Default::Report
      # Delimiter of each entry for RPS-BLAST. Bio::FlatFile uses it.
      # RS is defined as an alias holding the same string.
      DELIMITER = RS = "\nRPS-BLAST"

      # (Integer) excess read size included in DELIMITER.
      # 9 is the length of the "RPS-BLAST" text that follows the newline
      # in DELIMITER.
      DELIMITER_OVERRUN = 9 # "RPS-BLAST"

      # Creates a new Report object from a string.
      #
      # Note for multi-fasta results: When parsing an output of rpsblast
      # command running with multi-fasta sequences,
      # each query's result is stored as an "iteration" of PSI-Blast,
      # because rpsblast's output with multi-fasta input is hard to split
      # by query.
      # This behavior may be changed in the future.
      #
      # Note for nucleotide results: This class is not tested with
      # nucleotide query and/or nucleotide databases.
      #
      def initialize(str)
        # Work on a copy with leading whitespace removed.
        text = str.sub(/\A\s+/, '')

        # Everything from the next line starting with "RPS-BLAST" onward
        # belongs to following entries: keep it as the overrun and leave
        # only the first entry (terminated by a newline) in @entry.
        trailing = /\n(RPS\-BLAST.*)/m.match(text)
        @entry_overrun = trailing ? trailing[1] : nil
        @entry = trailing ? (trailing.pre_match + "\n") : text

        # Runs of blank lines separate the chunks that the helper
        # methods below consume one after another from the array.
        chunks = @entry.split(/(?:^[ \t]*\n)+/)

        format0_split_headers(chunks)
        @iterations = format0_split_search(chunks)
        format0_split_stat_params(chunks)
      end

      # Returns definition of the query.
      # For a result of multi-fasta input, the first query's definition
      # is returned (The same as <tt>iterations.first.query_def</tt>).
      def query_def
        # Only the first query's result is consulted, even when the
        # report holds several queries (multi-fasta input).
        first_result = iterations.first
        first_result.query_def
      end

      # Returns length of the query.
      # For a result of multi-fasta input, the first query's length
      # is returned (The same as <tt>iterations.first.query_len</tt>).
      def query_len
        # Only the first query's result is consulted, even when the
        # report holds several queries (multi-fasta input).
        first_result = iterations.first
        first_result.query_len
      end

      private

      # Splits headers into the first line, reference, query line and
      # database line.
      def format0_split_headers(data)
        # The first chunk is the program/version header.
        @f0header = data.shift

        # Every chunk up to (not including) the "Database:" chunk is
        # collected as a reference.
        @f0references = []
        loop do
          head = data[0]
          break unless head
          break if /\ADatabase\:/ =~ head
          @f0references.push(data.shift)
        end

        @f0database = data.shift

        # In special case, a void line is inserted after database name,
        # so the "N sequences; M total letters" line arrives as its own
        # chunk; glue it back onto the database chunk.
        counts_line = /\A +[\d\,]+ +sequences\; +[\d\,]+ total +letters\s*\z/
        if counts_line =~ data[0]
          @f0database.concat("\n")
          @f0database.concat(data.shift)
        end
      end

      # Splits the search results.
      def format0_split_search(data)
        # Placeholder "Searching...done" line handed to every Iteration;
        # it is replaced by the real line when the output contains one.
        searching = 'Searching..................................................done'
        head = data[0]
        searching = data.shift if head && /^Searching/ =~ head

        # One Iteration per "Query=" chunk. Iteration.new is expected to
        # consume its own chunks from data, which is what advances the loop.
        results = []
        loop do
          chunk = data[0]
          break unless chunk && /^Query\=/ =~ chunk
          results << Iteration.new(data, searching)
        end
        results
      end

      # Iteration class for RPS-Blast.
      # Though RPS-Blast does not iterate like PSI-BLAST, 
      # it aims to store a result of single query sequence.
      #
      # Normally, the instance of the class is generated
      # by Bio::Blast::RPSBlast::Report object.
      # 
      class Iteration < Bio::Blast::Default::Report::Iteration
        # Creates a new Iteration object.
        # It is designed to be called only internally from
        # the Bio::Blast::RPSBlast::Report class.
        # Users shall not use the method directly.
        def initialize(data, dummystr)
          # data::     Array of text chunks; consumed (mutated) from the front.
          # dummystr:: "Searching...done" line put back in front of data
          #            before the parent class parser is called.
          #
          # When the first chunk starts with "Query=", it is removed from
          # data and parsed into @query_def (definition lines joined by a
          # single space) and @query_len (from the "(N letters)" line).
          if /\AQuery\=/ =~ data[0] then
            sc = StringScanner.new(data.shift)
            sc.skip(/\s*/)
            if sc.skip_until(/Query\= */) then
              q = []
              r = nil
              loop do
                # take the rest of the current line, then move to the
                # beginning of the next line (skipping one leading space)
                q << sc.scan(/.*/)
                sc.skip(/\s*^ ?/)
                # Nothing but whitespace left: there is no "(N letters)"
                # line in this chunk. StringScanner#rest always returns a
                # String, so testing its truthiness would never stop the
                # loop; check the remaining text explicitly instead.
                break if sc.check(/\s*\z/)
                r = sc.skip(/ *\( *([\,\d]+) *letters *\)\s*\z/)
                break if r
              end
              # sc[1] is the digits (with commas) captured by the
              # "(N letters)" pattern above
              @query_len = sc[1].delete(',').to_i if r
              @query_def = q.join(' ')
            end
          end
          # the parent class expects the chunk list to begin with the
          # "Searching..." line
          data.unshift(dummystr)

          super(data)
        end

        # definition of the query
        attr_reader :query_def

        # length of the query sequence
        attr_reader :query_len
        
      end #class Iteration
      
    end #class Report

  end #module RPSBlast

end #module Blast
end #module Bio





More information about the bioruby-cvs mailing list