[BioRuby] [PATCH] GO annotations fixes and improvements
Ralf Stephan
ralf at ark.in-berlin.de
Tue Aug 3 06:58:16 UTC 2010
Hello,
seeing that the file bio/db/go.rb is seven years old,
I have fixed and improved the GO annotation
parsing (now GAF1, GAF2, Phenote) and output
(GAF1, GAF2) for inclusion in the next BioRuby version.
0001-Fix-parsing-of-GAF-1.0-files-preliminary-adaptation.patch
0002-Add-parsing-and-output-of-GAF-2.0-files.patch
0003-Add-documentation-copyright.patch
0004-Add-Phenote-GOA-file-format-parsing-GAF1-output.patch
I hope you will accept the patch set. Enjoy,
ralf
From 05b435e0e3f791d0fae38a5d76cbc522835bf085 Mon Sep 17 00:00:00 2001
From: R. Stephan <ralf at ark.in-berlin.de>
Date: Mon, 2 Aug 2010 19:43:58 +0200
Subject: [PATCH] Fix parsing of GAF 1.0 files, preliminary adaptations
---
lib/bio/db/go.rb | 42 ++++++++++++++++++++++++++++--------------
1 files changed, 28 insertions(+), 14 deletions(-)
diff --git a/lib/bio/db/go.rb b/lib/bio/db/go.rb
index 6b5d539..a8d3f47 100644
--- a/lib/bio/db/go.rb
+++ b/lib/bio/db/go.rb
@@ -186,6 +186,18 @@ class GO
# p [entry.entry_id, entry.evidence, entry.goid]
# end
#
+ class ArrayOrString
+ def initialize(arg)
+ @var = arg
+ end
+ def join(char)
+ if @var.instance_of? String
+ then return @var
+ else return @var.join(char)
+ end
+ end
+ end
+
class GeneAssociation # < Bio::DB
# Delimiter
@@ -253,30 +265,34 @@ class GO
#
attr_reader :assigned_by
-
+
alias entry_id db_object_id
- # Parsing an entry (in a line) in the gene_association flatfile.
- def initialize(entry)
- tmp = entry.chomp.split(/\t/)
+ # Assign fields of an entry (in a line).
+ def assign(tmp)
@db = tmp[0]
@db_object_id = tmp[1]
@db_object_symbol = tmp[2]
@qualifier = tmp[3] #
@goid = tmp[4]
- @db_reference = tmp[5].split(/\|/) #
+ @db_reference = ArrayOrString.new(tmp[5].split(/\|/)) #
@evidence = tmp[6]
- @with = tmp[7].split(/\|/) #
+ @with = ArrayOrString.new(tmp[7].split(/\|/)) #
@aspect = tmp[8]
@db_object_name = tmp[9] #
- @db_object_synonym = tmp[10].split(/\|/) #
+ @db_object_synonym = ArrayOrString.new(tmp[10].split(/\|/)) #
@db_object_type = tmp[11]
@taxon = tmp[12] # taxon:4932
@date = tmp[13] # 20010118
@assigned_by = tmp[14]
end
+ # Parsing an entry (in a line) in the gene_association flatfile.
+ def initialize(entry)
+ tmp = entry.chomp.split(/\t/)
+ self.assign(tmp)
+ end
# Returns GO_ID in /\d{7}/ format. Giving not nil arg, returns
# /GO:\d{7}/ style.
@@ -293,17 +309,15 @@ class GO
# Bio::GO::GeneAssociation#to_str -> a line of gene_association file.
def to_str
- return [@db, @db_object_id, @db_object_symbol, @quialifier, @goid,
- @qualifier.join("|"), @evidence, @with.join("|"), @aspect,
+ return [@db, @db_object_id, @db_object_symbol, @qualifier, @goid,
+ @db_reference.join("|"), @evidence, @with.join("|"), @aspect,
@db_object_name, @db_object_synonym.join("|"), @db_object_type,
@taxon, @date, @assigned_by].join("\t")
end
end # class GeneAssociation
-
-
- # = Container class for files in geneontology.org/go/external2go/*2go.
+# = Container class for files in geneontology.org/go/external2go/*2go.
#
# The line syntax is:
#
@@ -402,8 +416,8 @@ class GO
end
end # class External2go
-
-end # class GO
+
+end
end # module Bio
--
1.5.5
From 1dbca2952239c4028a89a507d1badd5935c9e477 Mon Sep 17 00:00:00 2001
From: R. Stephan <ralf at ark.in-berlin.de>
Date: Mon, 2 Aug 2010 20:12:36 +0200
Subject: [PATCH] Add parsing and output of GAF 2.0 files
---
lib/bio/db/go.rb | 32 ++++++++++++++++++++++++++++++++
1 files changed, 32 insertions(+), 0 deletions(-)
diff --git a/lib/bio/db/go.rb b/lib/bio/db/go.rb
index a8d3f47..affbe66 100644
--- a/lib/bio/db/go.rb
+++ b/lib/bio/db/go.rb
@@ -266,6 +266,11 @@ class GO
#
attr_reader :assigned_by
+ attr_reader :annotation_extension
+
+ attr_reader :gene_product_form_id
+
+
alias entry_id db_object_id
@@ -286,6 +291,8 @@ class GO
@taxon = tmp[12] # taxon:4932
@date = tmp[13] # 20010118
@assigned_by = tmp[14]
+ @annotation_extension = tmp[15]
+ @gene_product_form_id = tmp[16]
end
# Parsing an entry (in a line) in the gene_association flatfile.
@@ -317,6 +324,31 @@ class GO
end # class GeneAssociation
+ class GeneAssociation2 < GeneAssociation
+
+ # Iterator through all entries
+ def self.parser(str)
+ if block_given?
+ str.each_line(DELIMITER) {|line|
+ next if /^!/ =~ line
+ yield GeneAssociation2.new(line)
+ }
+ else
+ galist = []
+ str.each_line(DELIMITER) {|line|
+ next if /^!/ =~ line
+ galist << GeneAssociation2.new(line)
+ }
+ return galist
+ end
+ end
+
+ # Bio::GO::GeneAssociation#to_str -> a line of gene_association file.
+ def to_str
+ return [super.to_str, @annotation_extension, @gene_product_form_id].join("\t")
+ end
+ end
+
# = Container class for files in geneontology.org/go/external2go/*2go.
#
# The line syntax is:
--
1.5.5
From 4370b2bf3dc53f49334f9fb3948dc2fb584b75e5 Mon Sep 17 00:00:00 2001
From: R. Stephan <ralf at ark.in-berlin.de>
Date: Mon, 2 Aug 2010 20:28:45 +0200
Subject: [PATCH] Add documentation, copyright
---
bin/bioruby | 47 ------
bin/br_biofetch.rb | 47 ------
bin/br_bioflat.rb | 293 -----------------------------------
bin/br_biogetseq.rb | 45 ------
bin/br_pmfetch.rb | 422 ---------------------------------------------------
lib/bio/db/go.rb | 21 +++-
6 files changed, 18 insertions(+), 857 deletions(-)
delete mode 100755 bin/bioruby
delete mode 100755 bin/br_biofetch.rb
delete mode 100755 bin/br_bioflat.rb
delete mode 100755 bin/br_biogetseq.rb
delete mode 100755 bin/br_pmfetch.rb
diff --git a/bin/bioruby b/bin/bioruby
deleted file mode 100755
index 9980af8..0000000
--- a/bin/bioruby
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/usr/bin/env ruby
-#
-# = BioRuby shell - command line interface for the BioRuby library
-#
-# Copyright:: Copyright (C) 2005, 2006, 2007
-# Toshiaki Katayama <k at bioruby.org>
-# License:: The Ruby License
-#
-# $Id:$
-#
-
-begin
- require 'rubygems'
- gem 'bio', '>= 1.1.0'
-rescue LoadError
- require 'bio'
-end
-require 'bio/shell'
-
-# required to run commands (getseq, ls etc.)
-include Bio::Shell
-
-# setup command line options, working directory, and irb configurations
-Bio::Shell::Setup.new
-
-# loading workspace and command history
-Bio::Shell.load_session
-
-# sets default email address for Entrez eUtils.
-Bio::NCBI.default_email ||= 'staff at bioruby.org'
-
-# main loop
-if Bio::Shell.cache[:rails]
- Bio::Shell.cache[:rails].join
-else
- Signal.trap("SIGINT") do
- Bio::Shell.cache[:irb].signal_handle
- end
-
- catch(:IRB_EXIT) do
- Bio::Shell.cache[:irb].eval_input
- end
-end
-
-# saving workspace, command history and configuration before exit
-Bio::Shell.save_session
-
diff --git a/bin/br_biofetch.rb b/bin/br_biofetch.rb
deleted file mode 100755
index 40319cf..0000000
--- a/bin/br_biofetch.rb
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/usr/bin/env ruby
-#
-# = biofetch - BioFetch client
-#
-# Copyright:: Copyright (C) 2002
-# Toshiaki Katayama <k at bioruby.org>
-# License:: The Ruby License
-#
-# $Id: br_biofetch.rb,v 1.4 2007/04/05 23:35:39 trevor Exp $
-#
-
-require 'bio/io/fetch'
-
-def usage
- default_url = 'http://bioruby.org/cgi-bin/biofetch.rb'
- another_url = 'http://www.ebi.ac.uk/cgi-bin/dbfetch'
- puts "#{$0} [-s[erver] #{another_url}] db id [style] [format]"
- puts " server : URL of the BioFetch CGI (default is #{default_url})"
- puts " db : database name (embl, genbank, etc.)"
- puts " id : entry id"
- puts " style : 'raw' or 'html' (default is 'raw')"
- puts " format : change the output format ('default', 'fasta', etc.)"
-end
-
-if ARGV.empty? or ARGV[0] =~ /^--?h/
- usage
- exit 1
-end
-
-case ARGV[0]
-when /^--?s/ # User specified server
- ARGV.shift
- serv = Bio::Fetch.new(ARGV.shift)
- puts serv.fetch(*ARGV)
-when /^--?e/ # EBI server
- ARGV.shift
- serv = Bio::Fetch.new('http://www.ebi.ac.uk/cgi-bin/dbfetch')
- puts serv.fetch(*ARGV)
-when /^--?r/ # BioRuby server
- ARGV.shift
- serv = Bio::Fetch.new('http://bioruby.org/cgi-bin/biofetch.rb')
- puts serv.fetch(*ARGV)
-else # Default server
- puts Bio::Fetch.query(*ARGV)
-end
-
-
diff --git a/bin/br_bioflat.rb b/bin/br_bioflat.rb
deleted file mode 100755
index 279da9b..0000000
--- a/bin/br_bioflat.rb
+++ /dev/null
@@ -1,293 +0,0 @@
-#!/usr/bin/env ruby
-#
-# = bioflat - OBDA flat file indexer (executable)
-#
-# Copyright:: Copyright (C) 2002
-# Naohisa Goto <ng at bioruby.org>
-# License:: The Ruby License
-#
-# $Id: br_bioflat.rb,v 1.17 2007/04/05 23:35:39 trevor Exp $
-#
-
-require 'bio'
-
-def usage
- print <<EOM
-Search:
- #{$0} [--search] [options...] [DIR/]DBNAME KEYWORDS
-or
- #{$0} [--search] --location DIR --dbname DBNAME [options...] KEYWORDS
-
-Search options:
- --namespace NAME set serch namespace to NAME
- (or --name NAME) You can set this option many times to specify
- more than one namespace.
-
-Create index:
- #{$0} --create --location DIR --dbname DBNAME [--format <genbank|embl|fasta>] [options...] [--files] FILES
-Update index:
- #{$0} --update --location DIR --dbname DBNAME [options...] [--files] FILES
-
-Create index options:
- --primary=UNIQUE set primary namespece to UNIQUE
- Default primary/secondary namespaces depend on
- each format of flatfiles.
- --secondary=KEY set secondary namespaces.
- You may use this option many times to specify
- more than one namespace.
- --add-secondary=KEY add secondary namespaces to default specification.
- You can use this option many times.
-
-Options only valid for --create (or --update) --type flat:
- --sort=/path/to/sort use external sort program (e.g. /usr/bin/sort)
- --sort=BUILTIN use builtin sort routine
- (default: /usr/bin/sort or BUILTIN)
- --env=/path/to/env use env program to run sort (default: /usr/bin/env)
- --env-arg=XXXXXX argument given to the env program (default: LC_ALL=C)
- (multiple --env-arg=XXXXXX can be specified)
-
-Options only valid for --update:
- --renew re-read all flatfiles and update whole index
-
-Backward compatibility:
- --makeindex DIR/DBNAME
- same as --create --type flat --location DIR --dbname DBNAME
- --makeindexBDB DIR/DBNAME
- same as --create --type bdb --location DIR --dbname DBNAME
- --format=CLASS
- instead of genbank|embl|fasta, specifing a class name is allowed
-
-Show namespaces:
- #{$0} --show-namespaces [--location DIR --dbname DBNAME] [DIR/DBNAME]
-or
- #{$0} --show-namespaces [--format=CLASS]
-or
- #{$0} --show-namespaces --files file
-
-EOM
-
-end
-
-
-def do_index(mode = :create)
- case ARGV[0]
- when /^\-\-?make/
- dbpath = ARGV[1]
- args = ARGV[2..-1]
- is_bdb = nil
- when /^\-\-?make.*bdb/i
- dbname = ARGV[1]
- args = ARGV[2..-1]
- is_bdb = Bio::FlatFileIndex::MAGIC_BDB
- when /^\-\-create/, /^\-\-update/
- args = ARGV[1..-1]
- else
- usage
- end
-
- options = {}
-
- while args.first =~ /^\-/
- case x = args.shift
-
- # OBDA stuff
-
- when /^\-\-?format$/
- args.shift
- format = nil # throw this f*ckin' mess for auto detect :)
- when /^\-\-?location/
- location = args.shift.chomp('/')
- when /^\-\-?dbname/
- dbname = args.shift
- when /^\-\-?(index)?type/
- indextype = args.shift
- case indextype
- when /bdb/
- is_bdb = Bio::FlatFileIndex::MAGIC_BDB
- when /flat/
- is_bdb = nil
- else
- usage
- end
-
- # BioRuby extension
-
- when /^\-\-?files/i
- break
-
- when /^\-\-?format\=(.*)/i
- format = $1
-
- when /^\-\-?sort\=(.*)/i
- options['sort_program'] = $1
- options['onmemory'] = nil
- when /^\-\-?no\-?te?mp/i
- options['onmemory'] = true
-
- when /^\-\-?env\=(.*)/i
- options['env_program'] = $1
-
- when /^\-\-?env-arg(?:ument)?\=(.*)/i
- options['env_program_arguments'] ||= []
- options['env_program_arguments'].push $1
-
- when /^\-\-?primary.*\=(.*)/i
- options['primary_namespace'] = $1
-
- when /^\-\-?add-secondary.*\=(.*)/i
- unless options['additional_secondary_namespaces'] then
- options['additional_secondary_namespaces'] = []
- end
- options['additional_secondary_namespaces'] << $1 if $1.length > 0
-
- when /^\-\-?secondary.*\=(.*)/i
- unless options['secondary_namespaces'] then
- options['secondary_namespaces'] = []
- end
- options['secondary_namespaces'] << $1 if $1.length > 0
-
- when /^\-\-?renew/
- options['renew'] = true
-
- else
- $stderr.print "Warning: ignoring invalid option #{x.inspect}\n"
- end
- end
-
- dbpath = File.join(location, dbname) unless dbpath
- if mode == :update then
- Bio::FlatFileIndex::update_index(dbpath, format, options, *args)
- else
- Bio::FlatFileIndex::makeindex(is_bdb, dbpath, format, options, *args)
- end
-end
-
-
-def do_search
- dbname = nil
- location = nil
- names = []
- while x = ARGV.shift
- case x
- when /\A\-\-?search/i
- #do nothing
- when /\A\-\-?location/i
- location = ARGV.shift.to_s.chomp('/')
- when /\A\-\-?dbname/i
- dbname = ARGV.shift
- when /\A\-\-?name(?:space)?(?:\=(.+))?/i
- if $1 then
- names << $1
- elsif x = ARGV.shift
- names << x
- end
- else
- ARGV.unshift x
- break
- end
- end
- dbname = ARGV.shift unless dbname
- dbname = File.join(location, dbname) unless location.to_s.empty?
- db = Bio::FlatFileIndex.open(dbname)
- ARGV.each do |key|
- $stderr.print "Searching for \'#{key}\'...\n"
- #r = db.search(key)
- #$stderr.print "OK, #{r.size} entry found\n"
- #if r.size > 0 then
- # print r
- #end
- begin
- if names.empty? then
- r = db.include?(key)
- else
- r = db.include_in_namespaces?(key, *names)
- end
- rescue RuntimeError
- $stderr.print "ERROR: #{$!}\n"
- next
- end
- r = [] unless r
- $stderr.print "OK, #{r.size} entry found\n"
- r.each do |i|
- print db.search_primary(i)
- end
- end
- db.close
-end
-
-
-def do_show_namespaces
- dbname = nil
- location = nil
- files = nil
- format = nil
- names = []
- while x = ARGV.shift
- case x
- when /\A\-\-?(show\-)?name(space)?s/i
- #do nothing
- when /\A\-\-?location/i
- location = ARGV.shift.to_s.chomp('/')
- when /\A\-\-?dbname/i
- dbname = ARGV.shift
- when /\A\-\-?format(?:\=(.+))?/i
- if $1 then
- format = $1
- elsif x = ARGV.shift
- format = x
- end
- when /\A\-\-?files/i
- files = ARGV
- break
- else
- ARGV.unshift x
- break
- end
- end
- if files then
- k = nil
- files.each do |x|
- k = Bio::FlatFile.autodetect_file(x)
- break if k
- end
- if k then
- $stderr.print "Format: #{k.to_s}\n"
- format = k
- else
- $stderr.print "ERROR: couldn't determine file format\n"
- return
- end
- end
- $stderr.print "Namespaces: (first line: primary namespace)\n"
- if format then
- parser = Bio::FlatFileIndex::Indexer::Parser.new(format)
- print parser.primary.name, "\n"
- puts parser.secondary.keys
- else
- dbname = ARGV.shift unless dbname
- dbname = File.join(location, dbname) unless location.to_s.empty?
- db = Bio::FlatFileIndex.open(dbname)
- puts db.namespaces
- db.close
- end
-end
-
-if ARGV.size > 1
- case ARGV[0]
- when /--make/, /--create/
- Bio::FlatFileIndex::DEBUG.out = true
- do_index
- when /--update/
- Bio::FlatFileIndex::DEBUG.out = true
- do_index(:update)
- when /\A\-\-?(show\-)?name(space)?s/i
- do_show_namespaces
- when /--search/
- do_search
- else #default is search
- do_search
- end
-else
- usage
-end
-
diff --git a/bin/br_biogetseq.rb b/bin/br_biogetseq.rb
deleted file mode 100755
index 76c94de..0000000
--- a/bin/br_biogetseq.rb
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env ruby
-#
-# = biogetseq - OBDA sequence data retrieval (executable)
-#
-# Copyright:: Copyright (C) 2003
-# Toshiaki Katayama <k at bioruby.org>
-# License:: The Ruby License
-#
-# $Id: br_biogetseq.rb,v 1.4 2007/04/05 23:35:39 trevor Exp $
-#
-
-require 'bio'
-
-def usage
- print <<END
- #{$0} --dbname <dbname> [--namespace <namespace>] entry_id [entry_id]
-END
- exit 1
-end
-
-if ARGV.size < 3
- usage
-end
-
-while ARGV.first =~ /^-/
- case ARGV.shift
- when /^\-\-format/
- ARGV.shift
- raise NotImplementedError
- when /^\-\-dbname/
- dbname = ARGV.shift
- when /^\-\-namespace/
- namespace = ARGV.shift
- end
-end
-
-reg = Bio::Registry.new
-db = reg.get_database(dbname)
-if namespace
- db['namespace'] = namespace
-end
-ARGV.each do |entry|
- puts db.get_by_id(entry)
-end
-
diff --git a/bin/br_pmfetch.rb b/bin/br_pmfetch.rb
deleted file mode 100755
index eb0f4ed..0000000
--- a/bin/br_pmfetch.rb
+++ /dev/null
@@ -1,422 +0,0 @@
-#!/usr/bin/env ruby
-#
-# = pmfetch - PubMed client
-#
-# Copyright:: Copyright (C) 2004, 2005
-# Toshiaki Katayama <k at bioruby.org>
-# License:: The Ruby License
-#
-# $Id:$
-#
-
-require 'bio'
-
-PROG_VER = "Powered by BioRuby #{Bio::BIORUBY_VERSION_ID}"
-PROG_NAME = File.basename($0)
-
-
-require 'getoptlong'
-
-
-### formatting
-
-class String
- def fill(fill_column = 80, prefix = '', separater = ' ')
- prefix = ' ' * prefix if prefix.is_a?(Integer)
- maxlen = fill_column - prefix.length
- raise "prefix is longer than fill_column" if maxlen <= 0
-
- cursor = pos = 0
- lines = []
- while cursor < self.length
- line = self[cursor, maxlen]
- pos = line.rindex(separater)
- pos = nil if line.length < maxlen
- if pos
- len = pos + separater.length
- lines << self[cursor, len]
- cursor += len
- else
- lines << self[cursor, maxlen]
- cursor += maxlen
- end
- end
- return lines.join("\n#{prefix}")
- end
-end
-
-
-module Bio
- class Reference
- def report
- if (num = @authors.size) > 10
- authors = "#{@authors[0]} et al. (#{num} authors)"
- elsif num > 4
- sep = ',' * (num - 1)
- authors = "#{@authors[0]}#{sep} #{@authors[-1]}"
- else
- authors = authors_join(' & ')
- end
- journal = "#{@journal} #{@year} #{@volume}(#{@issue}):#{@pages}"
-
- indent = 8
- prefix = ' ' * indent
- [
- "#{@pages[/\d+/]}".ljust(indent) + "#{@title}".fill(78, indent),
- authors,
- "#{journal} [PMID:#{@pubmed}]",
- ].join("\n#{prefix}")
- end
- end
-end
-
-
-class PMFetch
-
- class Examples < StandardError; end
- class Version < StandardError; end
- class Usage < StandardError; end
-
- ### default options
-
- def initialize
- @format = 'rd'
- @search_opts = {
- 'retmax' => 20,
- }
- @query = nil
- @query_opts = []
- @pmid_list_only = false
-
- pmfetch
- end
-
-
- ### main
-
- def pmfetch
- begin
- set_options
- parse_options
- check_query
- rescue PMFetch::Examples
- puts examples
- exit
- rescue PMFetch::Version
- puts version
- exit
- rescue PMFetch::Usage
- puts usage
- exit
- rescue GetoptLong::MissingArgument, GetoptLong::InvalidOption
- puts usage
- exit
- end
-
- list = pm_esearch
-
- if list.empty?
- ;
- elsif @pmid_list_only
- puts list
- else
- pm_efetch(list)
- end
- end
-
-
- ### help
-
- def usage
-%Q[
-Usage: #{PROG_NAME} [options...] "query string"
- or #{PROG_NAME} --query "query string" [other options...]
-
-Options:
- -q --query "genome AND virus" Query string for PubMed search
- -t --title "mobile elements" Title of the article to search
- -j --journal "genome res" Journal title to search
- -v --volume # Journal volume to search
- -i --issue # Journal issue to search
- -p --page # First page number of the article to search
- -a --author "Altschul SF" Author name to search
- -m --mesh "SARS virus" MeSH term to search
- -f --format bibtex Summary output format
- --pmidlist Output only a list of PubMed IDs
- -n --retmax # Number of articles to retrieve at the maximum
- -N --retstart # Starting number of the articles to retrieve
- -s --sort pub+date Sort method for the summary output
- --reldate # Search articles published within recent # days
- --mindate YYYY/MM/DD Search articles published after the date
- --maxdate YYYY/MM/DD Search articles published before the date
- --help Output this help, then exit
- --examples Output examples, then exit
- --version Output version number, then exit
-
-Formats:
- endnote, medline, bibitem, bibtex, report, rd,
- nature, science, genome_res, genome_biol, nar, current, trends, cell
-
-Sort:
- author, journal, pub+date, page
-
-See the following pages for the PubMed search options:
- http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html
- http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html
-
-#{version}
-
-]
- end
-
- def version
- PROG_VER
- end
-
- def examples
- DATA.read.gsub('PMFetch', PROG_NAME)
- end
-
-
- private
-
-
- ### options
-
- def set_options
- @parser = GetoptLong.new
-
- @parser.set_options(
- [ '--query', '-q', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--title', '-t', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--journal', '-j', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--volume', '-v', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--issue', '-i', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--page', '-p', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--author', '-a', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--mesh', '-m', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--format', '-f', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--pmidlist', GetoptLong::NO_ARGUMENT ],
- [ '--retmax', '-n', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--retstart', '-N', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--sort', '-s', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--reldate', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--mindate', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--maxdate', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--examples', GetoptLong::NO_ARGUMENT ],
- [ '--help', GetoptLong::NO_ARGUMENT ],
- [ '--version', GetoptLong::NO_ARGUMENT ]
- )
- end
-
- def parse_options
- @parser.each_option do |optname, optarg|
- case optname
- when /--query/
- @query = optarg
- when /--title/
- @query_opts << "#{optarg}[ti]"
- when /--journal/
- @query_opts << "#{optarg}[ta]"
- when /--volume/
- @query_opts << "#{optarg}[vi]"
- when /--issue/
- @query_opts << "#{optarg}[ip]"
- when /--page/
- @query_opts << "#{optarg}[pg]"
- when /--author/
- @query_opts << "#{optarg}[au]"
- when /--mesh/
- @query_opts << "#{optarg}[mh]"
- when /--format/
- @format = optarg
- when /--pmidlist/
- @pmid_list_only = true
- when /--examples/
- raise PMFetch::Examples
- when /--help/
- raise PMFetch::Usage
- when /--version/
- raise PMFetch::Version
- when /--sort/
- @sort = optarg
- @search_opts["sort"] = @sort unless @sort == "page"
- else
- optname.delete!('-')
- @search_opts[optname] = optarg
- end
- end
- end
-
-
- ### check query
-
- def check_query
- p @query if $DEBUG
- @query ||= ARGV.join(" ") unless ARGV.empty?
-
- p @query if $DEBUG
- @query_str = [ @query, @query_opts ].flatten.compact.join(" AND ")
-
- p @query_str if $DEBUG
- if @query_str.empty?
- raise PMFetch::Usage
- end
- end
-
-
- ### search
-
- def pm_esearch
- return Bio::PubMed.esearch(@query_str, @search_opts)
- end
-
- def pm_efetch(list)
- entries = Bio::PubMed.efetch(list)
-
- if @format == 'medline'
- medline_format(entries)
- else
- entries = parse_entries(entries)
- if @sort == 'page'
- entries = sort_entries(entries)
- end
- if @format == 'report'
- report_format(entries)
- else
- other_format(entries)
- end
- end
- end
-
-
- ### output
-
- def medline_format(entries)
- entries.each do |entry|
- puts entry
- puts '//'
- end
- end
-
- def parse_entries(entries)
- entries.map { |entry| Bio::MEDLINE.new(entry) }
- end
-
- def sort_entries(entries)
- if RUBY_VERSION > "1.8.0"
- entries.sort_by { |x|
- [ x.journal, x.volume.to_i, x.issue.to_i, x.pages.to_i ]
- }
- else
- entries.map { |x|
- [ x.journal, x.volume.to_i, x.issue.to_i, x.pages.to_i, x ]
- }.sort { |a, b|
- a[0..3] <=> b[0..3]
- }.map { |y|
- y.pop
- }
- end
- end
-
- def report_format(entries)
- entries.each do |entry|
- puts entry.reference.report
- puts
- end
- end
-
- def other_format(entries)
- entries.each do |entry|
- puts entry.reference.format(@format)
- puts
- end
- end
-
-end
-
-
-PMFetch.new
-
-
-__END__
-
-= Examples : PubMed search
-
-These four lines will do the same job.
-
- % PMFetch transcription factor
- % PMFetch "transcription factor"
- % PMFetch --query "transcription factor"
- % PMFetch -q "transcription factor"
-
-
-Retrieve max 100 artiecles (20 is a NCBI's default) at a time, use --retmax as
-
- % PMFetch -q "transcription factor" --retmax 100
-
-and, to retrieve next 100 articles, use --retstart as
-
- % PMFetch -q "transcription factor" --retmax 100 --retstart 100
-
-
-You can narrow the search target for an issue of the journal.
-
- % PMFetch --journal development --volume 131 --issue 3 transcription factor
-
-
-Short options are also available.
-
- % PMFetch -j development -v 131 -i 3 transcription factor
-
-
-Search articles indexed in PubMed within these 90 days.
-
- % PMFetch -q "transcription factor" --reldate 90
-
-
-Search articles indexed in PubMed during the period of 2001/04/01 to 2001/08/31
-
- % PMFetch -q "transcription factor" --mindate 2001/04/01 --maxdate 2001/08/31
-
-
-Output format can be changed by --format option.
-
- % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f report
- % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f rd
- % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f endnote
- % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f medline
- % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f bibitem
- % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f bibtex
- % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f nature
- % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f science
-
-
-Generate title listings for the journal report meeting (don't forget
-to inclease the number of --retmax for fetching all titles).
-
- % PMFetch -f report -j development -v 131 -i 3 -n 100
-
-
-Search by author name.
-
- % PMFetch -a "Karlin S"
- % PMFetch -a "Koonin EV"
-
-
-Search by MeSH term.
-
- % PMFetch -m "computational biology"
- % PMFetch -m "SARS virus"
-
-
-Search by PubMed ID (PMID).
-
- % PMFetch 12345
-
-
-Output PMID only.
-
- % PMFetch --pmidlist tardigrada
-
-
diff --git a/lib/bio/db/go.rb b/lib/bio/db/go.rb
index affbe66..62f78ba 100644
--- a/lib/bio/db/go.rb
+++ b/lib/bio/db/go.rb
@@ -1,8 +1,9 @@
#
# = bio/db/go.rb - Classes for Gene Ontology
#
-# Copyright:: Copyright (C) 2003
+# Copyright:: Copyright (C) 2003, 2010
# Mitsuteru C. Nakao <n at bioruby.org>
+# R. Stephan <ralf at ark.in-berlin.de>
# License:: The Ruby License
#
# $Id:$
@@ -174,8 +175,8 @@ class GO
# = Bio::GO::GeneAssociation
# $CVSROOT/go/gene-associations/gene_association.*
#
- # Data parser for the gene_association go annotation.
- # See also the file format http://www.geneontology.org/doc/GO.annotation.html#file
+ # Data parser for the gene_association go annotation 1.0.
+ # See also the file format http://www.geneontology.org/GO.format.gaf-1_0.shtml
#
# == Example
#
@@ -324,6 +325,20 @@ class GO
end # class GeneAssociation
+ # = Bio::GO::GeneAssociation2
+ #
+ # Data parser for the gene_association go annotation 2.0.
+ # See also the file format http://www.geneontology.org/GO.format.gaf-2_0.shtml
+ #
+ # == Example
+ #
+ # mgi_data = File.open('gene_association.mgi').read
+ # mgi = Bio::GO::GeneAssociation2.parser(mgi_data)
+ #
+ # Bio::GO::GeneAssociation2.parser(mgi_data) do |entry|
+ # p [entry.entry_id, entry.evidence, entry.goid]
+ # end
+ #
class GeneAssociation2 < GeneAssociation
# Iterator through all entries
--
1.5.5
From c6729520a9faf985975fb7f5b93128cdbe31b0e8 Mon Sep 17 00:00:00 2001
From: R. Stephan <ralf at ark.in-berlin.de>
Date: Tue, 3 Aug 2010 08:47:31 +0200
Subject: [PATCH] Add Phenote GOA file format parsing, GAF1 output
---
lib/bio/db/go.rb | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 60 insertions(+), 1 deletions(-)
diff --git a/lib/bio/db/go.rb b/lib/bio/db/go.rb
index 62f78ba..b265c7e 100644
--- a/lib/bio/db/go.rb
+++ b/lib/bio/db/go.rb
@@ -358,12 +358,71 @@ class GO
end
end
- # Bio::GO::GeneAssociation#to_str -> a line of gene_association file.
+ # Bio::GO::GeneAssociation2#to_str -> a line of gene_association file.
def to_str
return [super.to_str, @annotation_extension, @gene_product_form_id].join("\t")
end
end
+ # = Bio::GO::Phenote_GOA
+ #
+ # Data parser for the Phenote file format which is similar to GAF1.
+ # We serialize to GAF1 format (to_str).
+ # See http://www.phenote.org
+ # See also the file format http://www.geneontology.org/GO.format.gaf-1_0.shtml
+ #
+ # == Example
+ #
+ # mgi_data = File.open('gene_association.mgi').read
+ # mgi = Bio::GO::Phenote_GOA.parser(mgi_data)
+ #
+ # Bio::GO::Phenote_GOA.parser(mgi_data) do |entry|
+ # p entry.to_str
+ # end
+
+ class Phenote_GOA < GeneAssociation
+
+ # Returns an Array of entries parsed from a Phenote file.
+ # Block is acceptable.
+ def self.parser(str)
+ if block_given?
+ str.each_line(DELIMITER) {|line|
+ next if /^DB\t/ =~ line
+ yield Phenote_GOA.new(line)
+ }
+ else
+ galist = []
+ str.each_line(DELIMITER) {|line|
+ next if /^DB\t/ =~ line
+ galist << Phenote_GOA.new(line)
+ }
+ return galist
+ end
+ end
+
+ # Assign fields of an entry (in a line) in Phenote format.
+ def assign(tmp)
+ @db = tmp[0]
+ @db_object_id = tmp[1]
+ @db_object_symbol = tmp[2]
+ @qualifier = tmp[3] #
+ @goid = tmp[4]
+ # We ignore Phenote's tmp[5]
+ @db_reference = ArrayOrString.new(tmp[6].split(/\|/)) #
+ @evidence = tmp[7]
+ @with = ArrayOrString.new(tmp[8].split(/\|/)) #
+ @aspect = tmp[9]
+ @db_object_name = tmp[10] #
+ @db_object_synonym = ArrayOrString.new(tmp[11].split(/\|/)) #
+ @db_object_type = tmp[12]
+ @taxon = tmp[13] # taxon:4932
+ @date = tmp[14] # 20010118
+ @assigned_by = tmp[15]
+ # We ignore Phenote's tmp[16-18]
+ end
+ end
+
+ #
# = Container class for files in geneontology.org/go/external2go/*2go.
#
# The line syntax is:
--
1.5.5
Ralf Stephan
http://www.ark.in-berlin.de
pub 1024D/C5114CB2 2009-06-07 [expires: 2011-06-06]
Key fingerprint = 76AE 0D21 C06C CBF9 24F8 7835 1809 DE97 C511 4CB2
More information about the BioRuby
mailing list