From ralf at ark.in-berlin.de Tue Aug 3 02:58:16 2010 From: ralf at ark.in-berlin.de (Ralf Stephan) Date: Tue, 3 Aug 2010 08:58:16 +0200 Subject: [BioRuby] [PATCH] GO annotations fixes and improvements Message-ID: <40095810-63F1-4919-8946-7CA3802B67B1@ark.in-berlin.de> Hello, seeing the file bio/db/go.rb is seven years old, I have fixed and improved the GO annotations parsing (now GAF1, GAF2, Phenote) and output (GAF1, GAF2) for inclusion in next bioruby version. 0001-Fix-parsing-of-GAF-1.0-files-preliminary-adaptation.patch 0002-Add-parsing-and-output-of-GAF-2.0-files.patch 0003-Add-documentation-copyright.patch 0004-Add-Phenote-GOA-file-format-parsing-GAF1-output.patch I hope you will accept the patch set. Enjoy, ralf >From 05b435e0e3f791d0fae38a5d76cbc522835bf085 Mon Sep 17 00:00:00 2001 From: R. Stephan Date: Mon, 2 Aug 2010 19:43:58 +0200 Subject: [PATCH] Fix parsing of GAF 1.0 files, preliminary adaptations --- lib/bio/db/go.rb | 42 ++++++++++++++++++++++++++++-------------- 1 files changed, 28 insertions(+), 14 deletions(-) diff --git a/lib/bio/db/go.rb b/lib/bio/db/go.rb index 6b5d539..a8d3f47 100644 --- a/lib/bio/db/go.rb +++ b/lib/bio/db/go.rb @@ -186,6 +186,18 @@ class GO # p [entry.entry_id, entry.evidence, entry.goid] # end # + class ArrayOrString + def initialize(arg) + @var = arg + end + def join(char) + if @var.instance_of? String + then return @var + else return @var.join(char) + end + end + end + class GeneAssociation # < Bio::DB # Delimiter @@ -253,30 +265,34 @@ class GO # attr_reader :assigned_by - + alias entry_id db_object_id - # Parsing an entry (in a line) in the gene_association flatfile. - def initialize(entry) - tmp = entry.chomp.split(/\t/) + # Assign fields of an entry (in a line). 
+ def assign(tmp) @db = tmp[0] @db_object_id = tmp[1] @db_object_symbol = tmp[2] @qualifier = tmp[3] # @goid = tmp[4] - @db_reference = tmp[5].split(/\|/) # + @db_reference = ArrayOrString.new(tmp[5].split(/\|/)) # @evidence = tmp[6] - @with = tmp[7].split(/\|/) # + @with = ArrayOrString.new(tmp[7].split(/\|/)) # @aspect = tmp[8] @db_object_name = tmp[9] # - @db_object_synonym = tmp[10].split(/\|/) # + @db_object_synonym = ArrayOrString.new(tmp[10].split(/\|/)) # @db_object_type = tmp[11] @taxon = tmp[12] # taxon:4932 @date = tmp[13] # 20010118 @assigned_by = tmp[14] end + # Parsing an entry (in a line) in the gene_association flatfile. + def initialize(entry) + tmp = entry.chomp.split(/\t/) + self.assign(tmp) + end # Returns GO_ID in /\d{7}/ format. Giving not nil arg, returns # /GO:\d{7}/ style. @@ -293,17 +309,15 @@ class GO # Bio::GO::GeneAssociation#to_str -> a line of gene_association file. def to_str - return [@db, @db_object_id, @db_object_symbol, @quialifier, @goid, - @qualifier.join("|"), @evidence, @with.join("|"), @aspect, + return [@db, @db_object_id, @db_object_symbol, @qualifier, @goid, + @db_reference.join("|"), @evidence, @with.join("|"), @aspect, @db_object_name, @db_object_synonym.join("|"), @db_object_type, @taxon, @date, @assigned_by].join("\t") end end # class GeneAssociation - - - # = Container class for files in geneontology.org/go/external2go/*2go. +# = Container class for files in geneontology.org/go/external2go/*2go. # # The line syntax is: # @@ -402,8 +416,8 @@ class GO end end # class External2go - -end # class GO + +end end # module Bio -- 1.5.5 >From 1dbca2952239c4028a89a507d1badd5935c9e477 Mon Sep 17 00:00:00 2001 From: R. 
Stephan Date: Mon, 2 Aug 2010 20:12:36 +0200 Subject: [PATCH] Add parsing and output of GAF 2.0 files --- lib/bio/db/go.rb | 32 ++++++++++++++++++++++++++++++++ 1 files changed, 32 insertions(+), 0 deletions(-) diff --git a/lib/bio/db/go.rb b/lib/bio/db/go.rb index a8d3f47..affbe66 100644 --- a/lib/bio/db/go.rb +++ b/lib/bio/db/go.rb @@ -266,6 +266,11 @@ class GO # attr_reader :assigned_by + attr_reader :annotation_extension + + attr_reader :gene_product_form_id + + alias entry_id db_object_id @@ -286,6 +291,8 @@ class GO @taxon = tmp[12] # taxon:4932 @date = tmp[13] # 20010118 @assigned_by = tmp[14] + @annotation_extension = tmp[15] + @gene_product_form_id = tmp[16] end # Parsing an entry (in a line) in the gene_association flatfile. @@ -317,6 +324,31 @@ class GO end # class GeneAssociation + class GeneAssociation2 < GeneAssociation + + # Iterator through all entries + def self.parser(str) + if block_given? + str.each_line(DELIMITER) {|line| + next if /^!/ =~ line + yield GeneAssociation2.new(line) + } + else + galist = [] + str.each_line(DELIMITER) {|line| + next if /^!/ =~ line + galist << GeneAssociation2.new(line) + } + return galist + end + end + + # Bio::GO::GeneAssociation#to_str -> a line of gene_association file. + def to_str + return [super.to_str, @annotation_extension, @gene_product_form_id].join("\t") + end + end + # = Container class for files in geneontology.org/go/external2go/*2go. # # The line syntax is: -- 1.5.5 >From 4370b2bf3dc53f49334f9fb3948dc2fb584b75e5 Mon Sep 17 00:00:00 2001 From: R. 
Stephan Date: Mon, 2 Aug 2010 20:28:45 +0200 Subject: [PATCH] Add documentation, copyright --- bin/bioruby | 47 ------ bin/br_biofetch.rb | 47 ------ bin/br_bioflat.rb | 293 ----------------------------------- bin/br_biogetseq.rb | 45 ------ bin/br_pmfetch.rb | 422 --------------------------------------------------- lib/bio/db/go.rb | 21 +++- 6 files changed, 18 insertions(+), 857 deletions(-) delete mode 100755 bin/bioruby delete mode 100755 bin/br_biofetch.rb delete mode 100755 bin/br_bioflat.rb delete mode 100755 bin/br_biogetseq.rb delete mode 100755 bin/br_pmfetch.rb diff --git a/bin/bioruby b/bin/bioruby deleted file mode 100755 index 9980af8..0000000 --- a/bin/bioruby +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env ruby -# -# = BioRuby shell - command line interface for the BioRuby library -# -# Copyright:: Copyright (C) 2005, 2006, 2007 -# Toshiaki Katayama -# License:: The Ruby License -# -# $Id:$ -# - -begin - require 'rubygems' - gem 'bio', '>= 1.1.0' -rescue LoadError - require 'bio' -end -require 'bio/shell' - -# required to run commands (getseq, ls etc.) -include Bio::Shell - -# setup command line options, working directory, and irb configurations -Bio::Shell::Setup.new - -# loading workspace and command history -Bio::Shell.load_session - -# sets default email address for Entrez eUtils. 
-Bio::NCBI.default_email ||= 'staff at bioruby.org' - -# main loop -if Bio::Shell.cache[:rails] - Bio::Shell.cache[:rails].join -else - Signal.trap("SIGINT") do - Bio::Shell.cache[:irb].signal_handle - end - - catch(:IRB_EXIT) do - Bio::Shell.cache[:irb].eval_input - end -end - -# saving workspace, command history and configuration before exit -Bio::Shell.save_session - diff --git a/bin/br_biofetch.rb b/bin/br_biofetch.rb deleted file mode 100755 index 40319cf..0000000 --- a/bin/br_biofetch.rb +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env ruby -# -# = biofetch - BioFetch client -# -# Copyright:: Copyright (C) 2002 -# Toshiaki Katayama -# License:: The Ruby License -# -# $Id: br_biofetch.rb,v 1.4 2007/04/05 23:35:39 trevor Exp $ -# - -require 'bio/io/fetch' - -def usage - default_url = 'http://bioruby.org/cgi-bin/biofetch.rb' - another_url = 'http://www.ebi.ac.uk/cgi-bin/dbfetch' - puts "#{$0} [-s[erver] #{another_url}] db id [style] [format]" - puts " server : URL of the BioFetch CGI (default is #{default_url})" - puts " db : database name (embl, genbank, etc.)" - puts " id : entry id" - puts " style : 'raw' or 'html' (default is 'raw')" - puts " format : change the output format ('default', 'fasta', etc.)" -end - -if ARGV.empty? 
or ARGV[0] =~ /^--?h/ - usage - exit 1 -end - -case ARGV[0] -when /^--?s/ # User specified server - ARGV.shift - serv = Bio::Fetch.new(ARGV.shift) - puts serv.fetch(*ARGV) -when /^--?e/ # EBI server - ARGV.shift - serv = Bio::Fetch.new('http://www.ebi.ac.uk/cgi-bin/dbfetch') - puts serv.fetch(*ARGV) -when /^--?r/ # BioRuby server - ARGV.shift - serv = Bio::Fetch.new('http://bioruby.org/cgi-bin/biofetch.rb') - puts serv.fetch(*ARGV) -else # Default server - puts Bio::Fetch.query(*ARGV) -end - - diff --git a/bin/br_bioflat.rb b/bin/br_bioflat.rb deleted file mode 100755 index 279da9b..0000000 --- a/bin/br_bioflat.rb +++ /dev/null @@ -1,293 +0,0 @@ -#!/usr/bin/env ruby -# -# = bioflat - OBDA flat file indexer (executable) -# -# Copyright:: Copyright (C) 2002 -# Naohisa Goto -# License:: The Ruby License -# -# $Id: br_bioflat.rb,v 1.17 2007/04/05 23:35:39 trevor Exp $ -# - -require 'bio' - -def usage - print <] [options...] [--files] FILES -Update index: - #{$0} --update --location DIR --dbname DBNAME [options...] [--files] FILES - -Create index options: - --primary=UNIQUE set primary namespece to UNIQUE - Default primary/secondary namespaces depend on - each format of flatfiles. - --secondary=KEY set secondary namespaces. - You may use this option many times to specify - more than one namespace. - --add-secondary=KEY add secondary namespaces to default specification. - You can use this option many times. - -Options only valid for --create (or --update) --type flat: - --sort=/path/to/sort use external sort program (e.g. 
/usr/bin/sort) - --sort=BUILTIN use builtin sort routine - (default: /usr/bin/sort or BUILTIN) - --env=/path/to/env use env program to run sort (default: /usr/bin/env) - --env-arg=XXXXXX argument given to the env program (default: LC_ALL=C) - (multiple --env-arg=XXXXXX can be specified) - -Options only valid for --update: - --renew re-read all flatfiles and update whole index - -Backward compatibility: - --makeindex DIR/DBNAME - same as --create --type flat --location DIR --dbname DBNAME - --makeindexBDB DIR/DBNAME - same as --create --type bdb --location DIR --dbname DBNAME - --format=CLASS - instead of genbank|embl|fasta, specifing a class name is allowed - -Show namespaces: - #{$0} --show-namespaces [--location DIR --dbname DBNAME] [DIR/DBNAME] -or - #{$0} --show-namespaces [--format=CLASS] -or - #{$0} --show-namespaces --files file - -EOM - -end - - -def do_index(mode = :create) - case ARGV[0] - when /^\-\-?make/ - dbpath = ARGV[1] - args = ARGV[2..-1] - is_bdb = nil - when /^\-\-?make.*bdb/i - dbname = ARGV[1] - args = ARGV[2..-1] - is_bdb = Bio::FlatFileIndex::MAGIC_BDB - when /^\-\-create/, /^\-\-update/ - args = ARGV[1..-1] - else - usage - end - - options = {} - - while args.first =~ /^\-/ - case x = args.shift - - # OBDA stuff - - when /^\-\-?format$/ - args.shift - format = nil # throw this f*ckin' mess for auto detect :) - when /^\-\-?location/ - location = args.shift.chomp('/') - when /^\-\-?dbname/ - dbname = args.shift - when /^\-\-?(index)?type/ - indextype = args.shift - case indextype - when /bdb/ - is_bdb = Bio::FlatFileIndex::MAGIC_BDB - when /flat/ - is_bdb = nil - else - usage - end - - # BioRuby extension - - when /^\-\-?files/i - break - - when /^\-\-?format\=(.*)/i - format = $1 - - when /^\-\-?sort\=(.*)/i - options['sort_program'] = $1 - options['onmemory'] = nil - when /^\-\-?no\-?te?mp/i - options['onmemory'] = true - - when /^\-\-?env\=(.*)/i - options['env_program'] = $1 - - when /^\-\-?env-arg(?:ument)?\=(.*)/i - 
options['env_program_arguments'] ||= [] - options['env_program_arguments'].push $1 - - when /^\-\-?primary.*\=(.*)/i - options['primary_namespace'] = $1 - - when /^\-\-?add-secondary.*\=(.*)/i - unless options['additional_secondary_namespaces'] then - options['additional_secondary_namespaces'] = [] - end - options['additional_secondary_namespaces'] << $1 if $1.length > 0 - - when /^\-\-?secondary.*\=(.*)/i - unless options['secondary_namespaces'] then - options['secondary_namespaces'] = [] - end - options['secondary_namespaces'] << $1 if $1.length > 0 - - when /^\-\-?renew/ - options['renew'] = true - - else - $stderr.print "Warning: ignoring invalid option #{x.inspect}\n" - end - end - - dbpath = File.join(location, dbname) unless dbpath - if mode == :update then - Bio::FlatFileIndex::update_index(dbpath, format, options, *args) - else - Bio::FlatFileIndex::makeindex(is_bdb, dbpath, format, options, *args) - end -end - - -def do_search - dbname = nil - location = nil - names = [] - while x = ARGV.shift - case x - when /\A\-\-?search/i - #do nothing - when /\A\-\-?location/i - location = ARGV.shift.to_s.chomp('/') - when /\A\-\-?dbname/i - dbname = ARGV.shift - when /\A\-\-?name(?:space)?(?:\=(.+))?/i - if $1 then - names << $1 - elsif x = ARGV.shift - names << x - end - else - ARGV.unshift x - break - end - end - dbname = ARGV.shift unless dbname - dbname = File.join(location, dbname) unless location.to_s.empty? - db = Bio::FlatFileIndex.open(dbname) - ARGV.each do |key| - $stderr.print "Searching for \'#{key}\'...\n" - #r = db.search(key) - #$stderr.print "OK, #{r.size} entry found\n" - #if r.size > 0 then - # print r - #end - begin - if names.empty? 
then - r = db.include?(key) - else - r = db.include_in_namespaces?(key, *names) - end - rescue RuntimeError - $stderr.print "ERROR: #{$!}\n" - next - end - r = [] unless r - $stderr.print "OK, #{r.size} entry found\n" - r.each do |i| - print db.search_primary(i) - end - end - db.close -end - - -def do_show_namespaces - dbname = nil - location = nil - files = nil - format = nil - names = [] - while x = ARGV.shift - case x - when /\A\-\-?(show\-)?name(space)?s/i - #do nothing - when /\A\-\-?location/i - location = ARGV.shift.to_s.chomp('/') - when /\A\-\-?dbname/i - dbname = ARGV.shift - when /\A\-\-?format(?:\=(.+))?/i - if $1 then - format = $1 - elsif x = ARGV.shift - format = x - end - when /\A\-\-?files/i - files = ARGV - break - else - ARGV.unshift x - break - end - end - if files then - k = nil - files.each do |x| - k = Bio::FlatFile.autodetect_file(x) - break if k - end - if k then - $stderr.print "Format: #{k.to_s}\n" - format = k - else - $stderr.print "ERROR: couldn't determine file format\n" - return - end - end - $stderr.print "Namespaces: (first line: primary namespace)\n" - if format then - parser = Bio::FlatFileIndex::Indexer::Parser.new(format) - print parser.primary.name, "\n" - puts parser.secondary.keys - else - dbname = ARGV.shift unless dbname - dbname = File.join(location, dbname) unless location.to_s.empty? 
- db = Bio::FlatFileIndex.open(dbname) - puts db.namespaces - db.close - end -end - -if ARGV.size > 1 - case ARGV[0] - when /--make/, /--create/ - Bio::FlatFileIndex::DEBUG.out = true - do_index - when /--update/ - Bio::FlatFileIndex::DEBUG.out = true - do_index(:update) - when /\A\-\-?(show\-)?name(space)?s/i - do_show_namespaces - when /--search/ - do_search - else #default is search - do_search - end -else - usage -end - diff --git a/bin/br_biogetseq.rb b/bin/br_biogetseq.rb deleted file mode 100755 index 76c94de..0000000 --- a/bin/br_biogetseq.rb +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env ruby -# -# = biogetseq - OBDA sequence data retrieval (executable) -# -# Copyright:: Copyright (C) 2003 -# Toshiaki Katayama -# License:: The Ruby License -# -# $Id: br_biogetseq.rb,v 1.4 2007/04/05 23:35:39 trevor Exp $ -# - -require 'bio' - -def usage - print < [--namespace ] entry_id [entry_id] -END - exit 1 -end - -if ARGV.size < 3 - usage -end - -while ARGV.first =~ /^-/ - case ARGV.shift - when /^\-\-format/ - ARGV.shift - raise NotImplementedError - when /^\-\-dbname/ - dbname = ARGV.shift - when /^\-\-namespace/ - namespace = ARGV.shift - end -end - -reg = Bio::Registry.new -db = reg.get_database(dbname) -if namespace - db['namespace'] = namespace -end -ARGV.each do |entry| - puts db.get_by_id(entry) -end - diff --git a/bin/br_pmfetch.rb b/bin/br_pmfetch.rb deleted file mode 100755 index eb0f4ed..0000000 --- a/bin/br_pmfetch.rb +++ /dev/null @@ -1,422 +0,0 @@ -#!/usr/bin/env ruby -# -# = pmfetch - PubMed client -# -# Copyright:: Copyright (C) 2004, 2005 -# Toshiaki Katayama -# License:: The Ruby License -# -# $Id:$ -# - -require 'bio' - -PROG_VER = "Powered by BioRuby #{Bio::BIORUBY_VERSION_ID}" -PROG_NAME = File.basename($0) - - -require 'getoptlong' - - -### formatting - -class String - def fill(fill_column = 80, prefix = '', separater = ' ') - prefix = ' ' * prefix if prefix.is_a?(Integer) - maxlen = fill_column - prefix.length - raise "prefix is longer than 
fill_column" if maxlen <= 0 - - cursor = pos = 0 - lines = [] - while cursor < self.length - line = self[cursor, maxlen] - pos = line.rindex(separater) - pos = nil if line.length < maxlen - if pos - len = pos + separater.length - lines << self[cursor, len] - cursor += len - else - lines << self[cursor, maxlen] - cursor += maxlen - end - end - return lines.join("\n#{prefix}") - end -end - - -module Bio - class Reference - def report - if (num = @authors.size) > 10 - authors = "#{@authors[0]} et al. (#{num} authors)" - elsif num > 4 - sep = ',' * (num - 1) - authors = "#{@authors[0]}#{sep} #{@authors[-1]}" - else - authors = authors_join(' & ') - end - journal = "#{@journal} #{@year} #{@volume}(#{@issue}):#{@pages}" - - indent = 8 - prefix = ' ' * indent - [ - "#{@pages[/\d+/]}".ljust(indent) + "#{@title}".fill(78, indent), - authors, - "#{journal} [PMID:#{@pubmed}]", - ].join("\n#{prefix}") - end - end -end - - -class PMFetch - - class Examples < StandardError; end - class Version < StandardError; end - class Usage < StandardError; end - - ### default options - - def initialize - @format = 'rd' - @search_opts = { - 'retmax' => 20, - } - @query = nil - @query_opts = [] - @pmid_list_only = false - - pmfetch - end - - - ### main - - def pmfetch - begin - set_options - parse_options - check_query - rescue PMFetch::Examples - puts examples - exit - rescue PMFetch::Version - puts version - exit - rescue PMFetch::Usage - puts usage - exit - rescue GetoptLong::MissingArgument, GetoptLong::InvalidOption - puts usage - exit - end - - list = pm_esearch - - if list.empty? - ; - elsif @pmid_list_only - puts list - else - pm_efetch(list) - end - end - - - ### help - - def usage -%Q[ -Usage: #{PROG_NAME} [options...] "query string" - or #{PROG_NAME} --query "query string" [other options...] 
- -Options: - -q --query "genome AND virus" Query string for PubMed search - -t --title "mobile elements" Title of the article to search - -j --journal "genome res" Journal title to search - -v --volume # Journal volume to search - -i --issue # Journal issue to search - -p --page # First page number of the article to search - -a --author "Altschul SF" Author name to search - -m --mesh "SARS virus" MeSH term to search - -f --format bibtex Summary output format - --pmidlist Output only a list of PubMed IDs - -n --retmax # Number of articles to retrieve at the maximum - -N --retstart # Starting number of the articles to retrieve - -s --sort pub+date Sort method for the summary output - --reldate # Search articles published within recent # days - --mindate YYYY/MM/DD Search articles published after the date - --maxdate YYYY/MM/DD Search articles published before the date - --help Output this help, then exit - --examples Output examples, then exit - --version Output version number, then exit - -Formats: - endnote, medline, bibitem, bibtex, report, rd, - nature, science, genome_res, genome_biol, nar, current, trends, cell - -Sort: - author, journal, pub+date, page - -See the following pages for the PubMed search options: - http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html - http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html - -#{version} - -] - end - - def version - PROG_VER - end - - def examples - DATA.read.gsub('PMFetch', PROG_NAME) - end - - - private - - - ### options - - def set_options - @parser = GetoptLong.new - - @parser.set_options( - [ '--query', '-q', GetoptLong::REQUIRED_ARGUMENT ], - [ '--title', '-t', GetoptLong::REQUIRED_ARGUMENT ], - [ '--journal', '-j', GetoptLong::REQUIRED_ARGUMENT ], - [ '--volume', '-v', GetoptLong::REQUIRED_ARGUMENT ], - [ '--issue', '-i', GetoptLong::REQUIRED_ARGUMENT ], - [ '--page', '-p', GetoptLong::REQUIRED_ARGUMENT ], - [ '--author', '-a', GetoptLong::REQUIRED_ARGUMENT ], - [ '--mesh', '-m', 
GetoptLong::REQUIRED_ARGUMENT ], - [ '--format', '-f', GetoptLong::REQUIRED_ARGUMENT ], - [ '--pmidlist', GetoptLong::NO_ARGUMENT ], - [ '--retmax', '-n', GetoptLong::REQUIRED_ARGUMENT ], - [ '--retstart', '-N', GetoptLong::REQUIRED_ARGUMENT ], - [ '--sort', '-s', GetoptLong::REQUIRED_ARGUMENT ], - [ '--reldate', GetoptLong::REQUIRED_ARGUMENT ], - [ '--mindate', GetoptLong::REQUIRED_ARGUMENT ], - [ '--maxdate', GetoptLong::REQUIRED_ARGUMENT ], - [ '--examples', GetoptLong::NO_ARGUMENT ], - [ '--help', GetoptLong::NO_ARGUMENT ], - [ '--version', GetoptLong::NO_ARGUMENT ] - ) - end - - def parse_options - @parser.each_option do |optname, optarg| - case optname - when /--query/ - @query = optarg - when /--title/ - @query_opts << "#{optarg}[ti]" - when /--journal/ - @query_opts << "#{optarg}[ta]" - when /--volume/ - @query_opts << "#{optarg}[vi]" - when /--issue/ - @query_opts << "#{optarg}[ip]" - when /--page/ - @query_opts << "#{optarg}[pg]" - when /--author/ - @query_opts << "#{optarg}[au]" - when /--mesh/ - @query_opts << "#{optarg}[mh]" - when /--format/ - @format = optarg - when /--pmidlist/ - @pmid_list_only = true - when /--examples/ - raise PMFetch::Examples - when /--help/ - raise PMFetch::Usage - when /--version/ - raise PMFetch::Version - when /--sort/ - @sort = optarg - @search_opts["sort"] = @sort unless @sort == "page" - else - optname.delete!('-') - @search_opts[optname] = optarg - end - end - end - - - ### check query - - def check_query - p @query if $DEBUG - @query ||= ARGV.join(" ") unless ARGV.empty? - - p @query if $DEBUG - @query_str = [ @query, @query_opts ].flatten.compact.join(" AND ") - - p @query_str if $DEBUG - if @query_str.empty? 
- raise PMFetch::Usage - end - end - - - ### search - - def pm_esearch - return Bio::PubMed.esearch(@query_str, @search_opts) - end - - def pm_efetch(list) - entries = Bio::PubMed.efetch(list) - - if @format == 'medline' - medline_format(entries) - else - entries = parse_entries(entries) - if @sort == 'page' - entries = sort_entries(entries) - end - if @format == 'report' - report_format(entries) - else - other_format(entries) - end - end - end - - - ### output - - def medline_format(entries) - entries.each do |entry| - puts entry - puts '//' - end - end - - def parse_entries(entries) - entries.map { |entry| Bio::MEDLINE.new(entry) } - end - - def sort_entries(entries) - if RUBY_VERSION > "1.8.0" - entries.sort_by { |x| - [ x.journal, x.volume.to_i, x.issue.to_i, x.pages.to_i ] - } - else - entries.map { |x| - [ x.journal, x.volume.to_i, x.issue.to_i, x.pages.to_i, x ] - }.sort { |a, b| - a[0..3] <=> b[0..3] - }.map { |y| - y.pop - } - end - end - - def report_format(entries) - entries.each do |entry| - puts entry.reference.report - puts - end - end - - def other_format(entries) - entries.each do |entry| - puts entry.reference.format(@format) - puts - end - end - -end - - -PMFetch.new - - -__END__ - -= Examples : PubMed search - -These four lines will do the same job. - - % PMFetch transcription factor - % PMFetch "transcription factor" - % PMFetch --query "transcription factor" - % PMFetch -q "transcription factor" - - -Retrieve max 100 artiecles (20 is a NCBI's default) at a time, use --retmax as - - % PMFetch -q "transcription factor" --retmax 100 - -and, to retrieve next 100 articles, use --retstart as - - % PMFetch -q "transcription factor" --retmax 100 --retstart 100 - - -You can narrow the search target for an issue of the journal. - - % PMFetch --journal development --volume 131 --issue 3 transcription factor - - -Short options are also available. 
- - % PMFetch -j development -v 131 -i 3 transcription factor - - -Search articles indexed in PubMed within these 90 days. - - % PMFetch -q "transcription factor" --reldate 90 - - -Search articles indexed in PubMed during the period of 2001/04/01 to 2001/08/31 - - % PMFetch -q "transcription factor" --mindate 2001/04/01 --maxdate 2001/08/31 - - -Output format can be changed by --format option. - - % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f report - % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f rd - % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f endnote - % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f medline - % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f bibitem - % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f bibtex - % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f nature - % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f science - - -Generate title listings for the journal report meeting (don't forget -to inclease the number of --retmax for fetching all titles). - - % PMFetch -f report -j development -v 131 -i 3 -n 100 - - -Search by author name. - - % PMFetch -a "Karlin S" - % PMFetch -a "Koonin EV" - - -Search by MeSH term. - - % PMFetch -m "computational biology" - % PMFetch -m "SARS virus" - - -Search by PubMed ID (PMID). - - % PMFetch 12345 - - -Output PMID only. - - % PMFetch --pmidlist tardigrada - - diff --git a/lib/bio/db/go.rb b/lib/bio/db/go.rb index affbe66..62f78ba 100644 --- a/lib/bio/db/go.rb +++ b/lib/bio/db/go.rb @@ -1,8 +1,9 @@ # # = bio/db/go.rb - Classes for Gene Ontology # -# Copyright:: Copyright (C) 2003 +# Copyright:: Copyright (C) 2003, 2010 # Mitsuteru C. Nakao +# R. Stephan # License:: The Ruby License # # $Id:$ @@ -174,8 +175,8 @@ class GO # = Bio::GO::GeneAssociation # $CVSROOT/go/gene-associations/gene_association.* # - # Data parser for the gene_association go annotation. 
- # See also the file format http://www.geneontology.org/doc/GO.annotation.html#file + # Data parser for the gene_association go annotation 1.0. + # See also the file format http://www.geneontology.org/GO.format.gaf-1_0.shtml # # == Example # @@ -324,6 +325,20 @@ class GO end # class GeneAssociation + # = Bio::GO::GeneAssociation2 + # + # Data parser for the gene_association go annotation 2.0. + # See also the file format http://www.geneontology.org/GO.format.gaf-2_0.shtml + # + # == Example + # + # mgi_data = File.open('gene_association.mgi').read + # mgi = Bio::GO::GeneAssociation2.parser(mgi_data) + # + # Bio::GO::GeneAssociation.parser(mgi_data) do |entry| + # p [entry.entry_id, entry.evidence, entry.goid] + # end + # class GeneAssociation2 < GeneAssociation # Iterator through all entries -- 1.5.5 >From c6729520a9faf985975fb7f5b93128cdbe31b0e8 Mon Sep 17 00:00:00 2001 From: R. Stephan Date: Tue, 3 Aug 2010 08:47:31 +0200 Subject: [PATCH] Add Phenote GOA file format parsing, GAF1 output --- lib/bio/db/go.rb | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 files changed, 60 insertions(+), 1 deletions(-) diff --git a/lib/bio/db/go.rb b/lib/bio/db/go.rb index 62f78ba..b265c7e 100644 --- a/lib/bio/db/go.rb +++ b/lib/bio/db/go.rb @@ -358,12 +358,71 @@ class GO end end - # Bio::GO::GeneAssociation#to_str -> a line of gene_association file. + # Bio::GO::GeneAssociation2#to_str -> a line of gene_association file. def to_str return [super.to_str, @annotation_extension, @gene_product_form_id].join("\t") end end + # = Bio::GO::Phenote_GOA + # + # Data parser for the Phenote file format which is similar to GAF1. + # We serialize to GAF1 format (to_str). 
+ # See http://www.phenote.org + # See also the file format http://www.geneontology.org/GO.format.gaf-1_0.shtml + # + # == Example + # + # mgi_data = File.open('gene_association.mgi').read + # mgi = Bio::GO::Phenote_GOA.parser(mgi_data) + # + # Bio::GO::Phenote_GOA.parser(mgi_data) do |entry| + # p.to_str + # end + + class Phenote_GOA < GeneAssociation + + # Retruns an Array of parsed Phenote file. + # Block is acceptable. + def self.parser(str) + if block_given? + str.each_line(DELIMITER) {|line| + next if /^DB\t/ =~ line + yield Phenote_GOA.new(line) + } + else + galist = [] + str.each_line(DELIMITER) {|line| + next if /^DB\t/ =~ line + galist << Phenote_GOA.new(line) + } + return galist + end + end + + # Assign fields of an entry (in a line) in Phenote format. + def assign(tmp) + @db = tmp[0] + @db_object_id = tmp[1] + @db_object_symbol = tmp[2] + @qualifier = tmp[3] # + @goid = tmp[4] + # We ignore Phenote's tmp[5] + @db_reference = ArrayOrString.new(tmp[6].split(/\|/)) # + @evidence = tmp[7] + @with = ArrayOrString.new(tmp[8].split(/\|/)) # + @aspect = tmp[9] + @db_object_name = tmp[10] # + @db_object_synonym = ArrayOrString.new(tmp[11].split(/\|/)) # + @db_object_type = tmp[12] + @taxon = tmp[13] # taxon:4932 + @date = tmp[14] # 20010118 + @assigned_by = tmp[15] + # We ignore Phenote's tmp[16-18] + end + end + + # # = Container class for files in geneontology.org/go/external2go/*2go. 
# # The line syntax is: -- 1.5.5 Ralf Stephan http://www.ark.in-berlin.de pub 1024D/C5114CB2 2009-06-07 [expires: 2011-06-06] Key fingerprint = 76AE 0D21 C06C CBF9 24F8 7835 1809 DE97 C511 4CB2 From ralf at ark.in-berlin.de Tue Aug 3 03:13:31 2010 From: ralf at ark.in-berlin.de (Ralf Stephan) Date: Tue, 3 Aug 2010 09:13:31 +0200 Subject: [BioRuby] [PATCH] GO annotations fixes and improvements In-Reply-To: <40095810-63F1-4919-8946-7CA3802B67B1@ark.in-berlin.de> References: <40095810-63F1-4919-8946-7CA3802B67B1@ark.in-berlin.de> Message-ID: <2CA96C77-E1BC-4958-AA2A-9CC97F346917@ark.in-berlin.de> Please ignore changes to bin/* in patch 0003 Sorry, ralf From ngoto at gen-info.osaka-u.ac.jp Tue Aug 3 12:13:27 2010 From: ngoto at gen-info.osaka-u.ac.jp (Naohisa GOTO) Date: Wed, 4 Aug 2010 01:13:27 +0900 Subject: [BioRuby] [PATCH] GO annotations fixes and improvements In-Reply-To: <40095810-63F1-4919-8946-7CA3802B67B1@ark.in-berlin.de> References: <40095810-63F1-4919-8946-7CA3802B67B1@ark.in-berlin.de> Message-ID: <20100803161327.BF9DE1CBC410@idnmail.gen-info.osaka-u.ac.jp> Hi Ralf, Thank you to send patches. I reviewed the patch. Please see the comments below. Some part of the patches will be merged soon, and some would be later, and some will not be merged. On Tue, 3 Aug 2010 08:58:16 +0200 Ralf Stephan wrote: > --- a/lib/bio/db/go.rb > +++ b/lib/bio/db/go.rb > @@ -186,6 +186,18 @@ class GO > # p [entry.entry_id, entry.evidence, entry.goid] > # end > # > + class ArrayOrString > + def initialize(arg) > + @var = arg > + end > + def join(char) > + if @var.instance_of? String > + then return @var > + else return @var.join(char) > + end > + end > + end I disagree with the class. For GAF, there is no need to introduce such new wrapper class. > @@ -253,30 +265,34 @@ class GO > > # > attr_reader :assigned_by > - > + > alias entry_id db_object_id > > > - # Parsing an entry (in a line) in the gene_association flatfile. 
> - def initialize(entry) > - tmp = entry.chomp.split(/\t/) > + # Assign fields of an entry (in a line). > + def assign(tmp) I don't like the method name. The word "assign" is used in the context of Gene Ontology Annotation, and it is better not to use the word for class-internal use to avoid confusion. > @@ -293,17 +309,15 @@ class GO > > # Bio::GO::GeneAssociation#to_str -> a line of gene_association file. > def to_str > - return [@db, @db_object_id, @db_object_symbol, @quialifier, @goid, > - @qualifier.join("|"), @evidence, @with.join("|"), @aspect, > + return [@db, @db_object_id, @db_object_symbol, @qualifier, @goid, > + @db_reference.join("|"), @evidence, @with.join("|"), @aspect, > @db_object_name, @db_object_synonym.join("|"), @db_object_type, > @taxon, @date, @assigned_by].join("\t") > end This seems to be a bug fix. Thanks! By the way, I think it is good to change to_str to to_s, because the GeneAssociation class does not need to behave like a string. > --- a/lib/bio/db/go.rb > +++ b/lib/bio/db/go.rb > @@ -266,6 +266,11 @@ class GO > # > attr_reader :assigned_by > > + attr_reader :annotation_extension > + > + attr_reader :gene_product_form_id > + > + If you want to add a GeneAssociation2 class, these new attributes should only be added in the GeneAssociation2 class. Alternatively, it is also good to support both GAF 1.0 and 2.0 in the GeneAssociation class. > alias entry_id db_object_id > > > @@ -286,6 +291,8 @@ class GO > @taxon = tmp[12] # taxon:4932 > @date = tmp[13] # 20010118 > @assigned_by = tmp[14] > + @annotation_extension = tmp[15] > + @gene_product_form_id = tmp[16] > end > > # Parsing an entry (in a line) in the gene_association flatfile. > @@ -317,6 +324,31 @@ class GO > > end # class GeneAssociation > > + class GeneAssociation2 < GeneAssociation > + > + # Iterator through all entries > + def self.parser(str) > + if block_given?
> + str.each_line(DELIMITER) {|line| > + next if /^!/ =~ line > + yield GeneAssociation2.new(line) > + } > + else > + galist = [] > + str.each_line(DELIMITER) {|line| > + next if /^!/ =~ line > + galist << GeneAssociation2.new(line) > + } > + return galist > + end > + end > + > + # Bio::GO::GeneAssociation#to_str -> a line of gene_association file. > + def to_str > + return [super.to_str, @annotation_extension, @gene_product_form_id].join("\t") > + end > + end > + The role of the GeneAssociation2 class will be carefully considered. It might be merged to the GeneAssociation class. The method name "parser" may be changed, or the method might not be merged. > + class Phenote_GOA < GeneAssociation The name of the class would be changed, based on the format name used in the Phenote community. > + # Assign fields of an entry (in a line) in Phenote format. > + def assign(tmp) > + @db = tmp[0] > + @db_object_id = tmp[1] > + @db_object_symbol = tmp[2] > + @qualifier = tmp[3] # > + @goid = tmp[4] > + # We ignore Phenote's tmp[5] Please do not ignore. When supporting a new data format, all data should be parsed and stored unless it is technically very difficult. Naohisa Goto ngoto at gen-info.osaka-u.ac.jp / ng at bioruby.org From jimktrainslists at gmail.com Wed Aug 4 15:32:02 2010 From: jimktrainslists at gmail.com (James Keener) Date: Wed, 4 Aug 2010 15:32:02 -0400 Subject: [BioRuby] Consensus sequence In-Reply-To: References: Message-ID: At alignment.rb:118 there is this function: # Returns consensus character of the site. # If consensus is found, eturns a single-letter string. # If not, returns nil. def consensus_string(threshold = 1.0) return nil if self.size <= 0 return self[0] if self.sort.uniq.size == 1 h = Hash.new(0) self.each { |x| h[x] += 1 } total = self.size b = h.to_a.sort do |x,y| z = (y[1] <=> x[1]) z = (self.index(x[0]) <=> self.index(y[0])) if z == 0 z end if total * threshold <= b[0][1] then b[0][0] else nil end end Now, I have 2 questions about it. 
1) Why is it sorting? Shouldn't it use a linear search? 2) How can the count of the greatest residue (b[0][1]) be larger than or equal to the total number of residues? Also, there is a whole set of functions I am adding (group entropy and some book keeping/housecleaning things) and would like to commit them back. What is the best way to commit them back? Jim From tomoakin at kenroku.kanazawa-u.ac.jp Wed Aug 4 20:28:04 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Thu, 5 Aug 2010 09:28:04 +0900 Subject: [BioRuby] Consensus sequence In-Reply-To: References: Message-ID: Hi, > 2) How can the count of the greatest residue (b[0][1]) be larger > than or equal to the total number of residues? It is obvious that the count of the greatest residue is equal to the total number of residues if all the residues are identical. Presumably, the parameter threshold should be 0 <= threshold <= 1.0 > 1) Why is it sorting? Shouldn't it use a linear search? I really don't know. So this is just my feeling, but this could be simplicity, readability, and extensibility... Did you compare the performance with linear search? -- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan From ngoto at gen-info.osaka-u.ac.jp Thu Aug 5 02:16:41 2010 From: ngoto at gen-info.osaka-u.ac.jp (Naohisa GOTO) Date: Thu, 5 Aug 2010 15:16:41 +0900 Subject: [BioRuby] Consensus sequence In-Reply-To: References: Message-ID: <20100805061641.B9C2D1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> Hi, On Thu, 5 Aug 2010 09:28:04 +0900 Tomoaki NISHIYAMA wrote: > Hi, > > > 2) How can the count of the greatest residue (b[0][1]) be larger > > than or equal to the total number of residues? > > > It is obvious that the count of the greatest residue is equal to the > total number of residues > if all the residues are identical. 
> > Presumably, the parameter threshold should be > 0 <= threshold <= 1.0 In addition, the bahavior is undefined when the threshold is out of the range. > > 1) Why is it sorting? Shouldn't it use a linear search? I forget what I was thinking when I wrote it in 2003. > I really don't know. So this is just my feeling, > but this could be simplicity, readability, and extensibility... > Did you compare the performance with linear search? For simplicity and readability, using Enumerable#max (Array#max) seems to be the straightforward way, though I don't know much about the performance. On Wed, 4 Aug 2010 15:32:02 -0400 James Keener wrote: > Also, there is a whole set of functions I am adding (group entropy and some book keeping/housecleaning things) and would like to commit them back. What is the best way to commit them back? Please create your fork on GitHub and push them to the GitHub fork. It seems http://github.com/fredrikj/bioruby also make some modifications to the alignment classes (in lib/bio/appl/seala.rb in the repository). -- Naohisa Goto ngoto at gen-info.osaka-u.ac.jp / ng at bioruby.org From sararayburn at gmail.com Thu Aug 5 13:33:14 2010 From: sararayburn at gmail.com (Sara Rayburn) Date: Thu, 5 Aug 2010 12:33:14 -0500 Subject: [BioRuby] GSoC project update Message-ID: Hi all, This week I've implemented the generalized version of the speciation/duplication inference algorithm. This extension allows the algorithm to be used on trees with non-binary nodes, however it is based on the unverified implementation found in the java Forester package. My remaining goals for the rest of the project time are 1) preparing the code for merging with the main bioruby repository. 2) trying to construct an informal proof of correctness of the generalized sdi algorithm. 
Thanks, Sara Rayburn From missy at be.to Thu Aug 5 22:53:15 2010 From: missy at be.to (MISHIMA, Hiroyuki) Date: Fri, 06 Aug 2010 11:53:15 +0900 Subject: [BioRuby] Indexing fasta file with Ruby 1.9.1 In-Reply-To: <30c617b1-dcc4-42b9-9ffe-498fc663708b@ingm.it> References: <30c617b1-dcc4-42b9-9ffe-498fc663708b@ingm.it> Message-ID: <4C5B791B.6070307@be.to> Hi Raoul and all, Raoul Bonnal wrote (2010/07/30 20:28): > Caught error: # "bta-miR-3596":String> in "mature.fa" position 1178667 The following patch seems to work... --- ./indexer-orig.rb 2010-08-06 11:40:52.000000000 +0900 +++ /usr/local/lib/ruby/gems/1.9.1/gems/bio-1.4.0/lib/bio/io/flatfile/indexer.rb 2010-08-06 11:38:53.000000000 +0900 @@ -155,8 +155,15 @@ def parse_secondary self.secondary.each do |x| p = x.proc.call(@entry) - p.each do |y| - yield x.name, y if y.length > 0 + + if p.respond_to? :each + p.each do |y| + yield x.name, y if y.length > 0 + end + else + p.each_line do |y| + yield x.name, y if y.length > 0 + end end end end This is typical incompatibility between Ruby-1.8 and -1.9. In Ruby-1.9, String#each should be replaced by String#each_line. irb-1.8> "abc\ndef".each {|l| p l} "abc\n" "def" => "abc\ndef" irb-1.9> "abc\ndef".each {|l| p l} NoMethodError: undefined method `each' for "abc\ndef":String from (irb):1 from /usr/local/bin/irb-1.9:12:in `
' irb-1.9> "abc\ndef".each_line {|l| p l} "abc\n" "def" => "abc\ndef" Because the "p" variable can be String or Array in the "parse_secondary" method, I used "respond_to?". I do not know whether this instant patch is the right way or not. Sincerely yours, Hiro -- MISHIMA, Hiroyuki, DDS, Ph.D. COE Research Fellow Department of Human Genetics Nagasaki University Graduate School of Biomedical Sciences From anurag08priyam at gmail.com Sun Aug 8 11:32:57 2010 From: anurag08priyam at gmail.com (Anurag Priyam) Date: Sun, 8 Aug 2010 21:02:57 +0530 Subject: [BioRuby] development updates? Message-ID: Today I had a chat with Jan when I found that a manuscript on BioRuby is under progress. I do not remember it being discussed on the list. Perhaps BioRuby's development process needs to be a little more transparent, in the sense that everybody should be kept in the loop for a more synergic development process. Also, the chance of new contributors chiming in increases with a more active( updated ) list. I would suggest that the list be constantly updated : 1. short and long term goals - targets for minor and major releases, prioritizing bugs or feature requests or design decisions. 2. what is cooking - each developer could update the list on what he/she is working at, or/and a fortnightly or a monthly update on the cummulative development status( how much of the target has been achieved and stuff ) 3. important decisions and changes. Perhaps, a development specific list can be setup to keep the user and the developer space segregated. A development list can also be attached to the issue tracker so that developers are automatically updated on new bugs and feature requests. Or, a blog can be setup where regular commiters have posting access. P.S : The idea behind this mail is to spark some discussion on a more efficient software development culture and hopefully adopt it :). -- Anurag Priyam, 3rd Year Undergraduate, Department of Mechanical Engineering, IIT Kharagpur.
+91-9775550642 From anurag08priyam at gmail.com Sun Aug 8 12:09:56 2010 From: anurag08priyam at gmail.com (Anurag Priyam) Date: Sun, 8 Aug 2010 21:39:56 +0530 Subject: [BioRuby] ohloh Message-ID: I just added http://github.com/bioruby/bioruby.git master as an enlistment to Ohloh's BioRuby page. -- Anurag Priyam, 3rd Year Undergraduate, Department of Mechanical Engineering, IIT Kharagpur. +91-9775550642 From anurag08priyam at gmail.com Mon Aug 9 01:31:13 2010 From: anurag08priyam at gmail.com (Anurag Priyam) Date: Mon, 9 Aug 2010 11:01:13 +0530 Subject: [BioRuby] development updates? In-Reply-To: <20100808222310.GA15100@thebird.nl> References: <20100808222310.GA15100@thebird.nl> Message-ID: > > What does the fact that *you* are not aware of a manuscript have to do > with the development process? I miss the connection. > I just thought it would be better if *all* the developers( important or less important ) are updated on what is happening. I see it happening on the few other lists I have been subscribed to all the time. > As BioRuby is an OSS project, feel free to take the lead. You > understand OSS development, right? > Well to be very frank this has been my very first attempt at OSS contribution. So, may be I really do not understand it very well. > Talk is cheap. Making stuff happen - that is what counts. > FYI, that is exactly what I am trying to do. I see very different culture in other development teams and I find it better. I was trying to bring some of it in. If it is unwelcome, I have no issues. Out of the things I suggested, there is only one thing that I can do in my capacity : update the list on what I have been doing, and I do that. -- Anurag Priyam, 3rd Year Undergraduate, Department of Mechanical Engineering, IIT Kharagpur. +91-9775550642 From pjotr.public14 at thebird.nl Mon Aug 9 02:43:32 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Mon, 9 Aug 2010 08:43:32 +0200 Subject: [BioRuby] development updates?
In-Reply-To: <20100809060025.GA17390@thebird.nl> References: <20100808222310.GA15100@thebird.nl> <20100809060025.GA17390@thebird.nl> Message-ID: <20100809064332.GA19790@thebird.nl> Sorry, that was a bit rabid. Basically you think we should be organised differently, and/or share information differently. On the mailing list you'll find a history of that. Every OSS project is different. There is no single way. If you want it differently, take initiative - rather than talking about 'culture' in other projects. BioRuby has grown this way, and it works for us. As I see it, all major developments have been discussed via the list. The latest developments are GSoC, plugins and BioLib BAM/SAM support (which is on the biolib mailing list). Again, I don't mind you want it different. But the way to change things is to act. With actions gain you respect. And most respect is gained by writing code. That is true in every OSS project I know. Everyone can criticise, few really change things. Pj. On Mon, Aug 09, 2010 at 08:00:25AM +0200, Pjotr Prins wrote: > First you steal our announcement of a paper - I am sure Jan did not > intend you to blurt that out on the list. Second you breach my trust > by quoting a private message on the list. Third, you assume our > development process is not transparent. Based on what? > > Maybe there is a simple answer: we don't need to communicate that > much. > > BioRuby is not a centrally run project, business or organisation. > BioRuby is OSS. Feel free to run with the project. Fork, code, > document, organise, whatever. > > So far, I see talk and hand waving. If you want to organise something, > be factual and concrete. Actions speak louder than words. You also may > want to read the history of the mailing list. > > I suggest you earn your stripes with getting your code accepted to > BioRuby, first. That is a pretty steep hill anyway. Last time I > checked your code. > > And I suggest at least two apologies, if you want further responses > from me. 
> > Pj > > On Mon, Aug 09, 2010 at 11:01:13AM +0530, Anurag Priyam wrote: > > > > > > What does the fact that *you* are not aware of a manuscript have to do > > > with the development process? I miss the connection. > > > > > > > I just thought it would be better if *all* the developers( important > > or less important ) are updated on what is happening. I see it > > happening on the few other lists I have been subscribed to all the > > time. > > > > > As BioRuby is an OSS project, feel free to take the lead. You > > > understand OSS development, right? > > > > > > > Well to be very frank this has been my very first attempt at OSS > > contribution. So, may be I really do not understand it very well. > > > > > Talk is cheap. Making stuff happen - that is what counts. > > > > > > > FYI, that is exactly what I am trying to do. I see very different > > culture in other development teams and I find it better. I was trying > > to bring some of it in. If it is unwelcome, I have no issues. > > > > Out of the things I suggested, there is only one thing that I can do > > in my capacity : update the list on what I have been doing, and I do > > that. > > > > -- > > Anurag Priyam, > > 3rd Year Undergraduate, > > Department of Mechanical Engineering, > > IIT Kharagpur. > > +91-9775550642 From andrew.j.grimm at gmail.com Mon Aug 9 03:17:17 2010 From: andrew.j.grimm at gmail.com (Andrew Grimm) Date: Mon, 9 Aug 2010 17:17:17 +1000 Subject: [BioRuby] development updates? In-Reply-To: References: <20100808222310.GA15100@thebird.nl> <20100809060025.GA17390@thebird.nl> <20100809064332.GA19790@thebird.nl> Message-ID: While there may be a difference between making suggestions and doing stuff, there's also a difference between making suggestions and personally criticising people. I think that personal criticisms should be avoided when possible. 
I am no longer surprised when Rails people behave in an incivil way, but I hoped that non-Rails ruby people would still follow "Matz is nice, so we are nice". Andrew Grimm On Mon, Aug 9, 2010 at 4:43 PM, Pjotr Prins wrote: > Sorry, that was a bit rabid. Basically you think we should be > organised differently, and/or share information differently. On the > mailing list you'll find a history of that. > > Every OSS project is different. There is no single way. If you want > it differently, take initiative - rather than talking about 'culture' > in other projects. > > BioRuby has grown this way, and it works for us. As I see it, all > major developments have been discussed via the list. The latest > developments are GSoC, plugins and BioLib BAM/SAM support (which is > on the biolib mailing list). > > Again, I don't mind you want it different. But the way to change > things is to act. With actions gain you respect. And most respect is > gained by writing code. That is true in every OSS project I know. > > Everyone can criticise, few really change things. > > Pj. > > On Mon, Aug 09, 2010 at 08:00:25AM +0200, Pjotr Prins wrote: >> First you steal our announcement of a paper - I am sure Jan did not >> intend you to blurt that out on the list. Second you breach my trust >> by quoting a private message on the list. Third, you assume our >> development process is not transparent. Based on what? >> >> Maybe there is a simple answer: we don't need to communicate that >> much. >> >> BioRuby is not a centrally run project, business or organisation. >> BioRuby is OSS. Feel free to run with the project. Fork, code, >> document, organise, whatever. >> >> So far, I see talk and hand waving. If you want to organise something, >> be factual and concrete. Actions speak louder than words. You also may >> want to read the history of the mailing list. >> >> I suggest you earn your stripes with getting your code accepted to >> BioRuby, first. That is a pretty steep hill anyway. 
Last time I >> checked your code. >> >> And I suggest at least two apologies, if you want further responses >> from me. >> >> Pj >> >> On Mon, Aug 09, 2010 at 11:01:13AM +0530, Anurag Priyam wrote: >> > > >> > > What does the fact that *you* are not aware of a manuscript have to do >> > > with the development process? I miss the connection. >> > > >> > >> > I just thought it would be better if *all* the developers( important >> > or less important ) are updated on what is happening. I see it >> > happening on the few other lists I have been subscribed to all the >> > time. >> > >> > > As BioRuby is an OSS project, feel free to take the lead. You >> > > understand OSS development, right? >> > > >> > >> > Well to be very frank this has been my very first attempt at OSS >> > contribution. So, may be I really do not understand it very well. >> > >> > > Talk is cheap. Making stuff happen - that is what counts. >> > > >> > >> > FYI, that is exactly what I am trying to do. I see very different >> > culture in other development teams and I find it better. I was trying >> > to bring some of it in. If it is unwelcome, I have no issues. >> > >> > Out of the things I suggested, there is only one thing that I can do >> > in my capacity : update the list on what I have been doing, and I do >> > that. >> > >> > -- >> > Anurag Priyam, >> > 3rd Year Undergraduate, >> > Department of Mechanical Engineering, >> > IIT Kharagpur. >> > +91-9775550642 > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby > From anurag08priyam at gmail.com Mon Aug 9 03:49:19 2010 From: anurag08priyam at gmail.com (Anurag Priyam) Date: Mon, 9 Aug 2010 13:19:19 +0530 Subject: [BioRuby] development updates? 
In-Reply-To: <20100809060025.GA17390@thebird.nl> References: <20100808222310.GA15100@thebird.nl> <20100809060025.GA17390@thebird.nl> Message-ID: > First you steal our announcement of a paper - I am sure Jan did not > intend you to blurt that out on the list. Oops. Extremely sorry about that. > Second you breach my trust > by quoting a private message on the list. I had intended my mail to be *open* so it perfectly seemed apt to me that others know what you think about it. I am extremely sorry that you took it otherwise. I did not mean any offense. > Third, you assume our > development process is not transparent. Based on what? As I said, I like things to be more open. Again, I was just suggesting that doing something *might* be better. I am not forcing anyone to adopt it, not that I can. > > Maybe there is a simple answer: we don't need to communicate that > much. Maybe. > > So far, I see talk and hand waving. If you want to organise something, > be factual and concrete. Actions speak louder than words. You also may > want to read the history of the mailing list. > > I suggest you earn your stripes with getting your code accepted to > BioRuby, first. That is a pretty steep hill anyway. Last time I > checked your code. > Well, I am definitely embarrassed that you think so. I can only hope that it changes as I contribute more. Last time you checked my code it definitely worked. I see no wrong in first writing something that works and then make it more correct or elegant. -- Anurag Priyam, 3rd Year Undergraduate, Department of Mechanical Engineering, IIT Kharagpur. +91-9775550642 From pjotr.public14 at thebird.nl Mon Aug 9 03:50:05 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Mon, 9 Aug 2010 09:50:05 +0200 Subject: [BioRuby] development updates? 
In-Reply-To: References: <20100808222310.GA15100@thebird.nl> <20100809060025.GA17390@thebird.nl> <20100809064332.GA19790@thebird.nl> Message-ID: <20100809075005.GA21017@thebird.nl> On Mon, Aug 09, 2010 at 05:17:17PM +1000, Andrew Grimm wrote: > While there may be a difference between making suggestions and doing > stuff, there's also a difference between making suggestions and > personally criticising people. I think that personal criticisms should > be avoided when possible. I did not sent a personal criticism reply to the list originally. I sent Anurag a private response. *He* quoted it on the mailing list. I am allowed to be critical on that. Anyway, no worries. Chapter closed, as far as I am concerned. Anurag can reply privately, if he wants. Pj. From pjotr.public14 at thebird.nl Mon Aug 9 03:53:52 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Mon, 9 Aug 2010 09:53:52 +0200 Subject: [BioRuby] development updates? In-Reply-To: References: <20100808222310.GA15100@thebird.nl> <20100809060025.GA17390@thebird.nl> Message-ID: <20100809075352.GC21017@thebird.nl> Apologies accepted. Pj. On Mon, Aug 09, 2010 at 01:19:19PM +0530, Anurag Priyam wrote: > > First you steal our announcement of a paper - I am sure Jan did not > > intend you to blurt that out on the list. > > Oops. Extremely sorry about that. > > > Second you breach my trust > > by quoting a private message on the list. > > I had intended my mail to be *open* so it perfectly seemed apt to me > that others know what you think about it. I am extremely sorry that > you took it otherwise. I did not mean any offense. > > > Third, you assume our > > development process is not transparent. Based on what? > > As I said, I like things to be more open. Again, I was just > suggesting that doing something *might* be better. I am not forcing > anyone to adopt it, not that I can. > > > > > Maybe there is a simple answer: we don't need to communicate that > > much. > > Maybe. 
> > > > > So far, I see talk and hand waving. If you want to organise something, > > be factual and concrete. Actions speak louder than words. You also may > > want to read the history of the mailing list. > > > > I suggest you earn your stripes with getting your code accepted to > > BioRuby, first. That is a pretty steep hill anyway. Last time I > > checked your code. > > > > Well, I am definitely embarrassed that you think so. I can only hope > that it changes as I contribute more. Last time you checked my code it > definitely worked. I see no wrong in first writing something that > works and then make it more correct or elegant. > > -- > Anurag Priyam, > 3rd Year Undergraduate, > Department of Mechanical Engineering, > IIT Kharagpur. > +91-9775550642 From anurag08priyam at gmail.com Mon Aug 9 03:57:34 2010 From: anurag08priyam at gmail.com (Anurag Priyam) Date: Mon, 9 Aug 2010 13:27:34 +0530 Subject: [BioRuby] development updates? In-Reply-To: <20100809064332.GA19790@thebird.nl> References: <20100808222310.GA15100@thebird.nl> <20100809060025.GA17390@thebird.nl> <20100809064332.GA19790@thebird.nl> Message-ID: > Sorry, that was a bit rabid. No issues :). -- Anurag Priyam, 3rd Year Undergraduate, Department of Mechanical Engineering, IIT Kharagpur. +91-9775550642 From andrew.j.grimm at gmail.com Mon Aug 9 07:05:32 2010 From: andrew.j.grimm at gmail.com (Andrew Grimm) Date: Mon, 9 Aug 2010 21:05:32 +1000 Subject: [BioRuby] development updates? In-Reply-To: <20100809075005.GA21017@thebird.nl> References: <20100808222310.GA15100@thebird.nl> <20100809060025.GA17390@thebird.nl> <20100809064332.GA19790@thebird.nl> <20100809075005.GA21017@thebird.nl> Message-ID: No worries, mate! ;) Andrew On Mon, Aug 9, 2010 at 5:50 PM, Pjotr Prins wrote: > On Mon, Aug 09, 2010 at 05:17:17PM +1000, Andrew Grimm wrote: >> While there may be a difference between making suggestions and doing >> stuff, there's also a difference between making suggestions and >> personally criticising people. 
I think that personal criticisms should >> be avoided when possible. > > I did not sent a personal criticism reply to the list originally. I > sent Anurag a private response. *He* quoted it on the mailing list. I > am allowed to be critical on that. > > Anyway, no worries. Chapter closed, as far as I am concerned. Anurag > can reply privately, if he wants. > > Pj. > From rutgeraldo at gmail.com Mon Aug 9 09:45:26 2010 From: rutgeraldo at gmail.com (Rutger Vos) Date: Mon, 9 Aug 2010 21:45:26 +0800 Subject: [BioRuby] development updates? In-Reply-To: References: Message-ID: I certainly appreciate Pjotr's point that whoever writes the code calls the shots - and since I have written zero code my opinion carries no weight. That said, now that this thread was started there is probably no harm in trying to address the points in the original post, if only for posterity's sake. I would start by saying that I commend Anurag for his (youthful, if he doesn't mind) enthusiasm. It seems to me that the points he raises are already being addressed in a way that evidently suits the bioruby community: > I would suggest that the list be constantly updated : > 1. short and long term goals - targets for minor and major releases, > prioritizing bugs or feature requests or design decisions. > There is currently on the bioruby homepage a link to a bug tracker and a feature request tracker. There is also one on the github page. Adding another technological fix (some tracker tool) will do nothing but fragment things. As far as design decisions are concerned, none of the OBF(-like) projects I follow are really designed by committee, so in general there are no formal decisions that need to be communicated to lower echelons. > 2.
what is cooking - each developer could update the list on what he/she is > working at, or/and a fortnightly or a monthly update on the cummulative > development status( how much of the target has been achieved and stuff ) > On the bioruby homepage are links to a number of blogs by people who very generously take the time to record what they do so that others can learn from that and use it. That is probably about as good as it's gonna get given the time constraints that researcher/programmers are under. > 3. important decisions and changes. > I doubt that this is how things work. People use bioruby to get their work done, and they add things if they are deemed useful. This isn't the kind of open source project where the user base vastly outnumbers the developer community (e.g. apache, firefox, and so on) so that there would need to be impressive "milestones" and cool sounding code names sent through formal lines of communication. > Perhaps, a development specific list can be setup to keep the user and the > developer space segregated. A development list can also be attached to the > issue tracker so that developers are automatically updated on new bugs and > feature requests. > I have been subscribed to a number of -guts at example.org mailing lists to which bugs and commits are automatically piped. They have been of dubious value - I filter the messages automatically from my inbox to some folder which I then never visit, it turns out. In any case, to say that a list can be setup is to say that a real person should spend time setting up a list. Maybe that is not necessary given that source code repositories and bug trackers have RSS feeds. > Or, a blog can be setup where regular commiters have posting access. > Again, this is something that a real person would need to do. 
To the extent that I know the people in the bioruby core (I only "know" a couple) I know that they are all people with heavy academic work loads - perhaps even (heaven forbid) with departmental duties on top of that. They do what they can, I assume they already pretty much exhaust their copious amounts of spare time as it is :) > P.S : The idea behind this mail is to spark some discussion on a more > efficient software development culture and hopefully adopt it :). > I don't think cultures are ever adopted unless there is very, very forceful management that imposes it (which of course there isn't for OBF projects) or unless it grows around certain key people with long term involvement in a project. But this is probably just a roundabout way of repeating Pjotr's point that the way to change things is to act. Rutger -- Dr. Rutger A. Vos School of Biological Sciences Philip Lyle Building, Level 4 University of Reading Reading RG6 6BX United Kingdom Tel: +44 (0) 118 378 7535 http://www.nexml.org http://rutgervos.blogspot.com From cjfields at illinois.edu Mon Aug 9 11:59:15 2010 From: cjfields at illinois.edu (Chris Fields) Date: Mon, 9 Aug 2010 10:59:15 -0500 Subject: [BioRuby] development updates? In-Reply-To: References: Message-ID: Just want to add my 2c, from the bioperl perspective. The vast majority of bioperl devs have other jobs or obligations, so when they develop for whatever Bio* it tends to be to scratch a particular itch (fulfill something they need), and not much beyond that. With BioPerl the tendency is that whoever codes first wins; talking about design leads to bikeshedding and tends to go nowhere unless you have something to point to (just drop in on the threads on the perl6 mail list sometime for many examples of this). 
Yes, sometimes we have conflicts of opinion and get into spats, sometimes new code clobbers tests (git/github helps here), and sometime we even get a pretty decent design out of it (sometimes not :), but regardless every time we have code to work with or something to point to for our efforts. It's good to discuss things, but you have to produce something of value at the end of the day. So, just to reiterate what Rutger and Pjotr are saying, actions speak volumes. Take up the reins on something, get involved, and actually do something that benefits the project you are interested in. chris On Aug 9, 2010, at 8:45 AM, Rutger Vos wrote: > I certainly appreciate Pjotr's point that whoever writes the code calls the > shots - and since I have written zero code my opinion carries no weight. > That said, now that this thread was started there is probably no harm in > trying to address the points in the original post, if only for posterity's > sake. I would start by saying that I commend Anurag for his (youthful, if he > doesn't mind) enthusiasm. It seems to me that the points he raises are > already being addressed in a way that evidently suits the bioruby community: > > >> I would suggest that the list be constantly updated : >> 1. short and long term goals - targets for minor and major releases, >> prioritizing bugs or feature requests or design decisions. >> > > There is currently on the bioruby homepage a link to a bug tracker and a > feature request tracker. There is also one on the github page. Adding > another technological fix (some tracker tool) will do nothing but fragment > things. > > As far as design decisions are concerned, none of the OBF(-like) projects I > follow are really designed by committee, so in general there are no formal > decisions that need to be communicated to lower echelons. > > >> 2. 
what is cooking - each developer could update the list on what he/she is >> working at, or/and a fortnightly or a monthly update on the cummulative >> development status( how much of the target has been achieved and stuff ) >> > > On the bioruby homepage are links to a number of blogs by people who very > generously take the time to record what they do so that others can learn > from that and use it. That is probably about as good as it's gonna get given > the time constraints that researcher/programmers are under. > > >> 3. important decisions and changes. >> > > I doubt that this is how things work. People use bioruby to get their work > done, and they add things if they are deemed useful. This isn't the kind of > open source project where the user base vastly outnumbers the developer > community (e.g. apache, firefox, and so on) so that there would need to be > impressive "milestones" and cool sounding code names sent through formal > lines of communication. > > >> Perhaps, a development specific list can be setup to keep the user and the >> developer space segregated. A development list can also be attached to the >> issue tracker so that developers are automatically updated on new bugs and >> feature requests. >> > > I have been subscribed to a number of -guts at example.org mailing > lists to which bugs and commits are automatically piped. They have been of > dubious value - I filter the messages automatically from my inbox to some > folder which I then never visit, it turns out. In any case, to say that a > list can be setup is to say that a real person should spend time setting up > a list. Maybe that is not necessary given that source code repositories and > bug trackers have RSS feeds. > > >> Or, a blog can be setup where regular commiters have posting access. >> > > Again, this is something that a real person would need to do. 
To the extent > that I know the people in the bioruby core (I only "know" a couple) I know > that they are all people with heavy academic work loads - perhaps even > (heaven forbid) with departmental duties on top of that. They do what they > can, I assume they already pretty much exhaust their copious amounts of > spare time as it is :) > > >> P.S : The idea behind this mail is to spark some discussion on a more >> efficient software development culture and hopefully adopt it :). >> > > I don't think cultures are ever adopted unless there is very, very forceful > management that imposes it (which of course there isn't for OBF projects) or > unless it grows around certain key people with long term involvement in a > project. But this is probably just a roundabout way of repeating Pjotr's > point that the way to change things is to act. > > Rutger > > -- > Dr. Rutger A. Vos > School of Biological Sciences > Philip Lyle Building, Level 4 > University of Reading > Reading > RG6 6BX > United Kingdom > Tel: +44 (0) 118 378 7535 > http://www.nexml.org > http://rutgervos.blogspot.com > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby From pjotr.public14 at thebird.nl Mon Aug 9 13:28:51 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Mon, 9 Aug 2010 19:28:51 +0200 Subject: [BioRuby] development updates? In-Reply-To: References: Message-ID: <20100809172851.GA28959@thebird.nl> On Mon, Aug 09, 2010 at 10:59:15AM -0500, Chris Fields wrote: > Just want to add my 2c, from the bioperl perspective. The vast > majority of bioperl devs have other jobs or obligations, so when > they develop for whatever Bio* it tends to be to scratch a > particular itch (fulfill something they need), and not much beyond > that. 
The few people that go beyond scratching an itch, are the ones we should really treasure as they get little in return (apart from criticism). In particular people like Naohisa (BioRuby) and Chris (BioPerl), who work on testing and integration, get really very little recognition for their efforts. Likewise people like Hilmar, Rutger and Christian, who put in effort guiding students in GSoC and get very little recognition for their work. This is a good place to thank you. I notice what you do :). And I think you are making the world a better place. Thank you for going beyond your remit and spending your free time. Pj. From hlapp at drycafe.net Mon Aug 9 21:45:51 2010 From: hlapp at drycafe.net (Hilmar Lapp) Date: Mon, 9 Aug 2010 21:45:51 -0400 Subject: [BioRuby] development updates? In-Reply-To: References: Message-ID: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> Hi Anurag: On Aug 8, 2010, at 11:32 AM, Anurag Priyam wrote: > Today I had a chat with Jan when I found that a manuscript on > BioRuby is > under progress. I do not remember it being discussed on the list. I'm curious - why do you think should it have been discussed on the list (assuming that your point here is that you think that's what should have happened)? -hilmar -- =========================================================== : Hilmar Lapp -:- Durham, NC -:- hlapp at drycafe dot net : =========================================================== From anurag08priyam at gmail.com Tue Aug 10 00:02:37 2010 From: anurag08priyam at gmail.com (Anurag Priyam) Date: Tue, 10 Aug 2010 09:32:37 +0530 Subject: [BioRuby] development updates? In-Reply-To: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> References: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> Message-ID: > I'm curious - why do you think should it have been discussed on the list > (assuming that your point here is that you think that's what should have > happened)? 
Well, I have been lurking around in Debian, and Git mailing list for a while and nothing happens off the list there. I kind of like that. Actually I was expecting something like that here too. -- Anurag Priyam, 3rd Year Undergraduate, Department of Mechanical Engineering, IIT Kharagpur. +91-9775550642 From anurag08priyam at gmail.com Tue Aug 10 00:07:58 2010 From: anurag08priyam at gmail.com (Anurag Priyam) Date: Tue, 10 Aug 2010 09:37:58 +0530 Subject: [BioRuby] development updates? In-Reply-To: References: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> Message-ID: >> I'm curious - why do you think should it have been discussed on the list >> (assuming that your point here is that you think that's what should have >> happened)? > > Well, I have been lurking around in Debian, and Git mailing list for a > while and nothing happens off the list there. I kind of like that. > Actually I was expecting something like that here too. That actually was the basis of my entire mail. But after Rutger's response I kind of understand what I was not seeing before. So, it is fine :). -- Anurag Priyam, 3rd Year Undergraduate, Department of Mechanical Engineering, IIT Kharagpur. +91-9775550642 From georgkam at gmail.com Tue Aug 10 03:37:11 2010 From: georgkam at gmail.com (George Githinji) Date: Tue, 10 Aug 2010 10:37:11 +0300 Subject: [BioRuby] We have 2 and 1/2 days of spare time. Can we help? Message-ID: Hi all, The Regional Students Group for Eastern Africa (RSG-EA) is one of the grass-root level bodies of the International Society for Computational Biology Student Council (ISCB-SC). The group has membership from ten countries namely Burundi, Democratic Republic of Congo, Djibouti, Eritrea, Ethiopia, Kenya, Rwanda, Somalia, Tanzania and Uganda. Recently we proposed to organize a biohakathon three day event to: 1) Learn how to collaborate on bioinformatics programming projects using open source tools. 2) Forge an East African bioinformatics programming community. 
3) Contribute a module/code to Bioruby library. The event has been sponsored by a grant from ISCB and ILRI/Beca bioinformatics platform in Nairobi, Kenya. We would like to seek a suitable project from one of the developer(s) and the community. The project should ideally be of beginner to intermediate level difficulty. A third of the participants will be of intermediate level programming skills with experience from Java, Python and Perl, while the rest will have beginner level skills. We were also wondering whether it would be possible to get one of the lead contributors to bioruby project to give a short 15-20 minutes introductory talk to the participants. We have excellent video conferencing facilities at the ILRI/Beca hub. The event is slated to take place in late September. Thank you -- --------------- Sincerely George KEMRI/Wellcome-Trust Research Program Skype: george_g2 Blog: http://biorelated.wordpress.com/ From ralf at ark.in-berlin.de Tue Aug 10 05:37:35 2010 From: ralf at ark.in-berlin.de (Ralf Stephan) Date: Tue, 10 Aug 2010 11:37:35 +0200 Subject: [BioRuby] We have 2 and 1/2 days of spare time. Can we help? In-Reply-To: References: Message-ID: <0F6E98FF-4C66-4B5C-87AC-016D56B5A96D@ark.in-berlin.de> On Aug 10, 2010, at 9:37 AM, George Githinji wrote: > 1) Learn how to collaborate on bioinformatics programming projects > using open source tools. You might consider the realistic approach: 1A) Use bioruby for an interesting project 1B) Find bug (there is always one!) 1C) Fix bug 1D) Send patch (or use github, but why) This is exactly how your students will later be confronted with the possibility of Open Source collaboration.
Regards, Ralf Stephan http://www.ark.in-berlin.de pub 1024D/C5114CB2 2009-06-07 [expires: 2011-06-06] Key fingerprint = 76AE 0D21 C06C CBF9 24F8 7835 1809 DE97 C511 4CB2 From hlapp at drycafe.net Tue Aug 10 13:18:29 2010 From: hlapp at drycafe.net (Hilmar Lapp) Date: Tue, 10 Aug 2010 13:18:29 -0400 Subject: [BioRuby] development updates? In-Reply-To: References: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> Message-ID: <9C65C5A7-5105-4BE2-A395-BED6AEDF8847@drycafe.net> On Aug 10, 2010, at 12:02 AM, Anurag Priyam wrote: > I have been lurking around in Debian, and Git mailing list for a > while and nothing happens off the list there. How do you know that? -hilmar -- =========================================================== : Hilmar Lapp -:- Durham, NC -:- hlapp at drycafe dot net : =========================================================== From anurag08priyam at gmail.com Tue Aug 10 13:25:41 2010 From: anurag08priyam at gmail.com (Anurag Priyam) Date: Tue, 10 Aug 2010 22:55:41 +0530 Subject: [BioRuby] development updates? In-Reply-To: <9C65C5A7-5105-4BE2-A395-BED6AEDF8847@drycafe.net> References: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> <9C65C5A7-5105-4BE2-A395-BED6AEDF8847@drycafe.net> Message-ID: >> I have been lurking around in Debian, and Git mailing list for a while and >> nothing happens off the list there. > > > How do you know that? Not literally. Just be around on the git list and you will know what I mean. -- Anurag Priyam, 3rd Year Undergraduate, Department of Mechanical Engineering, IIT Kharagpur. +91-9775550642 From cjfields at illinois.edu Tue Aug 10 13:24:12 2010 From: cjfields at illinois.edu (Chris Fields) Date: Tue, 10 Aug 2010 12:24:12 -0500 Subject: [BioRuby] development updates? 
In-Reply-To: <9C65C5A7-5105-4BE2-A395-BED6AEDF8847@drycafe.net> References: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> <9C65C5A7-5105-4BE2-A395-BED6AEDF8847@drycafe.net> Message-ID: <991C4D70-3B10-4F78-8889-B522E358658B@illinois.edu> On Aug 10, 2010, at 12:18 PM, Hilmar Lapp wrote: > On Aug 10, 2010, at 12:02 AM, Anurag Priyam wrote: > >> I have been lurking around in Debian, and Git mailing list for a while and nothing happens off the list there. > > > How do you know that? > > -hilmar The mail list version of Schrödinger's cat? :> chris From pjotr.public14 at thebird.nl Tue Aug 10 14:00:24 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Tue, 10 Aug 2010 20:00:24 +0200 Subject: [BioRuby] development updates? In-Reply-To: References: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> <9C65C5A7-5105-4BE2-A395-BED6AEDF8847@drycafe.net> Message-ID: <20100810180024.GB9182@thebird.nl> On Tue, Aug 10, 2010 at 10:55:41PM +0530, Anurag Priyam wrote: > >> I have been lurking around in Debian, and Git mailing list for a while and > >> nothing happens off the list there. Debian Bio-Med is off-list. Debian has a board that discusses (partly) off-list. Ubuntu, a derivative of Debian, is off-list. Just examples. I am certain help pages and 'manuscripts' are not fully discussed on the list. Personal criticism would be off-list (normally). Especially with a gigantic project like Debian, it would be suicide to do everything via the list. Pj. From hlapp at drycafe.net Tue Aug 10 14:28:22 2010 From: hlapp at drycafe.net (Hilmar Lapp) Date: Tue, 10 Aug 2010 14:28:22 -0400 Subject: [BioRuby] development updates? In-Reply-To: References: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> <9C65C5A7-5105-4BE2-A395-BED6AEDF8847@drycafe.net> Message-ID: Hi Anurag: On Aug 10, 2010, at 1:25 PM, Anurag Priyam wrote: >>> I have been lurking around in Debian, and Git mailing list for a >>> while and >>> nothing happens off the list there. >> >> >> How do you know that?
> > Not literally. Just be around on the git list and you will know what > I mean. I am trying to make you take a step back and think. Please don't be evasive. You do not know what happens offline on that list, nor do you know for any other mailing list or community - it's by the definition of the word "off-line" that you don't know. Just because you see a lot happening on-list doesn't mean that a lot can't also happen off-list. What you encountered is that the nature of things discussed online and those discussed offline is not identical between, say, the bioruby list and the git list. But that's not a surprise - pick any two lists or communities and you will find a difference. That is because a community's practices are defined by the people who populate them, and different communities are populated by different people. Those differences are not bad; rather, I suggest you try to appreciate them. What should ultimately count is the success of the community in fostering coherence, and in creating useful software. If you feel that Bioruby is falling short on those counts, and that shifting the balance of things discussed onlist and offlist in a certain direction would help rectify that, then I think most if not everyone here would be curious to hear your specific thoughts. But just pointing to another community and saying it is different isn't very productive - it's just pointing out the expected. Cheers, -hilmar -- =========================================================== : Hilmar Lapp -:- Durham, NC -:- hlapp at drycafe dot net : =========================================================== From anurag08priyam at gmail.com Tue Aug 10 16:00:06 2010 From: anurag08priyam at gmail.com (Anurag Priyam) Date: Wed, 11 Aug 2010 01:30:06 +0530 Subject: [BioRuby] development updates? In-Reply-To: References: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> <9C65C5A7-5105-4BE2-A395-BED6AEDF8847@drycafe.net> Message-ID: > I am trying to make you take a step back and think. 
Please don't be evasive. > You do not know what happens offline on that list, nor do you know for any > other mailing list or community - it's by the definition of the word > "off-line" that you don't know. Just because you see a lot happening on-list > doesn't mean that a lot can't also happen off-list. > > What you encountered is that the nature of things discussed online and those > discussed offline is not identical between, say, the bioruby list and the > git list. But that's not a surprise - pick any two lists or communities and > you will find a difference. That is because a community's practices are > defined by the people who populate them, and different communities are > populated by different people. > > > Those differences are not bad; rather, I suggest you try to appreciate them. > What should ultimately count is the success of the community in fostering > coherence, and in creating useful software. > Right. I understand your point and I am not being critical of the differences. I have tried to explain my point below. > If you feel that Bioruby is falling short on those counts, and that shifting > the balance of things discussed onlist and offlist in a certain direction > would help rectify that, then I think most if not everyone here would be > curious to hear your specific thoughts. But just pointing to another > community and saying it is different isn't very productive - it's just > pointing out the expected. I do not feel that BioRuby is falling short on any counts. The gist of my mail was only this : "Hey, guys on this list they do this X. I think that X is cool. *Maybe* you guys will like it too. *Maybe* we could do it too." I was just being enthusiastic about that X, and wanted to *share* it. However things have gone totally awry. Maybe the way I essayed it down was wrong, or maybe I failed to realize that I should be a little more formal here. Or, maybe just that I should grow up a little more. I tend to be overly enthusiastic about things.
In any case, it is my mistake and will you all please forgive me about it? -- Anurag Priyam, 3rd Year Undergraduate, Department of Mechanical Engineering, IIT Kharagpur. +91-9775550642 From pjotr.public14 at thebird.nl Tue Aug 10 16:15:24 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Tue, 10 Aug 2010 22:15:24 +0200 Subject: [BioRuby] development updates? In-Reply-To: References: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> <9C65C5A7-5105-4BE2-A395-BED6AEDF8847@drycafe.net> Message-ID: <20100810201524.GA12462@thebird.nl> > I have been lurking around in Debian, and Git mailing list for a > while and nothing happens off the list there. Ultimately code talks. As Hilmar points out. In OSS a successful project is one that remains relevant. Refrain: We are all here to keep BioRuby relevant. That is the point of this mailing list. > I was just being enthusiastic about that X, and wanted to *share* > it. However things have gone totally awry. Maybe the way I essayed > it down was wrong, or maybe I failed to realize that I should be a > little more formal here. Or, maybe just that I should grow up a > little more. I tend to be overly enthusiastic about things. In any > case, it is my mistake and will you all please forgive me about it? No need to forgive you for being enthusiastic. Enthusiasm is cool. It would be great if you use your energy to run with an aspect of your ideas. For one, documentation needs work. Prioritizing work and distribution of work would be good too (if you can get people to agree). The finding of synergy in a project is laudable. Some projects benefit from strong leadership - like the Linux kernel. Note that strong leadership comes with respect - and in OSS that is based (again) on code. Meanwhile, a problem, as Chris pointed out, is that everyone is really busy. Programming happens in short bursts of activity, and tends to be kinda 'ad hoc'. 
I think it is actually pretty amazing that we have good individuals who modify 'ad hoc' code for broader use, so it becomes available to more people. Refrain: We are all here to keep BioRuby relevant. That is the point of this mailing list. Pj. From georgkam at gmail.com Wed Aug 11 01:53:12 2010 From: georgkam at gmail.com (George Githinji) Date: Wed, 11 Aug 2010 08:53:12 +0300 Subject: [BioRuby] We have 2 and 1/2 days of spare time. Can we help? In-Reply-To: <0F6E98FF-4C66-4B5C-87AC-016D56B5A96D@ark.in-berlin.de> References: <0F6E98FF-4C66-4B5C-87AC-016D56B5A96D@ark.in-berlin.de> Message-ID: Thanks Ralf. I appreciate your advice. On Tue, Aug 10, 2010 at 12:37 PM, Ralf Stephan wrote: > > On Aug 10, 2010, at 9:37 AM, George Githinji wrote: >> 1) Learn how to collaborate on bioinformatics programming projects >> using open source tools. > > You might consider the realistic approach: > 1A) Use bioruby for an interesting project > 1B) Find bug (there is always one!) > 1C) Fix bug > 1D) Send patch (or use github, but why) > > This is exactly how your students will later be confronted > with the possibility of Open Source collaboration. > > Regards, > > Ralf Stephan > http://www.ark.in-berlin.de > pub 1024D/C5114CB2 2009-06-07 [expires: 2011-06-06] > Key fingerprint = 76AE 0D21 C06C CBF9 24F8 7835 1809 DE97 C511 4CB2 > > > > > -- --------------- Sincerely George KEMRI/Wellcome-Trust Research Program Skype: george_g2 Blog: http://biorelated.wordpress.com/ From pjotr.public14 at thebird.nl Thu Aug 12 10:30:12 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Thu, 12 Aug 2010 16:30:12 +0200 Subject: [BioRuby] GFF3 Message-ID: <20100812143012.GA31206@thebird.nl> I intend to use GFF3 and document its use. In my gff3 github branch (see http://github.com/pjotrp/bioruby/tree/gff3) I have just added a first example for fetching sequence data from GFF3.
First I took an example from Lincoln Stein (in his BioPerl repository) and stuck that in ./test/data/gff/test.gff3. This data contains empty lines - so I modified the GFF3 parser to ignore those. Before I continue, I also wonder about the wisdom of including a Bio::FastaFormat record *inside* a Bio::Sequence record. This duplicates the @definition with @entry_id. Not only that, the sequence contains white space, which does not match GFF's positioning data: #> Now, to print FASTA I now do: gff3.sequences.each do | item | print item.to_fasta(item.entry_id, 70) end (to_fasta is being deprecated) To get a FASTA sequence I would like to do the sane: gff3.sequences.each do | item | rec = Bio::FastaFormat.new('> '+item.definition.strip+"\n"+item.data) print rec end where item.data is just the clean sequence. The current implementation is rather unintuitive. I realise GFF3 contains FASTA, but there is no reason to store it like that. How about removing the contained Bio::FastaFormat and just use a sequence string? And remove the white space by default? It does also away with FASTA formatting - the to_fasta in GFF3. I can make the changes, if you agree. Pj. From mh6 at sanger.ac.uk Thu Aug 12 10:42:23 2010 From: mh6 at sanger.ac.uk (Michael Paulini) Date: Thu, 12 Aug 2010 15:42:23 +0100 Subject: [BioRuby] GFF3 In-Reply-To: <20100812143012.GA31206@thebird.nl> References: <20100812143012.GA31206@thebird.nl> Message-ID: <4C64084F.7090905@sanger.ac.uk> Pjotr, are you coming over to the GMOD meeting in Cambridge? Because if we need/want to make changes to the GFF3 specifications, we could discuss it there, as some from Lincoln's group will also be there. ... and yes, the inlined fasta at the end is not a perfect solution. Michael On 12/08/10 15:30, Pjotr Prins wrote: > I intend to use GFF3 and document its use. > > In my gff3 github branch (see http://github.com/pjotrp/bioruby/tree/gff3) I > have just added a first example for fetching sequence data from GFF3.
First I > took an example from Lincoln Stein (in his BioPerl repository) and stuck that > in ./test/data/gff/test.gff3. This data contains empty lines - so I modified > the GFF3 parser to ignore those. > > Before I continue, I also wonder about the wisdom of including a > Bio::FastaFormat record *inside* a Bio::Sequence record. This duplicates the > @definition with @entry_id. Not only that, the sequence contains white space, > which does not match GFF's positioning data: > > # @source_data=# @data="\nACGAAGATTTGTATGACTGATTTATCCTGGACAGGCATTGGTCAGATGTCTCCTTCCGTATCGTCGTTTA\nGTTGCAAATCCGAGTGTTCGGGGGTATTGCTATTTGCCACCTAGAAGCGCAACATGCCCAGCTTCACACA\nCCATAGCGAACACGCCGCCCCGGTGGCGACTATCGGTCGAAGTTAAGACAATTCATGGGCGAAACGAGAT\nAATGGGTACTGCACCCCTCGTCCTGTAGAGACGTCACAGCCAACGTGCCTTCTTATCTTGATACATTAGT\nGCCCAAGAATGCGATCCCAGAAGTCTTGGTTCTAAAGTCGTCGGAAAGATTTGAGGAACTGCCATACAGC\nCCGTGGGTGAAACTGTCGACATCCATTGTGCGAATAGGCCTGCTAGTGAC\n\n", > @definition="test01">> > > Now, to print FASTA I now do: > > gff3.sequences.each do | item | > print item.to_fasta(item.entry_id, 70) > end > > (to_fasta is being deprecated) > > To get a FASTA sequence I would like to do the sane: > > gff3.sequences.each do | item | > rec = Bio::FastaFormat.new('> '+item.definition.strip+"\n"+item.data) > print rec > end > > where item.data is just the clean sequence. > > The current implementation is rather uninituitive. I realise GFF3 contains > FASTA, but there is no reason to store it like that. How about removing the > contained Bio::FastaFormat and just use a sequence string? And remove the white > space by default? > > It does also away with FASTA formatting - the to_fasta in GFF3. > > I can make the changes, if you agree. > > Pj. 
> > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby -- The Wellcome Trust Sanger Institute is operated by Genome Research Limited, a charity registered in England with number 1021457 and a company registered in England with number 2742969, whose registered office is 215 Euston Road, London, NW1 2BE. From ngoto at gen-info.osaka-u.ac.jp Thu Aug 12 11:12:05 2010 From: ngoto at gen-info.osaka-u.ac.jp (Naohisa GOTO) Date: Fri, 13 Aug 2010 00:12:05 +0900 Subject: [BioRuby] GFF3 In-Reply-To: <20100812143012.GA31206@thebird.nl> References: <20100812143012.GA31206@thebird.nl> Message-ID: <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> Hi, On Thu, 12 Aug 2010 16:30:12 +0200 Pjotr Prins wrote: > I intend to use GFF3 and document its use. > > In my gff3 github branch (see http://github.com/pjotrp/bioruby/tree/gff3) I > have just added a first example for fetching sequence data from GFF3. First I > took an example from Lincoln Stein (in his BioPerl repository) and stuck that > in ./test/data/gff/test.gff3. Could you please tell me the complete URL of the Lincoln's test data? Why I'd like to know the origin is: I submitted the test.gff3 to the GFF3 Validator, (http://modencode.oicr.on.ca/cgi-bin/validate_gff3_online ) and it is reported as "Invalid". So, I'd like to know if this is intended or not, and that best way to know that is seeing the file's development history. > This data contains empty lines - so I modified > the GFF3 parser to ignore those. How to treat empty lines is undefined in the GFF3 spec. (http://www.sequenceontology.org/gff3.shtml) It may be good to ignore empty lines. > Before I continue, I also wonder about the wisdom of including a > Bio::FastaFormat record *inside* a Bio::Sequence record. This duplicates the > @definition with @entry_id. 
Not only that, the sequence contains white space, > which does not match GFF's positioning data: > > # @source_data=# @data="\nACGAAGATTTGTATGACTGATTTATCCTGGACAGGCATTGGTCAGATGTCTCCTTCCGTATCGTCGTTTA\nGTTGCAAATCCGAGTGTTCGGGGGTATTGCTATTTGCCACCTAGAAGCGCAACATGCCCAGCTTCACACA\nCCATAGCGAACACGCCGCCCCGGTGGCGACTATCGGTCGAAGTTAAGACAATTCATGGGCGAAACGAGAT\nAATGGGTACTGCACCCCTCGTCCTGTAGAGACGTCACAGCCAACGTGCCTTCTTATCTTGATACATTAGT\nGCCCAAGAATGCGATCCCAGAAGTCTTGGTTCTAAAGTCGTCGGAAAGATTTGAGGAACTGCCATACAGC\nCCGTGGGTGAAACTGTCGACATCCATTGTGCGAATAGGCCTGCTAGTGAC\n\n", > @definition="test01">> You can see that FastaFormat object is stored in the @source_data. It will be parsed only when the sequence is really needed. This is a kind of lazy evaluation. Please execute puts gff3.sequences[0][0..100] and report what sequence is shown. > Now, to print FASTA I now do: > > gff3.sequences.each do | item | > print item.to_fasta(item.entry_id, 70) > end gff3.sequences.each do | item | print item.output(:fasta) end -- Naohisa Goto ngoto at gen-info.osaka-u.ac.jp / ng at bioruby.org From pjotr.public14 at thebird.nl Thu Aug 12 12:10:16 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Thu, 12 Aug 2010 18:10:16 +0200 Subject: [BioRuby] GFF3 In-Reply-To: <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> Message-ID: <20100812161016.GA32552@thebird.nl> On Fri, Aug 13, 2010 at 12:12:05AM +0900, Naohisa GOTO wrote: > Could you please tell me the complete URL of the Lincoln's > test data? Why I'd like to know the origin is: > I submitted the test.gff3 to the GFF3 Validator, > (http://modencode.oicr.on.ca/cgi-bin/validate_gff3_online ) > and it is reported as "Invalid". So, I'd like to know if this > is intended or not, and that best way to know that is seeing > the file's development history. 
proper test data for the module by Lincoln: http://github.com/bioperl/bioperl-live/blob/master/t/data/biodbgff/test.gff3 > > This data contains empty lines - so I modified > > the GFF3 parser to ignore those. > > How to treat empty lines is undefined in the GFF3 spec. > (http://www.sequenceontology.org/gff3.shtml) > It may be good to ignore empty lines. I think so. > > Before I continue, I also wonder about the wisdom of including a > > Bio::FastaFormat record *inside* a Bio::Sequence record. This duplicates the > > @definition with @entry_id. Not only that, the sequence contains white space, > > which does not match GFF's positioning data: > > > > # > @source_data=# > @data="\nACGAAGATTTGTATGACTGATTTATCCTGGACAGGCATTGGTCAGATGTCTCCTTCCGTATCGTCGTTTA\nGTTGCAAATCCGAGTGTTCGGGGGTATTGCTATTTGCCACCTAGAAGCGCAACATGCCCAGCTTCACACA\nCCATAGCGAACACGCCGCCCCGGTGGCGACTATCGGTCGAAGTTAAGACAATTCATGGGCGAAACGAGAT\nAATGGGTACTGCACCCCTCGTCCTGTAGAGACGTCACAGCCAACGTGCCTTCTTATCTTGATACATTAGT\nGCCCAAGAATGCGATCCCAGAAGTCTTGGTTCTAAAGTCGTCGGAAAGATTTGAGGAACTGCCATACAGC\nCCGTGGGTGAAACTGTCGACATCCATTGTGCGAATAGGCCTGCTAGTGAC\n\n", > > @definition="test01">> > > You can see that FastaFormat object is stored in the @source_data. > It will be parsed only when the sequence is really needed. > This is a kind of lazy evaluation. Very lazy ;) But duplication of ID and containment of extraneous information. Not so efficient with space. We may want to change that. The main problem is that it is not intuitive to have a FastaFormat inside a Sequence object. But that could be just me. > Please execute > puts gff3.sequences[0][0..100] > and report what sequence is shown. > > > Now, to print FASTA I now do: > > > > gff3.sequences.each do | item | > > print item.to_fasta(item.entry_id, 70) > > end > > gff3.sequences.each do | item | > print item.output(:fasta) > end I should have known ;) Pj. 
From pjotr.public14 at thebird.nl Thu Aug 12 12:12:13 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Thu, 12 Aug 2010 18:12:13 +0200 Subject: [BioRuby] GFF3 In-Reply-To: <4C64084F.7090905@sanger.ac.uk> References: <20100812143012.GA31206@thebird.nl> <4C64084F.7090905@sanger.ac.uk> Message-ID: <20100812161213.GB32552@thebird.nl> On Thu, Aug 12, 2010 at 03:42:23PM +0100, Michael Paulini wrote: > are you coming over to the GMOD meeting in Cambridge? I can't make it. Maybe next time. Pj. From mail at maasha.dk Fri Aug 13 08:25:46 2010 From: mail at maasha.dk (Martin Asser Hansen) Date: Fri, 13 Aug 2010 14:25:46 +0200 Subject: [BioRuby] Benchmarking FASTA file parsing Message-ID: Hello, I am new to Ruby and was testing bioruby (1.4.0) for parsing FASTA files. A rough comparison with Perl indicated that the bioruby parser was slow. Now I have hacked a parser of my own in Ruby in order to benchmark the bioruby parser. The result is disappointing -> my hack is roughly 3x faster. Admittedly, my hack should probably do a bit of format consistency checking, but that will only take a few % off the speed. Could someone explain why the bioruby parser is so slow? Is it possible to optimize the code without major rewriting? Here is the benchmark result: user system total real Hack 5.440000 0.010000 5.450000 ( 5.494207) Bio 18.410000 0.020000 18.430000 ( 18.579867) The code is shown below. Cheers, Martin #!/usr/bin/env ruby require 'stringio' require 'bio' require 'benchmark' class Fasta include Enumerable def initialize(io) @io = io end def each while entry = get_entry do yield entry end end def get_entry block = @io.gets("\n>") return nil if block.nil? 
block.chomp!("\n>") block.sub!( /^\s|^>/, "") (seq_name, seq) = block.split("\n", 2) seq.gsub!(/\s/, "") entry = {} entry[:seq_name] = seq_name entry[:seq] = seq entry end end data = <<DATA >5_gECOjxwXsN1/1 AACGNTACTATCGTGACATGCGTGCAGGATTACAC >3_8ICOjxwXsN1/1 ACTCNAGGGTTCGATTCCCTTCAACCGCCCCATAA >3_GUCOjxwXsN1/1 TTGCNTCCTTCTTCTGCCTTCGTTGGCTCAGATTG >5_BWCOjxwXsN1/1 TATATACAGGAATCCATTGTTGTTTAGATTCAGTT >7_NZCOjxwXsN1/1 AGGTGATCCAGCCGCACCTTCCGATACGGCTACCT >3_2VCOjxwXsN1/1 CTTTTCCAGGTGTGTAGACATCTTCACCCATTAAG >5_kVCOjxwXsN1/1 CTACACCTAAGTTACATCGTCCATTATTTTCCAAT >1_GbCOjxwXsN1/1 CCAGACAACTAGGATGTTGGCTTAGAAGCAGCCAT >5_fTCOjxwXsN1/1 TTAGCTTTAACCATTTTCTTTTTGTCTAAAGCAAA >3_VWCOjxwXsN1/1 TTATGATGCGCGTGGCGAACGTGAACGCGTTAAAC DATA io1 = StringIO.new(data) io2 = StringIO.new(data) fasta1 = Fasta.new(io1) fasta2 = Bio::FastaFormat.open(io2) Benchmark.bm(5) do |timer| timer.report('Hack') { 10_000_000.times { fasta1.each { |entry1| } } } timer.report('Bio') { 10_000_000.times { fasta2.each { |entry2| } } } end From tomoakin at kenroku.kanazawa-u.ac.jp Fri Aug 13 09:37:06 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Fri, 13 Aug 2010 22:37:06 +0900 Subject: [BioRuby] Benchmarking FASTA file parsing In-Reply-To: References: Message-ID: <363FADBF-2A69-480B-AA61-AFAA2013FB0D@kenroku.kanazawa-u.ac.jp> Hi, The benchmark is interesting. > Is it possible to optimize the code without major rewriting? Using ruby 1.9.2 (RC2) makes it 2.6 times faster without any rewriting the bioruby parser code :) compared to ruby-1.8 (1.8.7-p299). $ ~/ruby192/bin/ruby benchfasta user system total real Hack 3.800000 0.000000 3.800000 ( 3.800830) Bio 13.090000 0.000000 13.090000 ( 13.095722) $ ~/ruby187/bin/ruby benchfasta user system total real Hack 7.460000 0.000000 7.460000 ( 7.456281) Bio 34.670000 0.000000 34.670000 ( 34.680271) As you stated 3 times faster with the hack, you may be already using ruby 1.9.
Anyway, I think 13 or 18 seconds for 100 M entries is fast enough and this part will not be the bottleneck of any application. How fast do you need it to be? -- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan From ktym at hgc.jp Fri Aug 13 10:46:20 2010 From: ktym at hgc.jp (Toshiaki Katayama) Date: Fri, 13 Aug 2010 23:46:20 +0900 Subject: [BioRuby] Benchmarking FASTA file parsing In-Reply-To: References: Message-ID: <0CCB19A9-2753-43E6-9E1B-04E75BD1DF20@hgc.jp> Hi, Thank you for your interesting post. :-) I used to love benchmarking bottlenecks in BioRuby. Could you also try to compare parsing whole GenBank or thousands of BLAST results or FASTQ files produced by NGSs with BioPerl, BioRuby and hopefully your version? As FASTA is one of the simplest (but fuzzy) formats in biology, I suppose the speed of parsing a FASTA entry may depend on how many variations you expect to allow in the defline of the (loosely defined) FASTA format. Most importantly, I also believe the current speed of parsing FASTA files is fast enough in practice, as Nishiyama-san stated. It required me >30min to download a file containing ~5.6 million protein sequences from KEGG (only 2.3G bytes). ftp://ftp.genome.jp/pub/kegg/genes/fasta/genes.pep The cat and grep commands took 1 min to read through the file. ------------------------------------------------------------ % time cat genes.pep > /dev/null cat genes.pep > /dev/null 0.02s user 1.84s system 3% cpu 1:00.91 total % time egrep '^>' genes.pep | wc -l 5604761 egrep '^>' genes.pep 12.71s user 2.13s system 23% cpu 1:04.46 total wc -l 0.44s user 0.21s system 1% cpu 1:04.46 total ------------------------------------------------------------ I modified your benchmark to do some real tasks -- counting sequences, printing sequence ID and the sequence length.
------------------------------------------------------------ file = "genes.pep" io1 = File.open(file) io2 = file fasta1 = Fasta.new(io1) fasta2 = Bio::FlatFile.auto(io2) c1 = 0 c2 = 0 Benchmark.bm(5) do |timer| timer.report('Hack') { 1.times { fasta1.each { |entry1| c1 += 1; $stderr.print c1, "\t", entry1[:seq_name][/^\S+/], "\t", entry1[:seq].length, "\n" } } } timer.report('Bio') { 1.times { fasta2.each { |entry2| c2 += 1; $stderr.print c2, "\t", entry2.entry_id, "\t", entry2.length, "\n" } } } end ------------------------------------------------------------ Then, your code took 3 min (sounds great!) and the current BioRuby implementation took 9 min. % ruby-1.8 benchfasta.rb genes.pep user system total real Hack 146.180000 27.820000 174.000000 (191.343770) Bio 480.940000 38.060000 519.000000 (557.216022) It could be painful if you need to deal with more sequences, however, please note that the number of whole protein entries in UniProt (which is believed to contain known protein universe to date) is only twice larger than the KEGG (which covers almost all protein sequences in >1200 completed genomes). http://www.expasy.org/sprot/relnotes/relstat.html http://www.ebi.ac.uk/uniprot/TrEMBLstats/ http://www.genome.jp/en/db_growth.html#genes > Is it possible to optimize the code without major rewriting? Of course, it would be great if you could contribute improved codes or suggest some possible ways to optimize the current implementation. Regards, Toshiaki Katayama, just back from summer vacation ;-) On 2010/08/13, at 21:25, Martin Asser Hansen wrote: > Hello, > > > I am new to Ruby and was testing bioruby (1.4.0) for parsing FASTA files. A > rough comparison with Perl indicated that the bioruby parser was slow. Now I > have hacked a parser of my own in Ruby in order to benchmark the bioruby > parser. The result is disappointing -> my hack is roughly 3x faster. 
> Admittedly, my hack should probably do a bit of format consistency checking, > but that will only take a few % off the speed. > > Could someone explain why the bioruby parser is so slow? > > Is it possible to optimize the code without major rewriting? > > Here is the benchmark result: > > user system total real > Hack 5.440000 0.010000 5.450000 ( 5.494207) > Bio 18.410000 0.020000 18.430000 ( 18.579867) > > > The code is shown below. > > Cheers, > > > Martin > > #!/usr/bin/env ruby > > require 'stringio' > require 'bio' > require 'benchmark' > > class Fasta > include Enumerable > > def initialize(io) > @io = io > end > > def each > while entry = get_entry do > yield entry > end > end > > def get_entry > block = @io.gets("\n>") > return nil if block.nil? > > block.chomp!("\n>") > block.sub!( /^\s|^>/, "") > > (seq_name, seq) = block.split("\n", 2) > seq.gsub!(/\s/, "") > > entry = {} > entry[:seq_name] = seq_name > entry[:seq] = seq > entry > end > end > > data = <> 5_gECOjxwXsN1/1 > AACGNTACTATCGTGACATGCGTGCAGGATTACAC >> 3_8ICOjxwXsN1/1 > ACTCNAGGGTTCGATTCCCTTCAACCGCCCCATAA >> 3_GUCOjxwXsN1/1 > TTGCNTCCTTCTTCTGCCTTCGTTGGCTCAGATTG >> 5_BWCOjxwXsN1/1 > TATATACAGGAATCCATTGTTGTTTAGATTCAGTT >> 7_NZCOjxwXsN1/1 > AGGTGATCCAGCCGCACCTTCCGATACGGCTACCT >> 3_2VCOjxwXsN1/1 > CTTTTCCAGGTGTGTAGACATCTTCACCCATTAAG >> 5_kVCOjxwXsN1/1 > CTACACCTAAGTTACATCGTCCATTATTTTCCAAT >> 1_GbCOjxwXsN1/1 > CCAGACAACTAGGATGTTGGCTTAGAAGCAGCCAT >> 5_fTCOjxwXsN1/1 > TTAGCTTTAACCATTTTCTTTTTGTCTAAAGCAAA >> 3_VWCOjxwXsN1/1 > TTATGATGCGCGTGGCGAACGTGAACGCGTTAAAC > DATA > > io1 = StringIO.new(data) > io2 = StringIO.new(data) > fasta1 = Fasta.new(io1) > fasta2 = Bio::FastaFormat.open(io2) > > Benchmark.bm(5) do |timer| > timer.report('Hack') { 10_000_000.times { fasta1.each { |entry1| } } } > timer.report('Bio') { 10_000_000.times { fasta2.each { |entry2| } } } > end > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at 
lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby From ngoto at gen-info.osaka-u.ac.jp Fri Aug 13 10:47:35 2010 From: ngoto at gen-info.osaka-u.ac.jp (Naohisa GOTO) Date: Fri, 13 Aug 2010 23:47:35 +0900 Subject: [BioRuby] Benchmarking FASTA file parsing In-Reply-To: References: Message-ID: <20100813144735.B60FF1CBC5AA@idnmail.gen-info.osaka-u.ac.jp> Hi, On Fri, 13 Aug 2010 14:25:46 +0200 Martin Asser Hansen wrote: > io1 = StringIO.new(data) > io2 = StringIO.new(data) > fasta1 = Fasta.new(io1) > fasta2 = Bio::FastaFormat.open(io2) > > Benchmark.bm(5) do |timer| > timer.report('Hack') { 10_000_000.times { fasta1.each { |entry1| } } } > timer.report('Bio') { 10_000_000.times { fasta2.each { |entry2| } } } > end To rewind the IO (StringIO or Bio::FlatFile object) every time after reading will be needed during the benchmark. #(snip) Benchmark.bm(5) do |timer| timer.report('Hack') { 10_000_000.times { fasta1.each { |entry1| }; io1.rewind } } timer.report('Bio') { 10_000_000.times { fasta2.each { |entry2| }; fasta2.rewind } } end Why using "fasta2.rewind" instead of "io2.rewind" is that the "fasta2" is an instance of Bio::FlatFile, IO wrapper used in BioRuby, and to keep consistency of information inside the wrapper, it is recommended using fasta2.rewind rather than io2.rewind. I applied above changes, and reduced iteration count to 100,000 times, and get the result with the same tendency. (ruby 1.8.7-p299 (debian Squeeze 1.8.7.299-1)) user system total real Hack 7.240000 0.160000 7.400000 ( 7.390807) Bio 23.250000 0.850000 24.100000 ( 24.100267) (ruby 1.9.1-p243 with env LANG=C) user system total real Hack 5.600000 0.010000 5.610000 ( 5.605175) Bio 15.920000 0.000000 15.920000 ( 15.917899) With E.coli genome ORF data, the difference become smaller, especially in Ruby 1.9.1. 
(snip) # ftp://ftp.ncbi.nih.gov:/genbank/genomes/Bacteria/Escherichia_coli_K_12_substr__MG1655/U00096.ffn io1 = File.open('U00096.ffn') io2 = File.open('U00096.ffn') fasta1 = Fasta.new(io1) fasta2 = Bio::FastaFormat.open(io2) Benchmark.bm(5) do |timer| timer.report('Hack') { 100.times { fasta1.each { |entry1| }; io1.rewind } } timer.report('Bio') { 100.times { fasta2.each { |entry2| }; fasta2.rewind } } end (ruby 1.8.7-p299) user system total real Hack 8.340000 0.140000 8.480000 ( 8.492107) Bio 13.480000 0.520000 14.000000 ( 13.998213) (Ruby 1.9.1-p243 with env LANG=C) user system total real Hack 9.130000 0.140000 9.270000 ( 9.270361) Bio 9.380000 0.180000 9.560000 ( 9.565899) -- Naohisa Goto ngoto at gen-info.osaka-u.ac.jp / ng at bioruby.org From mail at maasha.dk Fri Aug 13 10:51:43 2010 From: mail at maasha.dk (Martin Asser Hansen) Date: Fri, 13 Aug 2010 16:51:43 +0200 Subject: [BioRuby] Benchmarking FASTA file parsing In-Reply-To: <363FADBF-2A69-480B-AA61-AFAA2013FB0D@kenroku.kanazawa-u.ac.jp> References: <363FADBF-2A69-480B-AA61-AFAA2013FB0D@kenroku.kanazawa-u.ac.jp> Message-ID: > > > As you stated 3 times faster with the hack, you may be already using ruby > 1.9. > > I am using ruby 1.9.1, and I am using a fairly fast computer, but I am actually questioning the quality of the code. > Anyway, I think 13 or 18 seconds for 100 M entry is fast enough and this > part will not be the bottle neck of any application. > How fast do you need it be? > Mind you that the Benchmark is performed on StringIO data, and that the script does not touch the disk! In a real test, it will be much slower! I did not test on real data and more speed issues may surface (I have no idea how Ruby's file buffering compares to Perl's, performance-wise). I was contemplating porting some Biopieces (www.biopieces.org) from Perl to Ruby. Biopieces are used for everyday slicing and dicing of all sorts of biological data in a very simple and flexible manner. 
While Biopieces are not as fast as dedicated scripts, they are fast enough for convenient analysis of NGS data, but I will not accept a +300% speed penalty (i.e. read_fasta). I have been trying to get an overview of the code in Bio::FastaFormat, but I find it hard to read (that could be because I am not used to Ruby, or OO for that matter). It strikes me that the FastaFormat class does a number of irrelevant things like subparsing comments when not strictly necessary. In fact, the FASTA format actually doesn't use comments prefixed with # (semicolon can be used, but I will strongly advise against it since most software doesn't deal with it). Also, parsing is dependent on the record separator being '\n' - that could be considered a bug. There seems to be an overuse of substitutions, transliterations and regex matching. How about keeping it nice and tight? ala: SEP = $/ FASTA_REGEX = /\s*>?([^#{SEP}]+)#{SEP}(.+)>?$/ def get_entry block = @io.gets(SEP + ">") return nil if block.nil? if block =~ FASTA_REGEX seq_name = $1 seq = $2 else raise "Bad FASTA entry->#{block}" end seq.gsub!(/\s/, "") end Cheers, Martin > -- > Tomoaki NISHIYAMA > > Advanced Science Research Center, > Kanazawa University, > 13-1 Takara-machi, > Kanazawa, 920-0934, Japan > > From tomoakin at kenroku.kanazawa-u.ac.jp Fri Aug 13 23:42:07 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Sat, 14 Aug 2010 12:42:07 +0900 Subject: [BioRuby] Benchmarking FASTA file parsing In-Reply-To: References: <363FADBF-2A69-480B-AA61-AFAA2013FB0D@kenroku.kanazawa-u.ac.jp> Message-ID: <46A4F739-340A-4104-AB45-A9E69281D7B3@kenroku.kanazawa-u.ac.jp> Hi, > Mind you that the Benchmark is performed on StringIO data, and that > the script does not touch the disk! > In a real test, it will be much slower! My initial thought was :- That's true, and therefore the pure parser part which runs fairly fast with O(N) is not the primary problem.
If you push the entries into a hash it will be much more time consuming. But realized that its two orders slower... (due to the benchmark code as pointed out by goto-san) 20 min for 100 M could be painful. > I have been trying to get an overview of the code in Bio::FastaFormat, > but I find it hard to read (that could be because I am not used to > Ruby, or OO for that matter). For one thing, the Bio::FastaFormat is designed to work with Bio::FlatFile. If you write a dedicated fasta parser that could run much faster. # I would write C codes for a very simple operation on NGS data. # That will run 100 times faster. # When the necessary operation is a bit more complex, I would use ruby. much much more time consuming.... Perhaps the target is to process about 20 ~ 1000 M reads with each of them having 25 to 150 nt for the time being. Thats quite different situation compared to process the ~ 0.1 M entry of 50-10000 aa residues or nucleotides in a genome. The relative cost for the entry separation becomes higher compared with the sequence processing within the entry. So, it may worth to write NGS dedicated parser rather than sticking on FlatFile. Playing around the benchmark, about the half of execution time is for garbage collection, and the order of execution is somewhat relevant to get the number. If you can suppress unnecessary object generation to the minimum and disable GC, that will perhaps make it run much faster. 
$ diff -u benchfasta benchfasta-hash-GC-b --- benchfasta 2010-08-13 21:45:21.000000000 +0900 +++ benchfasta-hash-GC-b 2010-08-14 11:53:20.000000000 +0900 @@ -34,6 +34,9 @@ end end +count = ARGV.shift.to_i +count = 2 if count == nil + data = <5_gECOjxwXsN1/1 AACGNTACTATCGTGACATGCGTGCAGGATTACAC @@ -57,12 +60,23 @@ TTATGATGCGCGTGGCGAACGTGAACGCGTTAAAC DATA -io1 = StringIO.new(data) -io2 = StringIO.new(data) +io0 = StringIO.new(data * count) +io1 = StringIO.new(data * count) +io2 = StringIO.new(data * count) +fasta0 = Fasta.new(io0) fasta1 = Fasta.new(io1) fasta2 = Bio::FastaFormat.open(io2) -Benchmark.bm(5) do |timer| - timer.report('Hack') { 10_000_000.times { fasta1.each { | entry1| } } } - timer.report('Bio') { 10_000_000.times { fasta2.each { | entry2| } } } +hash0=Hash.new +hash1=Hash.new +hash2=Hash.new + +Benchmark.bm(8) do |timer| + GC.enable;GC.start;GC.disable; + timer.report('Bio') { i=0; fasta2.each { |entry2| i+=1; hash2 [entry2.definition + i.to_s] = entry2.seq[2..25]} } + hash2 = nil; GC.enable;GC.start;GC.disable; + timer.report('Hack') { i=0; fasta0.each { |entry1| i+=1; hash0 [entry1[:seq_name] + i.to_s] = entry1[:seq][2..25]} } + hash0 = nil; GC.enable;GC.start;GC.disable; + timer.report('Hack-seq') { i=0; fasta1.each { |entry1| i+=1; hash1 [entry1[:seq_name] + i.to_s] = Bio::Sequence::NA.new(entry1[:seq]) [2..25]} } + hash1 = nil; GC.enable;GC.start;GC.disable; end -- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan On 2010/08/13, at 23:51, Martin Asser Hansen wrote: > > As you stated 3 times faster with the hack, you may be already > using ruby 1.9. > > > I am using ruby 1.9.1, and I am using a fairly fast computer, but I > am actually questioning the quality of the code. > > Anyway, I think 13 or 18 seconds for 100 M entry is fast enough and > this > part will not be the bottle neck of any application. > How fast do you need it be? 
> > Mind you that the Benchmark is performed on StringIO data, and that > the script does not touch the disk! In a real test, it will be much > slower! I did not test on real data and more speed issues may > surface (I have no idea how Ruby's file buffering compares to > Perl's, performance-wise). > > I was contemplating porting some Biopieces (www.biopieces.org) from > Perl to Ruby. Biopieces are used for everyday slicing and dicing of > all sorts of biological data in a very simple and flexible manner. > While Biopieces are not as fast as dedicated scripts, they are fast > enough for convenient analysis of NGS data, but I will not accept a > +300% speed penalty (i.e. read_fasta). > > I have been trying to get an overview of the code in > Bio::FastaFormat, but I find it hard to read (that could be because > I am not used to Ruby, or OO for that matter). It strikes me that > the FastaFormat class does a number of irrelevant things like > subparsing comments when not strictly necessary. In fact, the FASTA > format actually don't use comments prefixed with # (semicolon can > be used, but I will strongly advice against it since most software > don't deal with it). Also, parsing is dependent on the record > separator being '\n' - that could be considered a bug. There seem > to be an overuse of substitutions, transliterations and regex > matching. How about keeping it nice an tight? ala: > > SEP = $/ > FASTA_REGEX = /\s*>?([^#{SEP}]+)#{SEP}(.+)>?$/ > > def get_entry > block = @io.gets(SEP + ">") > return nil if block.nil? 
> > if block =~ FASTA_REGEX > seq_name = $1 > seq = $2 > else > raise "Bad FASTA entry->#{block}" > end > > seq.gsub!(/\s/, "") > end > > > Cheers, > > > Martin > > -- > Tomoaki NISHIYAMA > > Advanced Science Research Center, > Kanazawa University, > 13-1 Takara-machi, > Kanazawa, 920-0934, Japan > > From mail at maasha.dk Sat Aug 14 04:21:39 2010 From: mail at maasha.dk (Martin Asser Hansen) Date: Sat, 14 Aug 2010 10:21:39 +0200 Subject: [BioRuby] Benchmarking FASTA file parsing In-Reply-To: <46A4F739-340A-4104-AB45-A9E69281D7B3@kenroku.kanazawa-u.ac.jp> References: <363FADBF-2A69-480B-AA61-AFAA2013FB0D@kenroku.kanazawa-u.ac.jp> <46A4F739-340A-4104-AB45-A9E69281D7B3@kenroku.kanazawa-u.ac.jp> Message-ID: I was hoping for an easy to use generic FASTA parser in bioruby. I think it would be confusing with two flavors of parsers for short/long entries. Also, I think that with a minor effort the existing parser could be optimized a fair bit. Subparsing of the defline should not be done for generic parsing, but rather when needed. Without any experience I think that disabling GC sounds like a bad idea. Of cause C is always faster, but Ruby is nicer. Cheers, Martin On Sat, Aug 14, 2010 at 5:42 AM, Tomoaki NISHIYAMA < tomoakin at kenroku.kanazawa-u.ac.jp> wrote: > Hi, > > Mind you that the Benchmark is performed on StringIO data, and that the > script does not touch the disk! > > In a real test, it will be much slower! > > > My initial thought was :- That's true, and therefore the pure parser part > which runs fairly fast with O(N) > is not the primary problem. If you push the entries into a hash it will be > much more time consuming. > > But realized that its two orders slower... (due to the benchmark code as > pointed out by goto-san) > 20 min for 100 M could be painful. > > I have been trying to get an overview of the code in Bio::FastaFormat, > > but I find it hard to read (that could be because I am not used to Ruby, or > OO for that matter). 
> > > For one thing, the Bio::FastaFormat is designed to work with Bio::FlatFile. > If you write a dedicated fasta parser that could run much faster. > > > # I would write C codes for a very simple operation on NGS data. > > # That will run 100 times faster. > > # When the necessary operation is a bit more complex, I would use ruby. > much much more time consuming.... > > > Perhaps the target is to process about 20 ~ 1000 M reads with each of > > them having 25 to 150 nt for the time being. > > Thats quite different situation compared to process the > > ~ 0.1 M entry of 50-10000 aa residues or nucleotides in a genome. > > The relative cost for the entry separation becomes higher compared with the > sequence > > processing within the entry. > > > So, it may worth to write NGS dedicated parser rather than sticking on > FlatFile. > > > Playing around the benchmark, about the half of execution time is for > garbage collection, > > and the order of execution is somewhat relevant to get the number. > > If you can suppress unnecessary object generation to the minimum and > disable GC, that will > > perhaps make it run much faster. 
> > > $ diff -u benchfasta benchfasta-hash-GC-b > --- benchfasta 2010-08-13 21:45:21.000000000 +0900 > +++ benchfasta-hash-GC-b 2010-08-14 11:53:20.000000000 +0900 > @@ -34,6 +34,9 @@ > end > end > > +count = ARGV.shift.to_i > +count = 2 if count == nil > + > data = < >5_gECOjxwXsN1/1 > AACGNTACTATCGTGACATGCGTGCAGGATTACAC > @@ -57,12 +60,23 @@ > TTATGATGCGCGTGGCGAACGTGAACGCGTTAAAC > DATA > > -io1 = StringIO.new(data) > -io2 = StringIO.new(data) > +io0 = StringIO.new(data * count) > +io1 = StringIO.new(data * count) > +io2 = StringIO.new(data * count) > +fasta0 = Fasta.new(io0) > fasta1 = Fasta.new(io1) > fasta2 = Bio::FastaFormat.open(io2) > > -Benchmark.bm(5) do |timer| > - timer.report('Hack') { 10_000_000.times { fasta1.each { |entry1| } } } > - timer.report('Bio') { 10_000_000.times { fasta2.each { |entry2| } } } > +hash0=Hash.new > +hash1=Hash.new > +hash2=Hash.new > + > +Benchmark.bm(8) do |timer| > + GC.enable;GC.start;GC.disable; > + timer.report('Bio') { i=0; fasta2.each { |entry2| i+=1; > hash2[entry2.definition + i.to_s] = entry2.seq[2..25]} } > + hash2 = nil; GC.enable;GC.start;GC.disable; > + timer.report('Hack') { i=0; fasta0.each { |entry1| i+=1; > hash0[entry1[:seq_name] + i.to_s] = entry1[:seq][2..25]} } > + hash0 = nil; GC.enable;GC.start;GC.disable; > + timer.report('Hack-seq') { i=0; fasta1.each { |entry1| i+=1; > hash1[entry1[:seq_name] + i.to_s] = > Bio::Sequence::NA.new(entry1[:seq])[2..25]} } > + hash1 = nil; GC.enable;GC.start;GC.disable; > end > > > > > > -- > > Tomoaki NISHIYAMA > > > Advanced Science Research Center, > > Kanazawa University, > > 13-1 Takara-machi, > > Kanazawa, 920-0934, Japan > > > On 2010/08/13, at 23:51, Martin Asser Hansen wrote: > > >> As you stated 3 times faster with the hack, you may be already using ruby >> 1.9. >> >> > I am using ruby 1.9.1, and I am using a fairly fast computer, but I am > actually questioning the quality of the code. 
> > >> Anyway, I think 13 or 18 seconds for 100 M entry is fast enough and this >> part will not be the bottle neck of any application. >> How fast do you need it be? >> > > Mind you that the Benchmark is performed on StringIO data, and that the > script does not touch the disk! In a real test, it will be much slower! I > did not test on real data and more speed issues may surface (I have no idea > how Ruby's file buffering compares to Perl's, performance-wise). > > I was contemplating porting some Biopieces (www.biopieces.org) from Perl > to Ruby. Biopieces are used for everyday slicing and dicing of all sorts of > biological data in a very simple and flexible manner. While Biopieces are > not as fast as dedicated scripts, they are fast enough > for convenient analysis of NGS data, but I will not accept a +300% speed > penalty (i.e. read_fasta). > > I have been trying to get an overview of the code in Bio::FastaFormat, but > I find it hard to read (that could be because I am not used to Ruby, or OO > for that matter). It strikes me that the FastaFormat class does a number of > irrelevant things like subparsing comments when not strictly necessary. In > fact, the FASTA format actually don't use comments prefixed with # > (semicolon can be used, but I will strongly advice against it since most > software don't deal with it). Also, parsing is dependent on the record > separator being '\n' - that could be considered a bug. There seem to be an > overuse of substitutions, transliterations and regex matching. How about > keeping it nice an tight? ala: > > SEP = $/ > FASTA_REGEX = /\s*>?([^#{SEP}]+)#{SEP}(.+)>?$/ > > def get_entry > block = @io.gets(SEP + ">") > return nil if block.nil? 
> > if block =~ FASTA_REGEX > seq_name = $1 > seq = $2 > else > raise "Bad FASTA entry->#{block}" > end > > seq.gsub!(/\s/, "") > end > > > Cheers, > > > Martin > > >> -- >> Tomoaki NISHIYAMA >> >> Advanced Science Research Center, >> Kanazawa University, >> 13-1 Takara-machi, >> Kanazawa, 920-0934, Japan >> >> > > From tomoakin at kenroku.kanazawa-u.ac.jp Sat Aug 14 10:52:57 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Sat, 14 Aug 2010 23:52:57 +0900 Subject: [BioRuby] Benchmarking FASTA file parsing In-Reply-To: References: <363FADBF-2A69-480B-AA61-AFAA2013FB0D@kenroku.kanazawa-u.ac.jp> <46A4F739-340A-4104-AB45-A9E69281D7B3@kenroku.kanazawa-u.ac.jp> Message-ID: <21C908D0-B7B9-4A4A-B47F-13CE26E2B0E6@kenroku.kanazawa-u.ac.jp> Hi, > Subparsing of the defline should not be done for generic parsing, > but rather when needed. To my understanding, the subparsing of the definition occurs only when needed, ie when entry_id, identifiers, gi, etc. is called, in current code. If only definition is called, it is not further parsed. > Without any experience I think that disabling GC sounds like a bad > idea. Yes, completely disabling GC is generally a bad idea. A code running with 6 Gbytes mem could eat 60 Gbytes or more... (Yes it seems two or three-fold faster if there is enough memory, but this trade-off is too extreme). But since the GC dominates the running time, it is an important target for optimization. http://en.wikibooks.org/wiki/Ruby_Programming/Reference/Objects/GC A more moderate reduction of GC frequency will surely speedup the process 30~50%. Admittedly, explicit GC.disable, GC.start make the code ugly. Trial on tweaking the parameters in gc.c did only a minor (~5%) improvement. Careful coding to reduce object creation might contribute to speed up. One of questionable variable is @entry_overrun Is this variable and attr_reader :entry_overrun really required yet or is just a trace of older code? 
> Goto-San Since there is only two other variables, which is apparently essential, this third variable might account significant speed reduction. A tests suggested again removing 3 lines can improve 5%. (Unfortunately not 50%) diff --git a/lib/bio/db/fasta.rb b/lib/bio/db/fasta.rb index 7ea668e..95f3be4 100644 --- a/lib/bio/db/fasta.rb +++ b/lib/bio/db/fasta.rb @@ -111,7 +111,7 @@ module Bio # The seuqnce lines in text. attr_accessor :data - attr_reader :entry_overrun +# attr_reader :entry_overrun # Stores the comment and sequence information from one entry of the # FASTA format string. If the argument contains more than one @@ -119,8 +119,8 @@ module Bio def initialize(str) @definition = str[/.*/].sub(/^>/, '').strip # 1st line @data = str.sub(/.*/, '') # rests - @data.sub!(/^>.*/m, '') # remove trailing entries for sure - @entry_overrun = $& +# @data.sub!(/^>.*/m, '') # remove trailing entries for sure +# @entry_overrun = $& end # Returns the stored one entry as a FASTA format. (same as to_s) -- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan On 2010/08/14, at 17:21, Martin Asser Hansen wrote: > I was hoping for an easy to use generic FASTA parser in bioruby. I > think it would be confusing with two flavors of parsers for short/ > long entries. Also, I think that with a minor effort the existing > parser could be optimized a fair bit. Subparsing of the defline > should not be done for generic parsing, but rather when needed. > Without any experience I think that disabling GC sounds like a bad > idea. Of cause C is always faster, but Ruby is nicer. > > > Cheers, > > > Martin > > > > > On Sat, Aug 14, 2010 at 5:42 AM, Tomoaki NISHIYAMA > wrote: > Hi, > >> Mind you that the Benchmark is performed on StringIO data, and >> that the script does not touch the disk! >> In a real test, it will be much slower! 
> > My initial thought was :- That's true, and therefore the pure > parser part which runs fairly fast with O(N) > is not the primary problem. If you push the entries into a hash it > will be much more time consuming. > > But realized that its two orders slower... (due to the benchmark > code as pointed out by goto-san) > 20 min for 100 M could be painful. > >> I have been trying to get an overview of the code in >> Bio::FastaFormat, >> but I find it hard to read (that could be because I am not used to >> Ruby, or OO for that matter). > > > For one thing, the Bio::FastaFormat is designed to work with > Bio::FlatFile. > If you write a dedicated fasta parser that could run much faster. > > # I would write C codes for a very simple operation on NGS data. > # That will run 100 times faster. > # When the necessary operation is a bit more complex, I would use > ruby. much much more time consuming.... > > Perhaps the target is to process about 20 ~ 1000 M reads with each of > them having 25 to 150 nt for the time being. > Thats quite different situation compared to process the > ~ 0.1 M entry of 50-10000 aa residues or nucleotides in a genome. > The relative cost for the entry separation becomes higher compared > with the sequence > processing within the entry. > > So, it may worth to write NGS dedicated parser rather than sticking > on FlatFile. > > Playing around the benchmark, about the half of execution time is > for garbage collection, > and the order of execution is somewhat relevant to get the number. > If you can suppress unnecessary object generation to the minimum > and disable GC, that will > perhaps make it run much faster. 
> > $ diff -u benchfasta benchfasta-hash-GC-b > --- benchfasta 2010-08-13 21:45:21.000000000 +0900 > +++ benchfasta-hash-GC-b 2010-08-14 11:53:20.000000000 +0900 > @@ -34,6 +34,9 @@ > end > end > > +count = ARGV.shift.to_i > +count = 2 if count == nil > + > data = < >5_gECOjxwXsN1/1 > AACGNTACTATCGTGACATGCGTGCAGGATTACAC > @@ -57,12 +60,23 @@ > TTATGATGCGCGTGGCGAACGTGAACGCGTTAAAC > DATA > > -io1 = StringIO.new(data) > -io2 = StringIO.new(data) > +io0 = StringIO.new(data * count) > +io1 = StringIO.new(data * count) > +io2 = StringIO.new(data * count) > +fasta0 = Fasta.new(io0) > fasta1 = Fasta.new(io1) > fasta2 = Bio::FastaFormat.open(io2) > > -Benchmark.bm(5) do |timer| > - timer.report('Hack') { 10_000_000.times { fasta1.each { | > entry1| } } } > - timer.report('Bio') { 10_000_000.times { fasta2.each { | > entry2| } } } > +hash0=Hash.new > +hash1=Hash.new > +hash2=Hash.new > + > +Benchmark.bm(8) do |timer| > + GC.enable;GC.start;GC.disable; > + timer.report('Bio') { i=0; fasta2.each { |entry2| i+=1; hash2 > [entry2.definition + i.to_s] = entry2.seq[2..25]} } > + hash2 = nil; GC.enable;GC.start;GC.disable; > + timer.report('Hack') { i=0; fasta0.each { |entry1| i+=1; hash0 > [entry1[:seq_name] + i.to_s] = entry1[:seq][2..25]} } > + hash0 = nil; GC.enable;GC.start;GC.disable; > + timer.report('Hack-seq') { i=0; fasta1.each { |entry1| i+=1; > hash1[entry1[:seq_name] + i.to_s] = Bio::Sequence::NA.new(entry1 > [:seq])[2..25]} } > + hash1 = nil; GC.enable;GC.start;GC.disable; > end > > > > > > > -- > Tomoaki NISHIYAMA > > Advanced Science Research Center, > Kanazawa University, > 13-1 Takara-machi, > Kanazawa, 920-0934, Japan > > > On 2010/08/13, at 23:51, Martin Asser Hansen wrote: > >> >> As you stated 3 times faster with the hack, you may be already >> using ruby 1.9. >> >> >> I am using ruby 1.9.1, and I am using a fairly fast computer, but >> I am actually questioning the quality of the code. 
>> >> Anyway, I think 13 or 18 seconds for 100 M entry is fast enough >> and this >> part will not be the bottle neck of any application. >> How fast do you need it be? >> >> Mind you that the Benchmark is performed on StringIO data, and >> that the script does not touch the disk! In a real test, it will >> be much slower! I did not test on real data and more speed issues >> may surface (I have no idea how Ruby's file buffering compares to >> Perl's, performance-wise). >> >> I was contemplating porting some Biopieces (www.biopieces.org) >> from Perl to Ruby. Biopieces are used for everyday slicing and >> dicing of all sorts of biological data in a very simple and >> flexible manner. While Biopieces are not as fast as dedicated >> scripts, they are fast enough for convenient analysis of NGS data, >> but I will not accept a +300% speed penalty (i.e. read_fasta). >> >> I have been trying to get an overview of the code in >> Bio::FastaFormat, but I find it hard to read (that could be >> because I am not used to Ruby, or OO for that matter). It strikes >> me that the FastaFormat class does a number of irrelevant things >> like subparsing comments when not strictly necessary. In fact, the >> FASTA format actually don't use comments prefixed with # >> (semicolon can be used, but I will strongly advice against it >> since most software don't deal with it). Also, parsing is >> dependent on the record separator being '\n' - that could be >> considered a bug. There seem to be an overuse of substitutions, >> transliterations and regex matching. How about keeping it nice an >> tight? ala: >> >> SEP = $/ >> FASTA_REGEX = /\s*>?([^#{SEP}]+)#{SEP}(.+)>?$/ >> >> def get_entry >> block = @io.gets(SEP + ">") >> return nil if block.nil? 
>> >> if block =~ FASTA_REGEX >> seq_name = $1 >> seq = $2 >> else >> raise "Bad FASTA entry->#{block}" >> end >> >> seq.gsub!(/\s/, "") >> end >> >> >> Cheers, >> >> >> Martin >> >> -- >> Tomoaki NISHIYAMA >> >> Advanced Science Research Center, >> Kanazawa University, >> 13-1 Takara-machi, >> Kanazawa, 920-0934, Japan >> >> > > From ngoto at gen-info.osaka-u.ac.jp Sun Aug 15 01:58:35 2010 From: ngoto at gen-info.osaka-u.ac.jp (Naohisa GOTO) Date: Sun, 15 Aug 2010 14:58:35 +0900 Subject: [BioRuby] Benchmarking FASTA file parsing In-Reply-To: <21C908D0-B7B9-4A4A-B47F-13CE26E2B0E6@kenroku.kanazawa-u.ac.jp> References: <363FADBF-2A69-480B-AA61-AFAA2013FB0D@kenroku.kanazawa-u.ac.jp> <46A4F739-340A-4104-AB45-A9E69281D7B3@kenroku.kanazawa-u.ac.jp> <21C908D0-B7B9-4A4A-B47F-13CE26E2B0E6@kenroku.kanazawa-u.ac.jp> Message-ID: <20100815055836.0C8AB1CBC3EC@idnmail.gen-info.osaka-u.ac.jp> On Sat, 14 Aug 2010 23:52:57 +0900 Tomoaki NISHIYAMA wrote: > To my understanding, the subparsing of the definition occurs only > when needed, ie when entry_id, identifiers, gi, etc. is called, in > current code. > If only definition is called, it is not further parsed. Right. > Careful coding to reduce object creation might contribute to speed up. > One of questionable variable is > @entry_overrun > Is this variable and attr_reader :entry_overrun > really required yet or is just a trace of older code? > Goto-San The @entry_overrun has two purposes. 1. Adjustment of file position. The separator used to read a fasta entry is "\n>", but the ">" should be belonging to the next entry. To adjust this, the last ">" is stored to @entry_overrun. The Bio::FlatFile wrapper will use the content of @entry_overrun in the next time of reading. In addition, it is used to get proper file positions when indexing fasta files. 2. Integrity of data format In Bio::FastaFormat.new(str), if the str contains two or more fasta data, the sequence could be wrong with a naive parser. 
For example, for ">test1\nATATATAT\n>test2\nGCGCGCGC\n", the sequence could be "ATATAT>test2GCGCGCGC" without the cutting process of the trailing entries. In addition, to store the removed element to @entry_overrun may help debugging of user's code and might prevent data loss. Indeed, in the current code, both 1 and 2 are done at a time with the lines @data.sub!(/^>.*/m, '') # remove trailing entries for sure @entry_overrun = $& The 1 might be skipped when reading all data at a time without file positions. The 2 might be skipped if we can ignore such kind of mistakes to give two or more entries to the Bio::FastaFormat.new. -- Naohisa Goto ngoto at gen-info.osaka-u.ac.jp / ng at bioruby.org From tomoakin at kenroku.kanazawa-u.ac.jp Sun Aug 15 02:19:03 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Sun, 15 Aug 2010 15:19:03 +0900 Subject: [BioRuby] Benchmarking FASTA file parsing In-Reply-To: <20100815055836.0C8AB1CBC3EC@idnmail.gen-info.osaka-u.ac.jp> References: <363FADBF-2A69-480B-AA61-AFAA2013FB0D@kenroku.kanazawa-u.ac.jp> <46A4F739-340A-4104-AB45-A9E69281D7B3@kenroku.kanazawa-u.ac.jp> <21C908D0-B7B9-4A4A-B47F-13CE26E2B0E6@kenroku.kanazawa-u.ac.jp> <20100815055836.0C8AB1CBC3EC@idnmail.gen-info.osaka-u.ac.jp> Message-ID: Hi, > 1. Adjustment of file position. > The separator used to read a fasta entry is "\n>", but the ">" > should be belonging to the next entry. To adjust this, the last > ">" is stored to @entry_overrun. The Bio::FlatFile wrapper will > use the content of @entry_overrun in the next time of reading. I first thought as such, but I could not find the code that actually use it. Could you specify where it is used? I could find only several places defining it. Maybe there was a reformation of Flatfile buffering to use ungets but not entry_overrun? 
#at bioruby/lib/bio/ $ grep entry_overrun * */* */*/* */*/*/* db/fasta.rb:# attr_reader :entry_overrun db/fasta.rb:# @entry_overrun = $& db/fastq.rb: # entry_overrun db/fastq.rb: attr_reader :entry_overrun db/fastq.rb: @entry_overrun = sc.rest db/nbrf.rb: @entry_overrun = $& db/nbrf.rb: attr_reader :entry_overrun db/newick.rb: @entry_overrun = $1 db/newick.rb: attr_reader :entry_overrun appl/blast/format0.rb: @entry_overrun = $1 appl/blast/format0.rb: attr_reader :entry_overrun appl/blast/rpsblast.rb: @entry_overrun = $1 appl/fasta/format10.rb: @entry_overrun = overruns.join('') appl/fasta/format10.rb: attr_reader :entry_overrun appl/spidey/report.rb: @entry_overrun = $1 appl/spidey/report.rb: attr_reader :entry_overrun -- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan From ngoto at gen-info.osaka-u.ac.jp Sun Aug 15 02:39:22 2010 From: ngoto at gen-info.osaka-u.ac.jp (Naohisa GOTO) Date: Sun, 15 Aug 2010 15:39:22 +0900 Subject: [BioRuby] Benchmarking FASTA file parsing In-Reply-To: References: <363FADBF-2A69-480B-AA61-AFAA2013FB0D@kenroku.kanazawa-u.ac.jp> <46A4F739-340A-4104-AB45-A9E69281D7B3@kenroku.kanazawa-u.ac.jp> <21C908D0-B7B9-4A4A-B47F-13CE26E2B0E6@kenroku.kanazawa-u.ac.jp> <20100815055836.0C8AB1CBC3EC@idnmail.gen-info.osaka-u.ac.jp> Message-ID: <20100815063922.843861CBC3C8@idnmail.gen-info.osaka-u.ac.jp> On Sun, 15 Aug 2010 15:19:03 +0900 Tomoaki NISHIYAMA wrote: > Hi, > > > 1. Adjustment of file position. > > The separator used to read a fasta entry is "\n>", but the ">" > > should be belonging to the next entry. To adjust this, the last > > ">" is stored to @entry_overrun. The Bio::FlatFile wrapper will > > use the content of @entry_overrun in the next time of reading. > > I first thought as such, but I could not find the code that actually > use it. Could you specify where it is used? The "adjustment of file position" have already been replaced by a constant DELIMITER_OVERRUN. 
I'm sorry I've forgotten things. So, currently, only the role of the 2 is expected. -- Naohisa Goto ngoto at gen-info.osaka-u.ac.jp / ng at bioruby.org From pjotr.public14 at thebird.nl Mon Aug 16 07:22:56 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Mon, 16 Aug 2010 13:22:56 +0200 Subject: [BioRuby] GFF3 In-Reply-To: <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> Message-ID: <20100816112256.GA4509@thebird.nl> On Fri, Aug 13, 2010 at 12:12:05AM +0900, Naohisa GOTO wrote: > > Now, to print FASTA I now do: > > > > gff3.sequences.each do | item | > > print item.to_fasta(item.entry_id, 70) > > end > > gff3.sequences.each do | item | > print item.output(:fasta) > end As it stands, it is not a direct replacement as the entry_id gets printed twice. Also when I replace line 971: @sequences.collect { |s| s.to_fasta(s.entry_id, 70) }.join('') with the output(:fasta) equivalent, the unit test in BioRuby fails, because the ID becomes 'test01 test01' instead of 'test01'. Does it mean we have to modify 's', to get the proper output? Pj. From pjotr.public14 at thebird.nl Mon Aug 16 08:05:30 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Mon, 16 Aug 2010 14:05:30 +0200 Subject: [BioRuby] GFF3 In-Reply-To: <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> Message-ID: <20100816120530.GA4996@thebird.nl> The GFF3 module parses GFF3 files and stores them in memory. We could do something with that data. Most people will want to fetch mRNA and CDS's. BioPerl has some similar facility. How about adding a module GFF3::Sequence with methods that fetch the mRNA (splicing) variants and CDS's that belong to an ID? Or do you think an implementation would be ambiguous? Or is that already in there? I must admit I can't find it. 
Pj. From ngoto at gen-info.osaka-u.ac.jp Mon Aug 16 08:17:36 2010 From: ngoto at gen-info.osaka-u.ac.jp (Naohisa GOTO) Date: Mon, 16 Aug 2010 21:17:36 +0900 Subject: [BioRuby] GFF3 In-Reply-To: <20100816112256.GA4509@thebird.nl> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816112256.GA4509@thebird.nl> Message-ID: <20100816121736.A60491CBC3C2@idnmail.gen-info.osaka-u.ac.jp> On Mon, 16 Aug 2010 13:22:56 +0200 Pjotr Prins wrote: > On Fri, Aug 13, 2010 at 12:12:05AM +0900, Naohisa GOTO wrote: > > > Now, to print FASTA I now do: > > > > > > gff3.sequences.each do | item | > > > print item.to_fasta(item.entry_id, 70) > > > end > > > > gff3.sequences.each do | item | > > print item.output(:fasta) > > end > > As it stands, it is not a direct replacement as the entry_id gets > printed twice. Also when I replace This is considered to be a bug. The bug was reported previously, but was postponed. http://lists.open-bio.org/pipermail/bioruby/2009-April/000897.html > line 971: @sequences.collect { |s| s.to_fasta(s.entry_id, 70) }.join('') > > with the output(:fasta) equivalent, the unit test in BioRuby fails, > because the ID becomes 'test01 test01' instead of 'test01'. > > Does it mean we have to modify 's', to get the proper output? 
> The workaround is s.output(:fasta, :header=>s.entry_id, :width=>70) Naohisa Goto ngoto at gen-info.osaka-u.ac.jp / ng at bioruby.org From ngoto at gen-info.osaka-u.ac.jp Mon Aug 16 08:40:28 2010 From: ngoto at gen-info.osaka-u.ac.jp (Naohisa GOTO) Date: Mon, 16 Aug 2010 21:40:28 +0900 Subject: [BioRuby] GFF3 In-Reply-To: <20100816120530.GA4996@thebird.nl> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> Message-ID: <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> On Mon, 16 Aug 2010 14:05:30 +0200 Pjotr Prins wrote: > The GFF3 module parses GFF3 files and stores them in memory. We could > do something with that data. Most people will want to fetch mRNA and > CDS's. BioPerl has some similar facility. > > How about adding a module GFF3::Sequence with methods that fetch the > mRNA (splicing) variants and CDS's that belong to an ID? Or do you > think an implementation would be ambiguous? Currently, the GFF parser in BioRuby is based on lines. Handling the relations between the lines in a GFF3 file will be needed. A simple implementation would be to store all relations into a graph (or graphs) and then extract information. BTW, for extracting sequence, I prefer GFF3::SequenceCutter or ExtractSequence rather than GFF3::Sequence. > Or is that already in there? I must admit I can't find it. No. Naohisa Goto ngoto at gen-info.osaka-u.ac.jp / ng at bioruby.org From tomoakin at kenroku.kanazawa-u.ac.jp Mon Aug 16 08:38:56 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Mon, 16 Aug 2010 21:38:56 +0900 Subject: [BioRuby] GFF3 In-Reply-To: <20100816120530.GA4996@thebird.nl> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> Message-ID: <3DF42522-7CF4-4AF2-8C84-CBC15EC098E6@kenroku.kanazawa-u.ac.jp> Hi, It sounds nice to me to have such a feature. 
Maybe the first thing to do for the implementation is to make a function to collect exons of an mRNA, sort and write join addresses, and finally pass to Sequence::NA.splicing. One thing needed before implementation is how to specify the sequence. GFF files sometimes come without the sequence part and sometimes with the sequence. When the sequence is included in the file it's simple: just use it. If the sequence is not included, maybe we should pass a Hash of Bio::Sequence::NA? When a hash is supplied for a GFF with Sequence, should the hash override the included sequence? Some more exceptional thing would be handling circular genomes with some annotation spanning over the cut site of the genome. -- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan On 2010/08/16, at 21:05, Pjotr Prins wrote: > The GFF3 module parses GFF3 files and stores them in memory. We could > do something with that data. Most people will want to fetch mRNA and > CDS's. BioPerl has some similar facility. > > How about adding a module GFF3::Sequence with methods that fetch the > mRNA (splicing) variants and CDS's that belong to an ID? Or do you > think an implementation would be ambiguous? > > Or is that already in there? I must admit I can't find it. > > Pj. 
> > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby > From tomoakin at kenroku.kanazawa-u.ac.jp Mon Aug 16 08:38:56 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Mon, 16 Aug 2010 21:38:56 +0900 Subject: [BioRuby] GFF3 In-Reply-To: <20100816120530.GA4996@thebird.nl> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> Message-ID: <3DF42522-7CF4-4AF2-8C84-CBC15EC098E6@kenroku.kanazawa-u.ac.jp> Hi, It sounds nice to me to have such a feature. Maybe the first thing to do for the implementation is to make a function to collect exons of an mRNA, sort and write join addresses, and finally pass to Sequence::NA.splicing. One thing needed before implementation is how to specify the sequence. GFF files sometimes come without the sequence part and sometimes with the sequence. When the sequence is included in the file it's simple: just use it. If the sequence is not included, maybe we should pass a Hash of Bio::Sequence::NA? When a hash is supplied for a GFF with Sequence, should the hash override the included sequence? Some more exceptional thing would be handling circular genomes with some annotation spanning over the cut site of the genome. -- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan On 2010/08/16, at 21:05, Pjotr Prins wrote: > The GFF3 module parses GFF3 files and stores them in memory. We could > do something with that data. Most people will want to fetch mRNA and > CDS's. BioPerl has some similar facility. > > How about adding a module GFF3::Sequence with methods that fetch the > mRNA (splicing) variants and CDS's that belong to an ID? Or do you > think an implementation would be ambiguous? > > Or is that already in there? 
I must admit I can't find it. > > Pj. > > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby > From ktym at hgc.jp Mon Aug 16 09:40:25 2010 From: ktym at hgc.jp (Toshiaki Katayama) Date: Mon, 16 Aug 2010 22:40:25 +0900 Subject: [BioRuby] We have 2 and 1/2 days of spare time. Can we help? In-Reply-To: References: Message-ID: <269DBF19-140B-43B4-A42B-9915350A2C9D@hgc.jp> Hi George, Oops, I just realized that I missed to read this thread. Sorry. ;) I'm very surprised and excited to know that you guys will organize a BioHackathon-like event in Kenya. Few hours ago, I finished a Skype meeting with the organizers and learnt about the plan described at http://rsg-ea-bio-sprint-2010.wikispaces.com/ (design of the poster is awesome, good job! :) Please use this mailing list to distill pre-hackathon preparations. We often asked "what can I contribute to the BioRuby project?" but it is usually difficult to assign a target and mentoring on it as the project itself has been self-organized. (The Google Summer of Code will be an exception. Mentors are working really hard and I really appreciate about that.) However, I take this opportunity to suggest several potential targets: (in addition to 1. finishing the newly introduced BioRuby plugin system and 2. supporting Semantic Web technologies on which we have been working on since the 3rd DBCLS BioHackathon http://hackathon3.dbcls.jp/ and Codefest 2010 http://www.open-bio.org/wiki/Codefest_2010) === interfaces to external resources: * API for Ensemble (suggested by Jan Aerts) * API for UCSC (also suggested by Jan) * API for BioMart, InterMine etc. * API for Semantic Web resources (BioGateway, Bio2RDF etc.) 
-- this is what we tried during the last BioHackathon === modern bioinformatics: * handling NGS data - wrappers and parsers for tools and libraries * Proteomics * Immunoinformatics - immunology prediction servers (described by someone during the Skype meeting) === classical bioinformatics: * Do benchmark for existing BioRuby modules to find bottlenecks for improving performance (good example was posted by Martin Hansen http://lists.open-bio.org/pipermail/bioruby/2010-August/001426.html and I think this kind of improvements should be welcomed) * Setting up NCBI's BLAST WWW like interface (with SGE or Cloud backends) is still demanded. People who are working with not-yet-public genome often need to setup this kind of server. How about to create a general Rails plugin using BioRuby which can be easily setup and can perform various sequence similarity search (by BLAST, BLAT, EXONERATE, Bowtie, whatever...) with simple configuration (use DSL to setup target DBs and the computational farm). This project should also target on the downstream processes -- phylogenetic annotations, mapping RNA-Seq data, summarizing statistics, visualization, integration with genome browsers etc. -- and cool UI design, to be chosen as a yet another BLAST+alpha interface. === visualization modules: * BioGraphics (already started by Jan) - genome mapping / comparative genomics? * Interface for Cytoscape - so that we can easily generate beautiful graph visualization within BioRuby (e.g. from Semantic Web data) Note: some samples are already provided by Cytoscape group at http://cytoscape.wodaklab.org/wiki/ScriptingPlugins#Ruby === improving docs: * Writing some tutorials on how you used the BioRuby - this should be done by newbie who will have difficulty with finding solutions on the Internet. They can ask mentors how to solve their problems and summarize the result in HOWTO-like tutorials. 
Some blogs and Wiki pages are the only existing resources as far as I know (and also clearly pointed by Yannick Wurm http://lists.open-bio.org/pipermail/bioruby/2010-July/001373.html) * We should also need to have a document on "how to use GitHub for forking BioRuby", "how to create your own BioRuby plugins" etc. Regards Toshiaki Katayama On 2010/08/10, at 16:37, George Githinji wrote: > Hi all, > The Regional Students Group for Eastern Africa (RSG-EA) is one of the > grass-root level bodies of the International Society for Computational > Biology Student Council (ISCB-SC). The group has membership from ten > countries namely Burundi, Democratic Republic of Congo, Djibouti, > Eritrea, Ethiopia, Kenya, Rwanda, Somalia, Tanzania and Uganda. > Recently we proposed to organize a biohakathon three day event to: > > 1) Learn how to collaborate on bioinformatics programming projects > using open source tools. > 2) Forge an East African bioinformatics programming community. > 3) Contribute a module/code to Bioruby library. > > The event has been sponsored by a grant from ISCB and ILRI/Beca > bioinformatics platform in Nairobi, Kenya. > > We would like to seek for a suitable project work from one of the > developer(s) and the community. The project should ideally be of > beginner to intermediate level difficulty. A third of the participants > will be of intermediate level programming skills with experience from > Java,Python and Perl. while the rest will have beginner level skills. > > We were also wondering whether it would be possible to get one of the > lead contributors to bioruby project to give a short 15-20 minutes > introductory talk to the participants. We have excellent video > conferencing facilities at the ILRI/Beca hub. The event is slated to > take place in late September. 
> > Thank you > > -- > --------------- > Sincerely > George > KEMRI/Wellcome-Trust Research Program > Skype: george_g2 > Blog: http://biorelated.wordpress.com/ > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby From biopython at maubp.freeserve.co.uk Mon Aug 16 09:40:48 2010 From: biopython at maubp.freeserve.co.uk (Peter) Date: Mon, 16 Aug 2010 14:40:48 +0100 Subject: [BioRuby] GFF3 In-Reply-To: <3DF42522-7CF4-4AF2-8C84-CBC15EC098E6@kenroku.kanazawa-u.ac.jp> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <3DF42522-7CF4-4AF2-8C84-CBC15EC098E6@kenroku.kanazawa-u.ac.jp> Message-ID: On Mon, Aug 16, 2010 at 1:38 PM, Tomoaki NISHIYAMA wrote: > > Some more exceptional thing would be handling circular genomes with some > annotation spanning over the cut site of the genome. > Hi all, In case you were not aware, the GFF3 specification was recently (July 2010) updated to explicitly support circular genomes via a new Is_circular flag in the GFF3 attributes field. This also defines how the co-ordinates of features spanning the origin should be defined. 
http://lists.open-bio.org/pipermail/biopython-dev/2010-July/008003.html http://sourceforge.net/mailarchive/message.php?msg_name=5B028E4D-30B2-4DCA-B41A-FF59ABDC4898%40mac.com Regards, Peter From tomoakin at kenroku.kanazawa-u.ac.jp Mon Aug 16 09:52:02 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Mon, 16 Aug 2010 22:52:02 +0900 Subject: [BioRuby] GFF3 In-Reply-To: <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> Message-ID: <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> Hi, > A simple implementation would be to store all relations into a > graph (or graphs) and then extracting information. I recently wrote a program to extract all the mRNAs, but up to the addresses and not to the sequences. http://github.com/tomoakin/Bioruby-use/blob/master/src/gff2easytrack.rb This is not designed to be very general, but might be useful as a starting point. -- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan On 2010/08/16, at 21:40, Naohisa GOTO wrote: > On Mon, 16 Aug 2010 14:05:30 +0200 > Pjotr Prins wrote: > >> The GFF3 module parses GFF3 files and stores them in memory. We could >> do something with that data. Most people will want to fetch mRNA and >> CDS's. BioPerl has some similar facility. >> >> How about adding a module GFF3::Sequence with methods that fetch the >> mRNA (splicing) variants and CDS's that belong to an ID? Or do you >> think an implementation would be ambiguous? > > Currently, the GFF parser in BioRuby is currently based on lines. > To treat relations in the lines in a GFF3 file will be needed. > A simple implementation would be to store all relations into a > graph (or graphs) and then extracting information. 
> > BTW, for extracting sequence, I prefer GFF3::SequenceCutter or > ExtractSequence rather than GFF3::Sequence. > > >> Or is that already in there? I must admit I can't find it. > > No. > > > Naohisa Goto > ngoto at gen-info.osaka-u.ac.jp / ng at bioruby.org > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby > From tomoakin at kenroku.kanazawa-u.ac.jp Mon Aug 16 10:08:09 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Mon, 16 Aug 2010 23:08:09 +0900 Subject: [BioRuby] csfasta parser Message-ID: <5EAE938B-D406-46D1-B274-086F82F25D05@kenroku.kanazawa-u.ac.jp> Hi, I modified fasta.rb to parse the csfasta format, a modified version of fasta to handle color sequences produced by SOLiD sequencers by Life Technologies (formerly Applied Biosystems). The most important difference is that the sequence is a nucleotide followed by colors specified by numbers [0-3]. When the sequencer fails to assign a color it may be represented by a dot ".". The other difference is that mapping location may be added to the definition line without space but separated with comma ",". Thus the entry_id extraction should be based on comma rather than space. In some cases, more interest is for the mapping location or entry id itself, and the data is not touched at all. So, I made it to store the entry and definition, but the data is not extracted at initialization but left for lazy evaluation. The code can be found at http://github.com/tomoakin/bioruby/blob/master/lib/bio/db/csfasta.rb Note that naseq etc. is not tested. 
-- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan From sararayburn at gmail.com Mon Aug 16 15:09:18 2010 From: sararayburn at gmail.com (Sara Rayburn) Date: Mon, 16 Aug 2010 14:09:18 -0500 Subject: [BioRuby] [GSoC] final project status Message-ID: Hi everyone, Well, GSoC is finished for this summer. Thanks for a great experience working with Bioruby (especially thanks to Christian and Diana for mentoring). It's been a fun and challenging experience and I'm looking forward to continuing to work on the project beyond the scope of GSoC. Here's a quick rundown of the final status of my project: -- The speciation/duplication inference algorithm is implemented in Bio::Algorithm::SDI for fully binary gene & species trees. There is also an alternative algorithm that will reroot the gene tree to minimize the number of duplications. This is in Bio::Algorithm::SDIR. The more generalized algorithm is implemented in Bio::Algorithm::GSDI, but is unverified and not recommended for general use yet. There are things that I'd like to further work on, including refactoring some of the code and improving my unit tests. Also, I'm going to continue working on verifying the generalized algorithm. Again, thanks for a great summer and a great opportunity! Sara Rayburn From georgkam at gmail.com Tue Aug 17 07:39:45 2010 From: georgkam at gmail.com (George Githinji) Date: Tue, 17 Aug 2010 14:39:45 +0300 Subject: [BioRuby] We have 2 and 1/2 days of spare time. Can we help? In-Reply-To: <269DBF19-140B-43B4-A42B-9915350A2C9D@hgc.jp> References: <269DBF19-140B-43B4-A42B-9915350A2C9D@hgc.jp> Message-ID: Thank you very much Toshiaki. We really appreciated the call and the much advice and helpful conversation that we held. We are distilling on the various ideas and we will update you and the list on what will be most appropriate and achievable for us. 
On Mon, Aug 16, 2010 at 4:40 PM, Toshiaki Katayama wrote: > Hi George, > > Oops, I just realized that I missed to read this thread. Sorry. ;) > > I'm very surprised and excited to know that you guys will organize > a BioHackathon-like event in Kenya. > > Few hours ago, I finished a Skype meeting with the organizers > and learnt about the plan described at > > http://rsg-ea-bio-sprint-2010.wikispaces.com/ > (design of the poster is awesome, good job! :) > > Please use this mailing list to distill pre-hackathon preparations. > > We often asked "what can I contribute to the BioRuby project?" but > it is usually difficult to assign a target and mentoring on it > as the project itself has been self-organized. > (The Google Summer of Code will be an exception. Mentors are working > really hard and I really appreciate about that.) > > However, I take this opportunity to suggest several potential targets: > (in addition to 1. finishing the newly introduced BioRuby plugin system > and 2. supporting Semantic Web technologies on which we have been > working on since the 3rd DBCLS BioHackathon http://hackathon3.dbcls.jp/ > and Codefest 2010 http://www.open-bio.org/wiki/Codefest_2010) > > === interfaces to external resources: > > * API for Ensemble (suggested by Jan Aerts) > * API for UCSC (also suggested by Jan) > * API for BioMart, InterMine etc. > * API for Semantic Web resources (BioGateway, Bio2RDF etc.) 
-- this is what we tried during the last BioHackathon > > === modern bioinformatics: > > * handling NGS data - wrappers and parsers for tools and libraries > * Proteomics > * Immunoinformatics - immunology prediction servers (described by someone during the Skype meeting) > > === classical bioinformatics: > > * Do benchmark for existing BioRuby modules to find bottlenecks for improving performance (good example was posted by Martin Hansen http://lists.open-bio.org/pipermail/bioruby/2010-August/001426.html and I think this kind of improvements should be welcomed) > > * Setting up NCBI's BLAST WWW like interface (with SGE or Cloud backends) is still demanded. People who are working with not-yet-public genome often need to setup this kind of server. How about to create a general Rails plugin using BioRuby which can be easily setup and can perform various sequence similarity search (by BLAST, BLAT, EXONERATE, Bowtie, whatever...) with simple configuration (use DSL to setup target DBs and the computational farm). This project should also target on the downstream processes -- phylogenetic annotations, mapping RNA-Seq data, summarizing statistics, visualization, integration with genome browsers etc. -- and cool UI design, to be chosen as a yet another BLAST+alpha interface. > > === visualization modules: > > * BioGraphics (already started by Jan) - genome mapping / comparative genomics? > > * Interface for Cytoscape - so that we can easily generate beautiful graph visualization within BioRuby (e.g. from Semantic Web data) Note: some samples are already provided by Cytoscape group at http://cytoscape.wodaklab.org/wiki/ScriptingPlugins#Ruby > > === improving docs: > > * Writing some tutorials on how you used the BioRuby - this should be done by newbie who will have difficulty with finding solutions on the Internet. They can ask mentors how to solve their problems and summarize the result in HOWTO-like tutorials. 
Some blogs and Wiki pages are the only existing resources as far as I know (and also clearly pointed by Yannick Wurm http://lists.open-bio.org/pipermail/bioruby/2010-July/001373.html) > > * We should also need to have a document on "how to use GitHub for forking BioRuby", "how to create your own BioRuby plugins" etc. > > Regards > Toshiaki Katayama > > > On 2010/08/10, at 16:37, George Githinji wrote: > >> Hi all, >> The Regional Students Group for Eastern Africa (RSG-EA) is one of the >> grass-root level bodies of the International Society for Computational >> Biology Student Council (ISCB-SC). The group has membership from ten >> countries namely Burundi, Democratic Republic of Congo, Djibouti, >> Eritrea, Ethiopia, Kenya, Rwanda, Somalia, Tanzania and Uganda. >> Recently we proposed to organize a biohakathon three day event to: >> >> 1) Learn how to collaborate on bioinformatics programming projects >> using open source tools. >> 2) Forge an East African bioinformatics programming community. >> 3) Contribute a module/code to Bioruby library. >> >> The event has been sponsored by a grant from ISCB and ILRI/Beca >> bioinformatics platform in Nairobi, Kenya. >> >> We would like to seek for a suitable project work from one of the >> developer(s) and the community. The project should ideally be of >> beginner to intermediate level difficulty. A third of the participants >> will be of intermediate level programming skills with experience from >> Java,Python and Perl. while the rest will have beginner level skills. >> >> We were also wondering whether it would be possible to get one of the >> lead contributors to bioruby project to give a short 15-20 minutes >> introductory talk to the participants. We have excellent video >> conferencing facilities at the ILRI/Beca hub. The event is slated to >> take place in late September. 
>> >> Thank you >> >> -- >> --------------- >> Sincerely >> George >> KEMRI/Wellcome-Trust Research Program >> Skype: george_g2 >> Blog: http://biorelated.wordpress.com/ >> _______________________________________________ >> BioRuby Project - http://www.bioruby.org/ >> BioRuby mailing list >> BioRuby at lists.open-bio.org >> http://lists.open-bio.org/mailman/listinfo/bioruby > > -- --------------- Sincerely George KEMRI/Wellcome-Trust Research Program Skype: george_g2 Blog: http://biorelated.wordpress.com/ From pjotr.public14 at thebird.nl Tue Aug 17 12:38:37 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Tue, 17 Aug 2010 18:38:37 +0200 Subject: [BioRuby] GFF3 In-Reply-To: <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> Message-ID: <20100817163837.GA15726@thebird.nl> On Mon, Aug 16, 2010 at 10:52:02PM +0900, Tomoaki NISHIYAMA wrote: > Hi, > >> A simple implementation would be to store all relations into a >> graph (or graphs) and then extracting information. > > I recently wrote a program to extract all the mRNAs, but up to the > addresses > and not to the sequences. > > http://github.com/tomoakin/Bioruby-use/blob/master/src/gff2easytrack.rb > > This is not designed to be very general, but might be useful as a > starting point. Thanks for the nice example. It shows how you can filter GFF without storing everything in memory. Naturally that does not work for extracting all transcripts as GFF does not guarantee ordered data. Still, a good example. What I also like is that there is almost no coupling with other BioRuby modules (other than embedded Fasta). We should keep it that way. Question, have we ever seen GFF files that are not ordered? 
It makes so much sense to keep genes and their components together. I think it is somewhere argued that you can share parts between genes, but how often does that happen - and would they be far apart in the file? Even Lincoln states that you can split GFF files. That would not work if data is not together. I am thinking we can assume that related data comes with each other. This means we only have to cache a limited number records in memory to resolve dependencies. I'll probably write something in the coming week, as I need it. I'll design it to be a BioRuby plugin. For the time being. Pj. From tomoakin at kenroku.kanazawa-u.ac.jp Tue Aug 17 21:09:06 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Wed, 18 Aug 2010 10:09:06 +0900 Subject: [BioRuby] GFF3 In-Reply-To: <20100817163837.GA15726@thebird.nl> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> <20100817163837.GA15726@thebird.nl> Message-ID: Hi, > Thanks for the nice example. It shows how you can filter GFF without > storing everything in memory. Naturally that does not work for > extracting all transcripts as GFF does not guarantee ordered data. I think the code is not dependent on the order of the GFF file. All the exon is stored in an array holding the exons that belong to the mRNA. The output order of the exons is dependent on the input GFF, but in this case the output order is not required to be specified. I could insert exonary.sort{ some ordering rule } before exonary.each{} if the output order matters. (Since this program was not to persist a long time and there was sufficient memory, I didn't care anything to keep the memory usage low). > I am thinking we can assume that related data comes with each other. The nature of gene/genome is not so simple. 
You can read on trans-splicing. So, unlinked parts of the genome can form a mature mRNA and protein thereof. If these parts are collected close in GFF file, then positional order is not preserved. If the GFF is sorted by the position, the parts are in distant position. > share parts between genes, For, shared parts between genes, it is frequent that micro RNA genes are on introns or exons of other genes. Also, for compact genomes, there is quite a number of genes having overlapping UTRs. On chloroplast genomes, even overlapped CDS are known. > Question, have we ever seen GFF files that are not ordered? I've never seen an unordered GFF file, but there could be different orders. 1. The lines are just sorted according to the location. 2. genes are ordered and the parts of the gene comes together. For example the arabidopsis GFF file looks like this and you can see that the feature itself is not ordered that protein 3760 comes earlier than exon 3631. Chr1 TAIR9 gene 3631 5899 . + . ID=AT1G01010;Note=protein_coding_gene;Name=AT1G01010 Chr1 TAIR9 mRNA 3631 5899 . + . ID=AT1G01010.1;Parent=AT1G01010;Name=AT1G01010.1;Index=1 Chr1 TAIR9 protein 3760 5630 . + . ID=AT1G01010.1-Protein;Name=AT1G01010.1;Derives_from=AT1G01010.1 Chr1 TAIR9 exon 3631 3913 . + . Parent=AT1G01010.1 Chr1 TAIR9 five_prime_UTR 3631 3759 . + . Parent=AT1G01010.1 Chr1 TAIR9 CDS 3760 3913 . + 0 Parent=AT1G01010.1,AT1G01010.1-Protein; Chr1 TAIR9 exon 3996 4276 . + . Parent=AT1G01010.1 Chr1 TAIR9 CDS 3996 4276 . + 2 Parent=AT1G01010.1,AT1G01010.1-Protein; > It makes so much sense to keep genes and their components together. I think GFF is an exchange format rather than to work directly with part of it. The data can be relatively easily stored into a RDB and extracted from it. Index on RDB will allow a fast identification of all feature in a specified region or a gene. That subset is good to work with. 
-- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan On 2010/08/18, at 1:38, Pjotr Prins wrote: > On Mon, Aug 16, 2010 at 10:52:02PM +0900, Tomoaki NISHIYAMA wrote: >> Hi, >> >>> A simple implementation would be to store all relations into a >>> graph (or graphs) and then extracting information. >> >> I recently wrote a program to extract all the mRNAs, but up to the >> addresses >> and not to the sequences. >> >> http://github.com/tomoakin/Bioruby-use/blob/master/src/ >> gff2easytrack.rb >> >> This is not designed to be very general, but might be useful as a >> starting point. > > Thanks for the nice example. It shows how you can filter GFF without > storing everything in memory. Naturally that does not work for > extracting all transcripts as GFF does not guarantee ordered data. > > Still, a good example. What I also like is that there is almost no > coupling with other BioRuby modules (other than embedded Fasta). We > should keep it that way. > > Question, have we ever seen GFF files that are not ordered? It makes > so much sense to keep genes and their components together. I think it > is somewhere argued that you can share parts between genes, but how > often does that happen - and would they be far apart in the file? > Even Lincoln states that you can split GFF files. That would not work > if data is not together. > > I am thinking we can assume that related data comes with each other. > This means we only have to cache a limited number records in memory > to resolve dependencies. > > I'll probably write something in the coming week, as I need it. I'll > design it to be a BioRuby plugin. For the time being. > > Pj. 
> From pjotr.public14 at thebird.nl Wed Aug 18 02:12:11 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Wed, 18 Aug 2010 08:12:11 +0200 Subject: [BioRuby] GFF3 In-Reply-To: References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> <20100817163837.GA15726@thebird.nl> Message-ID: <20100818061211.GA18479@thebird.nl> On Wed, Aug 18, 2010 at 10:09:06AM +0900, Tomoaki NISHIYAMA wrote: >> Thanks for the nice example. It shows how you can filter GFF without >> storing everything in memory. Naturally that does not work for >> extracting all transcripts as GFF does not guarantee ordered data. > > I think the code is not dependent on the order of the GFF file. Sorry, I was not talking about your script. I merely stated your example shows *how* it is possible to filter data. My sentence was ambiguous. > I've never seen an unordered GFF file, but there could be different > orders. > 1. The lines are just sorted according to the location. > 2. genes are ordered and the parts of the gene comes together. > For example the arabidopsis GFF file looks like this and you can see > that the > feature itself is not ordered that protein 3760 comes earlier than exon > 3631. Thanks for that. In that case I can store the seekpos of every gene/location and use disk access instead. The way GFF is normally organized would hardly incur a penalty. I do the same with my BigBio FASTA reader. I want to get away from loading everything in memory. We can not assume that memory expansion keeps up with data load. It is fine as an 'optimization', but we should not take it for granted. > I think GFF is an exchange format rather than to work directly with > part of it. The data can be relatively easily stored into a RDB and > extracted from it.
Index on RDB will allow a fast identification of > all feature in a specified region or a gene. That subset is good to > work with. I avoid RDB (assuming you mean RDBMS, and not the Rwanda Development Board), until BioRuby comes with an RDBMS that can be used in a transparent fashion. You can not assume every user has an RDBMS readily available. Pj. From tomoakin at kenroku.kanazawa-u.ac.jp Wed Aug 18 04:21:24 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Wed, 18 Aug 2010 17:21:24 +0900 Subject: [BioRuby] GFF3 In-Reply-To: <20100818061211.GA18479@thebird.nl> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> <20100817163837.GA15726@thebird.nl> <20100818061211.GA18479@thebird.nl> Message-ID: <60EE922B-709C-485A-9268-0D943145E40C@kenroku.kanazawa-u.ac.jp> Hi, Here is how the trans-splicing gene rps12 looks like in the genomic context. http://www.ncbi.nlm.nih.gov/nuccore/7525012?report=graph&v=60000:170000 > In that case I can store the seekpos of every > gene/location and use disk access instead. It should be safe if you scan the data and store the position in the GFF file of first and last record of every gene. > We can not assume that memory expansion keeps up with data load. > It is fine as an 'optimization', but we should not take it for > granted. The gene number within a genome doesn't grow so much. So, the memory becomes problematic only if you are dealing with multiple genomes or more fine features. Saving memory is another kind of optimization. It's good if we can achieve to do with less memory. I just don't care much as far as the problem fit in the memory I can use and run in a reasonable time. 
> I avoid RDB (assuming you mean RDBMS, and not the Rwanda Development > Board), until BioRuby comes with an RDBMS that can be used in a > transparent fashion. You can not assume every user has an RDBMS > readily > available. Oh, I meant relational database. It is for flexibility. Its just easier for me to use a RDBMS than to think of a new way to do without it. So, its just expression of my way. If you are always to query from the gene name, then gene name to seekpos index will be sufficient. But, then I would rather consider to store the parsed data object in PStore than to parse the GFF file again. -- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan From pjotr.public14 at thebird.nl Wed Aug 18 05:59:37 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Wed, 18 Aug 2010 11:59:37 +0200 Subject: [BioRuby] GFF3 In-Reply-To: <60EE922B-709C-485A-9268-0D943145E40C@kenroku.kanazawa-u.ac.jp> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> <20100817163837.GA15726@thebird.nl> <20100818061211.GA18479@thebird.nl> <60EE922B-709C-485A-9268-0D943145E40C@kenroku.kanazawa-u.ac.jp> Message-ID: <20100818095937.GA23171@thebird.nl> On Wed, Aug 18, 2010 at 05:21:24PM +0900, Tomoaki NISHIYAMA wrote: > Hi, > > Here is how the trans-splicing gene rps12 looks like in the genomic > context. > http://www.ncbi.nlm.nih.gov/nuccore/7525012?report=graph&v=60000:170000 Cool, huh :) > The gene number within a genome doesn't grow so much. So, the > memory becomes problematic only if you are dealing with multiple > genomes or more fine features. Yup. That is where we are heading. 100K people project, for example. > Saving memory is another kind of optimization. 
It's good if we can > achieve to do with less memory. I just don't care much as far as > the problem fit in the memory I can use and run in a reasonable > time. Sure, but I think it is short sighted to load everything in RAM when we think in more general BioRuby terms. > Oh, I meant relational database. It is for flexibility. > Its just easier for me to use a RDBMS than to think of a new way > to do without it. So, its just expression of my way. Sure, feel free to use an RDBMS. Just don't expect everyone to. > If you are always to query from the gene name, then gene name to > seekpos index will be sufficient. But, then I would rather consider > to store the parsed data object in PStore than to parse the GFF file > again. PStore is cool too. Pj. From rob.syme at gmail.com Sun Aug 22 02:22:18 2010 From: rob.syme at gmail.com (Rob Syme) Date: Sun, 22 Aug 2010 14:22:18 +0800 Subject: [BioRuby] BioSQL development Message-ID: Is there a particular person who has taken charge of the BioSQL part of Bioruby? I just want confirmation that I'm not using it in completely the wrong way. Are the classes designed so that you generate models for an app (a rails app, for example) that inherit from the Bio::SQL::whatever? eg: $ rails g model Biodatabase name:string authority:string description:text and then in app/model/biodatabase.rb you change: class Biodatabase < ActiveRecord::Base end into: class Biodatabase < Bio::SQL::Biodatabase # which inherits ActiveRecord::Base end If I get a handle on this, I'd be happy to write it up for http://bioruby.open-bio.org/wiki/Tutorial#BioSQL Thanks for all the work by the dev team. Much appreciated - I use bioruby almost every day. 
-r Rob Syme From ktym at hgc.jp Sun Aug 22 03:44:33 2010 From: ktym at hgc.jp (Toshiaki Katayama) Date: Sun, 22 Aug 2010 16:44:33 +0900 Subject: [BioRuby] BioSQL development In-Reply-To: References: Message-ID: <2967C144-A2CF-4FF7-81AA-F0B25E55B9C0@hgc.jp> Hi Rob, Thank you for your will to volunteer for the documentation. Raoul is the current maintainer of the BioSQL module in BioRuby, but I heard that he is on vacation for now. Cheers, Toshiaki Katayama On 2010/08/22, at 15:22, Rob Syme wrote: > Is there a particular person who has taken charge of the BioSQL part of > Bioruby? > I just want confirmation that I'm not using it in completely the wrong way. > > Are the classes designed so that you generate models for an app (a rails > app, for example) that inherit from the Bio::SQL::whatever? eg: > > $ rails g model Biodatabase name:string authority:string description:text > > and then in app/model/biodatabase.rb you change: > class Biodatabase < ActiveRecord::Base > end > > into: > class Biodatabase < Bio::SQL::Biodatabase # which inherits > ActiveRecord::Base > end > > If I get a handle on this, I'd be happy to write it up for > http://bioruby.open-bio.org/wiki/Tutorial#BioSQL > > Thanks for all the work by the dev team. Much appreciated - I use bioruby > almost every day. > -r > > Rob Syme > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby From ju at ncoffee.de Sun Aug 22 04:50:07 2010 From: ju at ncoffee.de (Julian Nordt) Date: Sun, 22 Aug 2010 10:50:07 +0200 Subject: [BioRuby] BioSQL development In-Reply-To: References: Message-ID: Hi Rob, I tried to use the BioSQL part of Bioruby for a webapplication based on rails. With the idea of giving users the capability to upload genomes to the application. 
However the mapping between BioSQL <-> GFF3 is(?)/were not figured out completly, which was one of the reasons why I changed my implementation to CHADO scheme. In regard to your question I did use BioSQL in the following way: def openBioConnection Bio::SQL.establish_connection(:adapter => "mysql", :host => "localhost", :username => "xxx", :password => "xxx", :database => "biosql_development") end def get_sequence(bio_entry_id) openBioConnection() if !bio_entry_id.blank? return Bio::SQL::Biosequence.find_by_bioentry_id(bio_entry_id) end end def setSeqFeatName(id,name) seq_feat = Bio::SQL::Seqfeature.find_by_seqfeature_id(id) Bio::SQL::Seqfeature.update(seq_feat.id, :display_name => name) end (! This does not mean that it is the correct way or intended way of usage as I'm new to ruby/bioruby !) I had to make some small modifications to bio-1.4.0\lib\bio\io\biosql\ar-biosql.rb and biosql.rb, unfortunately I do not remember where exactly. Mostlikely some changes in regard to the connection adapter and some changes to tablenames or similiar stuff. As you can see in ar-biosql.rb all the "classes" for the biosql orm are already defined, so one does not have to define the classes himself. Hope this helps, Julian Nordt On Sun, 22 Aug 2010 08:22:18 +0200, Rob Syme wrote: > Is there a particular person who has taken charge of the BioSQL part of > Bioruby? > I just want confirmation that I'm not using it in completely the wrong > way. > > Are the classes designed so that you generate models for an app (a rails > app, for example) that inherit from the Bio::SQL::whatever? 
eg: > > $ rails g model Biodatabase name:string authority:string description:text > > and then in app/model/biodatabase.rb you change: > class Biodatabase < ActiveRecord::Base > end > > into: > class Biodatabase < Bio::SQL::Biodatabase # which inherits > ActiveRecord::Base > end > > If I get a handle on this, I'd be happy to write it up for > http://bioruby.open-bio.org/wiki/Tutorial#BioSQL > > Thanks for all the work by the dev team. Much appreciated - I use bioruby > almost every day. > -r > > Rob Syme > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby > -- Using Opera's revolutionary email client: http://www.opera.com/mail/ From rob.syme at gmail.com Sun Aug 22 05:23:52 2010 From: rob.syme at gmail.com (Rob Syme) Date: Sun, 22 Aug 2010 17:23:52 +0800 Subject: [BioRuby] BioSQL development In-Reply-To: References: Message-ID: Thanks Toshiaki and Julian, Mapping features from gff may well become an issue. Until Raoul gets back, I might try and keep the data under the Chado schema as suggested by Julian. *If* I can get it clean enough, I'll offer it up for incorporation into bioruby. -r On Sun, Aug 22, 2010 at 4:50 PM, Julian Nordt wrote: > Hi Rob, > > I tried to use the BioSQL part of Bioruby for a webapplication based on > rails. With the idea of giving users the capability to upload genomes to the > application. However the mapping between BioSQL <-> GFF3 is(?)/were not > figured out completly, which was one of the reasons why I changed my > implementation to CHADO scheme. > > > In regard to your question I did use BioSQL in the following way: > > > def openBioConnection > Bio::SQL.establish_connection(:adapter => "mysql", > :host => "localhost", > :username => "xxx", > :password => "xxx", > :database => "biosql_development") > end > > > def get_sequence(bio_entry_id) > openBioConnection() > if !bio_entry_id.blank? 
> return Bio::SQL::Biosequence.find_by_bioentry_id(bio_entry_id) > end > end > > > def setSeqFeatName(id,name) > seq_feat = Bio::SQL::Seqfeature.find_by_seqfeature_id(id) > Bio::SQL::Seqfeature.update(seq_feat.id, :display_name => name) > end > > (! This does not mean that it is the correct way or intended way of usage > as I'm new to ruby/bioruby !) > > I had to make some small modifications to > bio-1.4.0\lib\bio\io\biosql\ar-biosql.rb and biosql.rb, unfortunately I do > not remember where exactly. Mostlikely some changes in regard to the > connection adapter and some changes to tablenames or similiar stuff. > > As you can see in ar-biosql.rb all the "classes" for the biosql orm are > already defined, so one does not have to define the classes himself. > > Hope this helps, > > Julian Nordt > > > > > > > On Sun, 22 Aug 2010 08:22:18 +0200, Rob Syme wrote: > > Is there a particular person who has taken charge of the BioSQL part of >> Bioruby? >> I just want confirmation that I'm not using it in completely the wrong >> way. >> >> Are the classes designed so that you generate models for an app (a rails >> app, for example) that inherit from the Bio::SQL::whatever? eg: >> >> $ rails g model Biodatabase name:string authority:string description:text >> >> and then in app/model/biodatabase.rb you change: >> class Biodatabase < ActiveRecord::Base >> end >> >> into: >> class Biodatabase < Bio::SQL::Biodatabase # which inherits >> ActiveRecord::Base >> end >> >> If I get a handle on this, I'd be happy to write it up for >> http://bioruby.open-bio.org/wiki/Tutorial#BioSQL >> >> Thanks for all the work by the dev team. Much appreciated - I use bioruby >> almost every day. 
>> -r >> >> Rob Syme >> _______________________________________________ >> BioRuby Project - http://www.bioruby.org/ >> BioRuby mailing list >> BioRuby at lists.open-bio.org >> http://lists.open-bio.org/mailman/listinfo/bioruby >> >> > > -- > Using Opera's revolutionary email client: http://www.opera.com/mail/ > From ju at ncoffee.de Sun Aug 22 06:30:03 2010 From: ju at ncoffee.de (Julian Nordt) Date: Sun, 22 Aug 2010 12:30:03 +0200 Subject: [BioRuby] Fwd: Re: BioSQL development In-Reply-To: References: Message-ID: Hi Rob, I just wanted to point that there are for sure people that have a greater experience in regard to the discussed db-schemas and might give you better advice on this topic than I'm able to do. As pointed out I have just recently started to work with bioruby. Hence it might be a good idea to consider further opinions on this topic. However I did wanted to reply to you request, as your scenario sounded somewhat similar to one of the projects I have to work on. -- Julian On Sun, 22 Aug 2010 11:23:52 +0200, Rob Syme wrote: > Thanks Toshiaki and Julian, > > Mapping features from gff may well become an issue. Until Raoul gets > back, I > might try and keep the data under the Chado schema as suggested by > Julian. > *If* I can get it clean enough, I'll offer it up for incorporation into > bioruby. > > -r > > > On Sun, Aug 22, 2010 at 4:50 PM, Julian Nordt wrote: > >> Hi Rob, >> >> I tried to use the BioSQL part of Bioruby for a webapplication based on >> rails. With the idea of giving users the capability to upload genomes >> to the >> application. However the mapping between BioSQL <-> GFF3 is(?)/were not >> figured out completly, which was one of the reasons why I changed my >> implementation to CHADO scheme. 
>> >> >> In regard to your question I did use BioSQL in the following way: >> >> >> def openBioConnection >> Bio::SQL.establish_connection(:adapter => "mysql", >> :host => "localhost", >> :username => "xxx", >> :password => "xxx", >> :database => "biosql_development") >> end >> >> >> def get_sequence(bio_entry_id) >> openBioConnection() >> if !bio_entry_id.blank? >> return Bio::SQL::Biosequence.find_by_bioentry_id(bio_entry_id) >> end >> end >> >> >> def setSeqFeatName(id,name) >> seq_feat = Bio::SQL::Seqfeature.find_by_seqfeature_id(id) >> Bio::SQL::Seqfeature.update(seq_feat.id, :display_name => name) >> end >> >> (! This does not mean that it is the correct way or intended way of >> usage >> as I'm new to ruby/bioruby !) >> >> I had to make some small modifications to >> bio-1.4.0\lib\bio\io\biosql\ar-biosql.rb and biosql.rb, unfortunately I >> do >> not remember where exactly. Mostlikely some changes in regard to the >> connection adapter and some changes to tablenames or similiar stuff. >> >> As you can see in ar-biosql.rb all the "classes" for the biosql orm are >> already defined, so one does not have to define the classes himself. >> >> Hope this helps, >> >> Julian Nordt >> >> >> >> >> >> >> On Sun, 22 Aug 2010 08:22:18 +0200, Rob Syme wrote: >> >> Is there a particular person who has taken charge of the BioSQL part of >>> Bioruby? >>> I just want confirmation that I'm not using it in completely the wrong >>> way. >>> >>> Are the classes designed so that you generate models for an app (a >>> rails >>> app, for example) that inherit from the Bio::SQL::whatever? 
eg: >>> >>> $ rails g model Biodatabase name:string authority:string >>> description:text >>> >>> and then in app/model/biodatabase.rb you change: >>> class Biodatabase < ActiveRecord::Base >>> end >>> >>> into: >>> class Biodatabase < Bio::SQL::Biodatabase # which inherits >>> ActiveRecord::Base >>> end >>> >>> If I get a handle on this, I'd be happy to write it up for >>> http://bioruby.open-bio.org/wiki/Tutorial#BioSQL >>> >>> Thanks for all the work by the dev team. Much appreciated - I use >>> bioruby >>> almost every day. >>> -r >>> >>> Rob Syme >>> _______________________________________________ >>> BioRuby Project - http://www.bioruby.org/ >>> BioRuby mailing list >>> BioRuby at lists.open-bio.org >>> http://lists.open-bio.org/mailman/listinfo/bioruby >>> >>> >> >> -- >> Using Opera's revolutionary email client: http://www.opera.com/mail/ >> -- Using Opera's revolutionary email client: http://www.opera.com/mail/ From hlapp at drycafe.net Sun Aug 22 10:02:01 2010 From: hlapp at drycafe.net (Hilmar Lapp) Date: Sun, 22 Aug 2010 10:02:01 -0400 Subject: [BioRuby] Fwd: Re: BioSQL development In-Reply-To: References: Message-ID: Is the issue with GFF3 in the Bioruby to BioSQL mapping, or is somehow in the BioSQL schema? I recall there was a thread on GFF recently which I wasn't able to follow, so if the answer is in that thread and isn't easy to sum up here, just point me there. -hilmar On Aug 22, 2010, at 6:30 AM, Julian Nordt wrote: > Hi Rob, > > I just wanted to point that there are for sure people that have a > greater > experience in regard to the discussed db-schemas and might give you > better > advice on this topic than I'm able to do. As pointed out I have just > recently started to work with bioruby. Hence it might be a good idea > to > consider further opinions on this topic. > > However I did wanted to reply to you request, as your scenario sounded > somewhat similar to one of the projects I have to work on. 
> > -- Julian > > On Sun, 22 Aug 2010 11:23:52 +0200, Rob Syme > wrote: > >> Thanks Toshiaki and Julian, >> >> Mapping features from gff may well become an issue. Until Raoul >> gets back, I >> might try and keep the data under the Chado schema as suggested by >> Julian. >> *If* I can get it clean enough, I'll offer it up for incorporation >> into >> bioruby. >> >> -r >> >> >> On Sun, Aug 22, 2010 at 4:50 PM, Julian Nordt wrote: >> >>> Hi Rob, >>> >>> I tried to use the BioSQL part of Bioruby for a webapplication >>> based on >>> rails. With the idea of giving users the capability to upload >>> genomes to the >>> application. However the mapping between BioSQL <-> GFF3 is(?)/ >>> were not >>> figured out completly, which was one of the reasons why I changed my >>> implementation to CHADO scheme. >>> >>> >>> In regard to your question I did use BioSQL in the following way: >>> >>> >>> def openBioConnection >>> Bio::SQL.establish_connection(:adapter => "mysql", >>> :host => "localhost", >>> :username => "xxx", >>> :password => "xxx", >>> :database => "biosql_development") >>> end >>> >>> >>> def get_sequence(bio_entry_id) >>> openBioConnection() >>> if !bio_entry_id.blank? >>> return Bio::SQL::Biosequence.find_by_bioentry_id(bio_entry_id) >>> end >>> end >>> >>> >>> def setSeqFeatName(id,name) >>> seq_feat = Bio::SQL::Seqfeature.find_by_seqfeature_id(id) >>> Bio::SQL::Seqfeature.update(seq_feat.id, :display_name => name) >>> end >>> >>> (! This does not mean that it is the correct way or intended way >>> of usage >>> as I'm new to ruby/bioruby !) >>> >>> I had to make some small modifications to >>> bio-1.4.0\lib\bio\io\biosql\ar-biosql.rb and biosql.rb, >>> unfortunately I do >>> not remember where exactly. Mostlikely some changes in regard to the >>> connection adapter and some changes to tablenames or similiar stuff. 
>>> >>> As you can see in ar-biosql.rb all the "classes" for the biosql >>> orm are >>> already defined, so one does not have to define the classes himself. >>> >>> Hope this helps, >>> >>> Julian Nordt >>> >>> >>> >>> >>> >>> >>> On Sun, 22 Aug 2010 08:22:18 +0200, Rob Syme >>> wrote: >>> >>> Is there a particular person who has taken charge of the BioSQL >>> part of >>>> Bioruby? >>>> I just want confirmation that I'm not using it in completely the >>>> wrong >>>> way. >>>> >>>> Are the classes designed so that you generate models for an app >>>> (a rails >>>> app, for example) that inherit from the Bio::SQL::whatever? eg: >>>> >>>> $ rails g model Biodatabase name:string authority:string >>>> description:text >>>> >>>> and then in app/model/biodatabase.rb you change: >>>> class Biodatabase < ActiveRecord::Base >>>> end >>>> >>>> into: >>>> class Biodatabase < Bio::SQL::Biodatabase # which inherits >>>> ActiveRecord::Base >>>> end >>>> >>>> If I get a handle on this, I'd be happy to write it up for >>>> http://bioruby.open-bio.org/wiki/Tutorial#BioSQL >>>> >>>> Thanks for all the work by the dev team. Much appreciated - I use >>>> bioruby >>>> almost every day. 
>>>> -r >>>> >>>> Rob Syme >>>> _______________________________________________ >>>> BioRuby Project - http://www.bioruby.org/ >>>> BioRuby mailing list >>>> BioRuby at lists.open-bio.org >>>> http://lists.open-bio.org/mailman/listinfo/bioruby >>>> >>>> >>> >>> -- >>> Using Opera's revolutionary email client: http://www.opera.com/mail/ >>> > > > -- > Using Opera's revolutionary email client: http://www.opera.com/mail/ > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby -- =========================================================== : Hilmar Lapp -:- Durham, NC -:- hlapp at drycafe dot net : =========================================================== From rob.syme at gmail.com Sun Aug 22 10:17:45 2010 From: rob.syme at gmail.com (Rob Syme) Date: Sun, 22 Aug 2010 22:17:45 +0800 Subject: [BioRuby] Fwd: Re: BioSQL development In-Reply-To: References: Message-ID: I've had a look around and a pretty solid mapping seems to be available: http://www.biosql.org/wiki/Annotation_Mapping#GFF3 Blue collar bioinformatics gave it a shot here: http://bcbio.wordpress.com/2009/02/22/exploring-bioperl-genbank-to-gff-mapping/ -r On 22 Aug 2010 22:02, "Hilmar Lapp" wrote: Is the issue with GFF3 in the Bioruby to BioSQL mapping, or is somehow in the BioSQL schema? I recall there was a thread on GFF recently which I wasn't able to follow, so if the answer is in that thread and isn't easy to sum up here, just point me there. -hilmar On Aug 22, 2010, at 6:30 AM, Julian Nordt wrote: > Hi Rob, > > I just wanted to point that there ... -- =========================================================== : Hilmar Lapp -:- Durham, NC -:- hlapp at drycafe dot net : =========================================================== _______________________________________________ BioRuby Project - http://www.bioruby.org/ BioRu... 
From ju at ncoffee.de Sun Aug 22 11:17:44 2010 From: ju at ncoffee.de (Julian Nordt) Date: Sun, 22 Aug 2010 17:17:44 +0200 Subject: [BioRuby] Fwd: Re: BioSQL development In-Reply-To: References: Message-ID: One more thing in regard to the mapping between BioSQL and GFF3: I tried to follow the mapping given by the biosql wiki and blue collar bioinformatics. The mapping is acceptable in the sense that you can store *most* or even all (?) of the features that GFF3 offers. The further I got with the development, though, the less clear things became to me, especially in terms of the "attribute" column. If you compare the table at the biosql wiki (for the attribute column) with the one at blue collar bioinformatics, you will notice that there are keywords that occur in one table but not in the other. That is not to mention the todos on the wiki regarding the "standard" columns. I haven't looked through blue collar's code in that much detail, though; maybe the answer is given there. However, I wrote a small library that managed to store most - but not all - of the given information in the GFF3 files correctly to BioSQL. There were some points where the mapping was unclear to me and where I stored the given information where I thought it would fit best. The fact that I chose a standard db schema to avoid any ambiguity, together with the performance issues I experienced with MYSQL+Rails (not related to BioSQL) in the project, was enough for me to switch to CHADO backed by POSTGRES. The documentation regarding CHADO is in my opinion richer and, most importantly, one can follow gmod_bulk_load_gff3.pl for the mapping relatively easily, since it is well documented. I would very much welcome other opinions on the topic, especially in combination with the use of web applications.
-- Julian On Sun, 22 Aug 2010 16:17:45 +0200, Rob Syme wrote: > I've had a look around and a pretty solid mapping seems to be available: > http://www.biosql.org/wiki/Annotation_Mapping#GFF3 > > Blue collar bioinformatics gave it a shot here: > http://bcbio.wordpress.com/2009/02/22/exploring-bioperl-genbank-to-gff-mapping/ > > -r > > On 22 Aug 2010 22:02, "Hilmar Lapp" wrote: > Is the issue with GFF3 in the Bioruby to BioSQL mapping, or is somehow in > the BioSQL schema? > > I recall there was a thread on GFF recently which I wasn't able to > follow, > so if the answer is in that thread and isn't easy to sum up here, just > point > me there. > > -hilmar > > > > On Aug 22, 2010, at 6:30 AM, Julian Nordt wrote: > >> Hi Rob, >> >> I just wanted to point that there ... -- Using Opera's revolutionary email client: http://www.opera.com/mail/ From pjotr.public14 at thebird.nl Mon Aug 23 08:16:16 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Mon, 23 Aug 2010 14:16:16 +0200 Subject: [BioRuby] GFF3 In-Reply-To: <20100818095937.GA23171@thebird.nl> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> <20100817163837.GA15726@thebird.nl> <20100818061211.GA18479@thebird.nl> <60EE922B-709C-485A-9268-0D943145E40C@kenroku.kanazawa-u.ac.jp> <20100818095937.GA23171@thebird.nl> Message-ID: <20100823121616.GC2223@thebird.nl> Hi, I am in the process of providing GFF3 support for extracting mRNA, exons and CDSs from a GFF file. The BioRuby plugin is at: http://github.com/pjotrp/bioruby-gff3-plugin and the writeup is at: http://thebird.nl/bioruby/BioRuby_GFF3.html You are invited to comment on its contents. The first genome I am trying has over 600Mb of data, which, sadly, won't fit in a 2Gb RAM Thinkpad. 
I could use a large memory server or database, but that I consider cheating ;). BTW I am not surprised GFF3 support in, for example, BioSQL is patchy. The GFF3 standard is somewhat loosely defined, and open to interpretation. Not that it necessarily is a bad thing, though it is probably impossible to write the all encompassing parser. See the writeup. Pj. From ktym at hgc.jp Thu Aug 26 02:04:04 2010 From: ktym at hgc.jp (Toshiaki Katayama) Date: Thu, 26 Aug 2010 15:04:04 +0900 Subject: [BioRuby] BioRuby paper is just published Message-ID: Dear all, After 10 years of development, the BioRuby paper is finally published in the Bioinformatics journal. The article is open access, so please take a look. BioRuby: Bioinformatics software for the Ruby programming language Naohisa Goto, Pjotr Prins, Mitsuteru Nakao, Raoul Bonnal, Jan Aerts and Toshiaki Katayama Bioinformatics 2010; doi: 10.1093/bioinformatics/btq475 Abstract: http://bioinformatics.oxfordjournals.org/cgi/content/abstract/btq475 PDF: http://bioinformatics.oxfordjournals.org/cgi/reprint/btq475 For future publications, please cite this paper when you use the BioRuby library for your work. :-) We sincerely thank all contributors (http://bioruby.open-bio.org/wiki/Contributors) so far. We are very sorry that we could not include all of your names in the manuscript due to the space limitation. I'd like to take this opportunity to thank Pjotr Prins, who made this happen by hosting regular Skype meetings and who worked very hard on drafting and editing the manuscript, as a joint first author. I also thank DBCLS (Database Center for Life Science, Japan) for giving us several chances to meet each other by hosting the DBCLS BioHackathons (http://hackathon3.dbcls.jp http://www.ncbi.nlm.nih.gov/pubmed/20727200). The draft of the BioRuby paper emerged during the hackathons, and that's why we have only 6 authors in this publication. I ask for your kind understanding on this.
The BioRuby project was originally started at the KEGG laboratory in Kyoto University and some resources are now hosted by Human Genome Center in Tokyo University, so I'd like to express my appreciation to these two institutes as well. Additionally, I also thank IPA (Information-technology Promotion Agency Japan) for 1 year grant in 2005 which greatly extended our motivation for further developments in coming years. Best Regards, Toshiaki Katayama From biopython at maubp.freeserve.co.uk Thu Aug 26 04:24:57 2010 From: biopython at maubp.freeserve.co.uk (Peter) Date: Thu, 26 Aug 2010 09:24:57 +0100 Subject: [BioRuby] BioRuby paper is just published In-Reply-To: References: Message-ID: On Thu, Aug 26, 2010 at 7:04 AM, Toshiaki Katayama wrote: > > Dear all, > > After 10 years of development, the BioRuby paper is finally published in the > Bioinformatics journal. The article is open access, so please take a look. > > BioRuby: Bioinformatics software for the Ruby programming language > Naohisa Goto, Pjotr Prins, Mitsuteru Nakao, Raoul Bonnal, Jan Aerts > and Toshiaki Katayama > Bioinformatics 2010; doi: 10.1093/bioinformatics/btq475 > > Abstract: > http://bioinformatics.oxfordjournals.org/cgi/content/abstract/btq475 > > PDF: > http://bioinformatics.oxfordjournals.org/cgi/reprint/btq475 > > For the future publication, please cite this paper when you use the BioRuby > library for your work. :-) > > We sincerely thank all contributors (http://bioruby.open-bio.org/wiki/Contributors) > so far. We are very sorry that we could not include all of your names in the > manuscript due to the space limitation. > > I'd like to take this opportunity to thank Pjotr Prins who has been lead this happen > by hosting regular Skype meetings and worked very hard for drafting and editing > the manuscript, as a joint first author. 
> > I also thank DBCLS (Database Center for Life Science, Japan) for giving us > several chances to meet each other by hosting the DBCLS BioHackathons > (http://hackathon3.dbcls.jp http://www.ncbi.nlm.nih.gov/pubmed/20727200). > Draft of the bioruby paper was emerged during the hackathons, and that's > why we have only 6 authors in this publication. I ask your kind understanding > on this. > > The BioRuby project was originally started at the KEGG laboratory in Kyoto > University and some resources are now hosted by Human Genome Center in > Tokyo University, so I'd like to express my appreciation to these two institutes > as well. Additionally, I also thank IPA (Information-technology Promotion > Agency Japan) for 1 year grant in 2005 which greatly extended our > motivation for further developments in coming years. > > Best Regards, > Toshiaki Katayama > Congratulation Katayama-san, Pjotr, and the rest of the team. This is excellent news. Peter @ Biopython P.S. Do either of you have an account on the OBF news server? Posting an announcement there (even just this email) would be great: http://news.open-bio.org/news/category/obf-projects/bioruby/ Please email me (or OBF support) if you need help with access. From pjotr.public14 at thebird.nl Thu Aug 26 13:53:44 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Thu, 26 Aug 2010 19:53:44 +0200 Subject: [BioRuby] BioRuby paper is just published In-Reply-To: References: Message-ID: <20100826175344.GA10231@thebird.nl> Thanks Peter! I think this paper is an important milestone for BioRuby, and the Bio* projects in general. I think we took care to write the paper in such a way that it will help popularise the concept of OSS Bio* projects, and attract new developers to BioRuby, BioPython, BioJava and others. Just to show what it means, during this year's GSoC running up, someone (I won't mention names) had the gall to say that BioRuby was nowhere and not worth supporting, since we had no peer reviewed publication. 
Doh! Someone needed OSS explained. I guess that argument is buried now. What shines on any Bio* project shines on the others. We look forward to many cross Bio* collaborations. Thanks Peter, Brad, Hilmar, Chris and all others, for being above project interests, and supporting all Bio* projects. In true sportsmanship. I think we have a pretty and attractive paper, which anyone can take to his or her supervisor. That was the goal. Hut hut hut, go OBF! And go BioRuby, BioPython, BioPerl, and BioJava! Pj. On Thu, Aug 26, 2010 at 09:24:57AM +0100, Peter wrote: > On Thu, Aug 26, 2010 at 7:04 AM, Toshiaki Katayama wrote: > > After 10 years of development, the BioRuby paper is finally published in the > > Bioinformatics journal. The article is open access, so please take a look. From rutgeraldo at gmail.com Thu Aug 26 15:23:45 2010 From: rutgeraldo at gmail.com (Rutger Vos) Date: Thu, 26 Aug 2010 20:23:45 +0100 Subject: [BioRuby] BioRuby paper is just published In-Reply-To: <20100826175344.GA10231@thebird.nl> References: <20100826175344.GA10231@thebird.nl> Message-ID: > What shines on any Bio* project shines on the others. We look forward > to many cross Bio* collaborations. Hear, hear. -- Dr. Rutger A. Vos School of Biological Sciences Philip Lyle Building, Level 4 University of Reading Reading RG6 6BX United Kingdom Tel: +44 (0) 118 378 7535 http://www.nexml.org http://rutgervos.blogspot.com From cjfields at illinois.edu Thu Aug 26 15:43:03 2010 From: cjfields at illinois.edu (Chris Fields) Date: Thu, 26 Aug 2010 14:43:03 -0500 Subject: [BioRuby] BioRuby paper is just published In-Reply-To: References: <20100826175344.GA10231@thebird.nl> Message-ID: On Aug 26, 2010, at 2:23 PM, Rutger Vos wrote: >> What shines on any Bio* project shines on the others. We look forward >> to many cross Bio* collaborations. > > Hear, hear. > > -- > Dr. Rutger A. 
Vos > School of Biological Sciences > Philip Lyle Building, Level 4 > University of Reading > Reading > RG6 6BX > United Kingdom > Tel: +44 (0) 118 378 7535 > http://www.nexml.org > http://rutgervos.blogspot.com :) (and, +1) chris From pjotr.public14 at thebird.nl Mon Aug 30 05:10:28 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Mon, 30 Aug 2010 11:10:28 +0200 Subject: [BioRuby] GFF3 In-Reply-To: <60EE922B-709C-485A-9268-0D943145E40C@kenroku.kanazawa-u.ac.jp> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> <20100817163837.GA15726@thebird.nl> <20100818061211.GA18479@thebird.nl> <60EE922B-709C-485A-9268-0D943145E40C@kenroku.kanazawa-u.ac.jp> Message-ID: <20100830091028.GA13445@thebird.nl> On Wed, Aug 18, 2010 at 05:21:24PM +0900, Tomoaki NISHIYAMA wrote: > The gene number within a genome doesn't grow so much. So, the > memory becomes problematic only if you are dealing with multiple > genomes or more fine features. > > Saving memory is another kind of optimization. It's good if we can > achieve to do with less memory. I just don't care much as far as > the problem fit in the memory I can use and run in a reasonable > time. Well, interesting news. The low memory version is actually 50% faster than the InMemory BioRuby edition. On a decent 15Gb server with fast drives (and ruby 1.8.7 (2010-08-16 patchlevel 302) [x86_64-linux]): When I parse a 500Mb GFF3 file, without FASTA information, with BioRuby it consumes 8.5 Gb RAM and takes 20 minutes. My NoCache version takes 1Gb RAM and 13 minutes. On my 2Gb laptop the native BioRuby version never completed (which, in my opinion, is unacceptable). Mine is the naive version - i.e. I only store file seek positions in memory, and reload and parse a record from disk every time. 
The record parser is BioRuby's, not mine. There are no optimizations. Even this is faster than BioRuby's default in memory model - which takes 19 minutes by itself to load and parse the data file; I only use the last 1 minute for digesting information and assembly of sequences. I am not 100% sure why this is, but I know that BioRuby consumes the whole file in memory first, splits it by line and, next, starts parsing GFF. Probably memory allocation and regex are expensive with really large buffers. I think BioRuby needs to provide iterators for on demand parsing of files, rather than big memory blobs. I also do it for FASTA in my BigBio project. It can be done transparently, as both InMemory and NoCache versions use the same algorithm. It will take me some time to complete a write-up on how to approach this for BioRuby, as I am keeping my head low next month. Note that, BioJava provides iteration too, as a default model, though I think their visitor pattern introduces too much complexity. In short: We can use simple Ruby iterators - it will work - and potentially even provides transparent LRU caching. I'll have numbers on that later, as that is my route to speed optimization. I know GFF3 components get reloaded and re-parsed many times. If you want to try, my code is at http://github.com/pjotrp/bioruby-gff3-plugin the current report is at http://thebird.nl/bioruby/BioRuby_GFF3.html Note: you may need my empty line patch for BioRuby to run the InMemory edition (my BioRuby GFF3 branch on github). Pj. From rob.syme at gmail.com Mon Aug 30 05:13:59 2010 From: rob.syme at gmail.com (Rob Syme) Date: Mon, 30 Aug 2010 17:13:59 +0800 Subject: [BioRuby] Chado Mappings and DataMapper vs ActiveRecord Message-ID: I've got some (very) early mappings up for the Chado DB schema (only the cv, general, sequence and pub modules so far). 
http://github.com/robsyme/RubyCHADO I'd be very happy to offer a more final version up for inclusion into BioRuby if others thought that it might be useful. The code is neither clever nor elegant, but it might save somebody else putting together all the relationships/associations in the future. At the moment, the models are based on DataMapper rather than ActiveRecord. DataMapper feels like a better fit to me, but if there are others with strong opinions about ORMs in BioRuby, I'd appreciate the input. -r From tomoakin at kenroku.kanazawa-u.ac.jp Mon Aug 30 22:12:37 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Tue, 31 Aug 2010 11:12:37 +0900 Subject: [BioRuby] GFF3 In-Reply-To: <20100830091028.GA13445@thebird.nl> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> <20100817163837.GA15726@thebird.nl> <20100818061211.GA18479@thebird.nl> <60EE922B-709C-485A-9268-0D943145E40C@kenroku.kanazawa-u.ac.jp> <20100830091028.GA13445@thebird.nl> Message-ID: <954164B5-931F-426C-8833-3CBCA8BA21D3@kenroku.kanazawa-u.ac.jp> Hi, > When I parse a 500Mb GFF3 file, without FASTA information, with > BioRuby it consumes 8.5 Gb RAM and takes 20 minutes. My NoCache > version takes 1Gb RAM and 13 minutes. This sounds nice! > I am not 100% sure why this is, but I know that BioRuby consumes the > whole file in memory first, splits it by line and, next, starts > parsing GFF. Probably memory allocation and regex are expensive with > really large buffers. During the conversation on "Benchmarking FASTA file parsing", I realized that GC takes quite a lot of time if a large memory is to be used. 
The mark and sweep algorithm in Matz's Ruby implementation scans over all the allocated objects every time the GC is run (which is not written in ruby code but implicitly runs if not suppressed). Since ruby-1.9.2 seems to have much better GC performance, I am interested in how the performance compares in ruby-1.9.2. (I am also interested in the GC.disable condition, but this may not work with 15 Gbytes though). Running your script with ruby 1.9 caused several errors related to case/when: removing the colon at the end of each when line, and changing the colon to a newline where the colon is not at the end of the line, was sufficient to run with ruby 1.9.2 (diff at the end). Any one of newline, semicolon, or "then" seems to work. > I only store file seek positions in > memory, and reload and parse a record from disk every time. The other good reason is that the data is perhaps not read from the disk many times but cached by the operating system and retained in memory. So this is not as bad as it sounds. With 15 Gbytes, presumably a 500 Mbyte file need not be flushed.
-- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan diff --git a/bin/gff3-fetch b/bin/gff3-fetch index b8d4718..36e61f7 100755 --- a/bin/gff3-fetch +++ b/bin/gff3-fetch @@ -39,17 +39,17 @@ ARGV.each do | fn | gffdb = Bio::GFFbrowser::GFFdb.new(fn,options) gff = gffdb.assembler case gfftype - when 'mrna'||'mRNA' : + when 'mrna'||'mRNA' gff.each_mRNA_seq do | id, seq | puts ">"+id puts seq end - when 'exon': + when 'exon' gff.each_exon_seq do | id, seq | puts ">"+id puts seq end - when 'CDS': + when 'CDS' gff.each_CDS_seq do | id, seq | puts ">"+id puts seq diff --git a/lib/bio/db/gff/gffdb.rb b/lib/bio/db/gff/gffdb.rb index 5325fb9..9540154 100644 --- a/lib/bio/db/gff/gffdb.rb +++ b/lib/bio/db/gff/gffdb.rb @@ -26,7 +26,7 @@ module Bio cache_recs = options[:cache_records] @assembler = case cache_recs - when :cache_none : + when :cache_none NoCache.new(filename, options) else InMemory.new(filename, options) # default diff --git a/lib/bio/db/gff/gffparser.rb b/lib/bio/db/gff/gffparser.rb index 5522d81..e1ed9db 100644 --- a/lib/bio/db/gff/gffparser.rb +++ b/lib/bio/db/gff/gffparser.rb @@ -30,9 +30,12 @@ module Bio info "Added #{rec.feature_type} with component ID #{id}" else case rec.feature_type - when 'mRNA' || 'SO:0000234' : @mrnalist.add(id,rec) - when 'CDS' || 'SO:0000316' : @cdslist.add(id,rec) - when 'exon' || 'SO:0000147' : @exonlist.add(id,rec) + when 'mRNA' || 'SO:0000234' + @mrnalist.add(id,rec) + when 'CDS' || 'SO:0000316' + @cdslist.add(id,rec) + when 'exon' || 'SO:0000147' + @exonlist.add(id,rec) else if !IGNORE_FEATURES.include?(rec.feature_type) @unrecognized_features[rec.feature_type] = true From pjotr.public14 at thebird.nl Tue Aug 31 02:53:09 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Tue, 31 Aug 2010 08:53:09 +0200 Subject: [BioRuby] GFF3 In-Reply-To: <954164B5-931F-426C-8833-3CBCA8BA21D3@kenroku.kanazawa-u.ac.jp> References: 
<20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> <20100817163837.GA15726@thebird.nl> <20100818061211.GA18479@thebird.nl> <60EE922B-709C-485A-9268-0D943145E40C@kenroku.kanazawa-u.ac.jp> <20100830091028.GA13445@thebird.nl> <954164B5-931F-426C-8833-3CBCA8BA21D3@kenroku.kanazawa-u.ac.jp> Message-ID: <20100831065309.GA20904@thebird.nl> On Tue, Aug 31, 2010 at 11:12:37AM +0900, Tomoaki NISHIYAMA wrote: > During the conversation on "Benchmarking FASTA file parsing", I > realized that GC takes quite a lot of time if a large memory is to > be used. The mark and sweep algorithm in Matz ruby implementation > scans over all the allocated objects every time the GC is run (which > is not written in ruby code but implicitly runs if not suppressed). Yup. No GC is perfect. They all have trade-offs. And, like you say, in particular when you run out of memory it starts to hurt. > Since ruby-1.9.2 seems to have much better GC performance, I am > interested how the performance compares in ruby-1.9.2. (I am also > interested in GC.disable condition, but this may not work with 15 > Gbytes though). The GC should really run on a separate thread (read core). Not sure Ruby 1.9 does that now. The JVM does, so JRuby probably does. When I implement an LRU cache it could also easily run on a separate thread, as returned data is immutable. I may do that, if I find something similar to Erlang actors, for Ruby. This may be it: http://on-ruby.blogspot.com/2008/01/ruby-concurrency-with-actors.html It is something to do later. Parallelized cache handling would really be nice for big data. And, if it looks like a standard Hash to the outside users, it will be easy to implement transparently throughout BioRuby. Anyway, let me add a cache first, and see what it means to performance. 
> Running your script with ruby 1.9 caused several errors, related to > case when : removal of colon at the end of when line and changing > colon to newline if the colon is not at the end of line was > sufficient to run with ruby 1.9.2. (diff at the end) Either one of > newline, semicolon, and "then" seems to work. I still have to migrate to 1.9. Thanks for trying! Next time please fix it on github so I can merge it in easier. I may migrate for using actors. > The other good reason is that the data is perhaps not read from the > disk many times but cached by the operating system and retained on > memory. So this is not as bad as it sounds. Having 15 Gbytes, > presumably 500 Mbytes file need not flushed. Yes. And that is why I started experimenting with NoCache. Seeks are cheap. Even without the OS buffers, disk reads are very very optimized these days (I have done some work on that last year, together with a student Konstantin Tretjakov). Most seeks in GFF3 are even within the standard hardware cache (8/16 Mb) boundary, and are therefore not a problem, even on small machines! With NoCache the file gets read twice, so the penalty should really be 2x max. Which is totally acceptable, if that means we can handle any size data on any machine. And then we can offer both InMemory and NoCache. We can handle any type of big data. Our users win. BioRuby wins. Next to do: I want an LRU cache to prevent *parsing* every record twice. Parsing is the single expensive thing in NoCache. One thing will be interesting: to see what LRU means in conjunction with GC. Pj. From email2ants at gmail.com Tue Aug 31 06:42:12 2010 From: email2ants at gmail.com (Anthony Underwood) Date: Tue, 31 Aug 2010 11:42:12 +0100 Subject: [BioRuby] We have 2 and 1/2 days of spare time. Can we help? In-Reply-To: References: <269DBF19-140B-43B4-A42B-9915350A2C9D@hgc.jp> Message-ID: Hi Please could I put in my plea for a +1 on (Next Generation sequence) NGS parsing. 
This field is becoming huge within bioinformatics and bioruby is lagging when it comes to tools to parse the date, specifically wrappers around the C functions found in samtools. I would have a go myself but have no experience in C so am sure others would do a better job. Thanks Anthony On 17 Aug 2010, at 12:39, George Githinji wrote: > Thank you very much Toshiaki. We really appreciated the call and the > much advice and helpful conversation that we held. > We are distilling on the various ideas and we will update you and the > list on what will be most appropriate and achievable for us. > > > On Mon, Aug 16, 2010 at 4:40 PM, Toshiaki Katayama wrote: >> Hi George, >> >> Oops, I just realized that I missed to read this thread. Sorry. ;) >> >> I'm very surprised and excited to know that you guys will organize >> a BioHackathon-like event in Kenya. >> >> Few hours ago, I finished a Skype meeting with the organizers >> and learnt about the plan described at >> >> http://rsg-ea-bio-sprint-2010.wikispaces.com/ >> (design of the poster is awesome, good job! :) >> >> Please use this mailing list to distill pre-hackathon preparations. >> >> We often asked "what can I contribute to the BioRuby project?" but >> it is usually difficult to assign a target and mentoring on it >> as the project itself has been self-organized. >> (The Google Summer of Code will be an exception. Mentors are working >> really hard and I really appreciate about that.) >> >> However, I take this opportunity to suggest several potential targets: >> (in addition to 1. finishing the newly introduced BioRuby plugin system >> and 2. supporting Semantic Web technologies on which we have been >> working on since the 3rd DBCLS BioHackathon http://hackathon3.dbcls.jp/ >> and Codefest 2010 http://www.open-bio.org/wiki/Codefest_2010) >> >> === interfaces to external resources: >> >> * API for Ensemble (suggested by Jan Aerts) >> * API for UCSC (also suggested by Jan) >> * API for BioMart, InterMine etc. 
>> * API for Semantic Web resources (BioGateway, Bio2RDF etc.) -- this is what we tried during the last BioHackathon >> >> === modern bioinformatics: >> >> * handling NGS data - wrappers and parsers for tools and libraries >> * Proteomics >> * Immunoinformatics - immunology prediction servers (described by someone during the Skype meeting) >> >> === classical bioinformatics: >> >> * Do benchmark for existing BioRuby modules to find bottlenecks for improving performance (good example was posted by Martin Hansen http://lists.open-bio.org/pipermail/bioruby/2010-August/001426.html and I think this kind of improvements should be welcomed) >> >> * Setting up NCBI's BLAST WWW like interface (with SGE or Cloud backends) is still demanded. People who are working with not-yet-public genome often need to setup this kind of server. How about to create a general Rails plugin using BioRuby which can be easily setup and can perform various sequence similarity search (by BLAST, BLAT, EXONERATE, Bowtie, whatever...) with simple configuration (use DSL to setup target DBs and the computational farm). This project should also target on the downstream processes -- phylogenetic annotations, mapping RNA-Seq data, summarizing statistics, visualization, integration with genome browsers etc. -- and cool UI design, to be chosen as a yet another BLAST+alpha interface. >> >> === visualization modules: >> >> * BioGraphics (already started by Jan) - genome mapping / comparative genomics? >> >> * Interface for Cytoscape - so that we can easily generate beautiful graph visualization within BioRuby (e.g. from Semantic Web data) Note: some samples are already provided by Cytoscape group at http://cytoscape.wodaklab.org/wiki/ScriptingPlugins#Ruby >> >> === improving docs: >> >> * Writing some tutorials on how you used the BioRuby - this should be done by newbie who will have difficulty with finding solutions on the Internet. 
They can ask mentors how to solve their problems and summarize the result in HOWTO-like tutorials. Some blogs and Wiki pages are the only existing resources as far as I know (and also clearly pointed by Yannick Wurm http://lists.open-bio.org/pipermail/bioruby/2010-July/001373.html) >> >> * We should also need to have a document on "how to use GitHub for forking BioRuby", "how to create your own BioRuby plugins" etc. >> >> Regards >> Toshiaki Katayama >> >> >> On 2010/08/10, at 16:37, George Githinji wrote: >> >>> Hi all, >>> The Regional Students Group for Eastern Africa (RSG-EA) is one of the >>> grass-root level bodies of the International Society for Computational >>> Biology Student Council (ISCB-SC). The group has membership from ten >>> countries namely Burundi, Democratic Republic of Congo, Djibouti, >>> Eritrea, Ethiopia, Kenya, Rwanda, Somalia, Tanzania and Uganda. >>> Recently we proposed to organize a biohakathon three day event to: >>> >>> 1) Learn how to collaborate on bioinformatics programming projects >>> using open source tools. >>> 2) Forge an East African bioinformatics programming community. >>> 3) Contribute a module/code to Bioruby library. >>> >>> The event has been sponsored by a grant from ISCB and ILRI/Beca >>> bioinformatics platform in Nairobi, Kenya. >>> >>> We would like to seek for a suitable project work from one of the >>> developer(s) and the community. The project should ideally be of >>> beginner to intermediate level difficulty. A third of the participants >>> will be of intermediate level programming skills with experience from >>> Java,Python and Perl. while the rest will have beginner level skills. >>> >>> We were also wondering whether it would be possible to get one of the >>> lead contributors to bioruby project to give a short 15-20 minutes >>> introductory talk to the participants. We have excellent video >>> conferencing facilities at the ILRI/Beca hub. The event is slated to >>> take place in late September. 
>>> >>> Thank you >>> >>> -- >>> --------------- >>> Sincerely >>> George >>> KEMRI/Wellcome-Trust Research Program >>> Skype: george_g2 >>> Blog: http://biorelated.wordpress.com/ >>> _______________________________________________ >>> BioRuby Project - http://www.bioruby.org/ >>> BioRuby mailing list >>> BioRuby at lists.open-bio.org >>> http://lists.open-bio.org/mailman/listinfo/bioruby >> >> > > > > -- > --------------- > Sincerely > George > KEMRI/Wellcome-Trust Research Program > Skype: george_g2 > Blog: http://biorelated.wordpress.com/ > > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby From pjotr.public14 at thebird.nl Tue Aug 31 06:59:55 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Tue, 31 Aug 2010 12:59:55 +0200 Subject: [BioRuby] We have 2 and 1/2 days of spare time. Can we help? In-Reply-To: References: <269DBF19-140B-43B4-A42B-9915350A2C9D@hgc.jp> Message-ID: <20100831105955.GA24001@thebird.nl> I Anthony, I wrote a Ruby wrapper for SAMtools. See http://thebird.nl/biolib/Adding_BioLib_BAM_SAM_Support.html If you want to test and use it, we can move it forward. Pj. On Tue, Aug 31, 2010 at 11:42:12AM +0100, Anthony Underwood wrote: > Hi > > Please could I put in my plea for a +1 on (Next Generation sequence) NGS parsing. This field is becoming huge within bioinformatics and bioruby is lagging when it comes to tools to parse the date, specifically wrappers around the C functions found in samtools. I would have a go myself but have no experience in C so am sure others would do a better job. > > Thanks Anthony > On 17 Aug 2010, at 12:39, George Githinji wrote: > > > Thank you very much Toshiaki. We really appreciated the call and the > > much advice and helpful conversation that we held. 
> > We are distilling on the various ideas and we will update you and the > > list on what will be most appropriate and achievable for us. > > > > > > On Mon, Aug 16, 2010 at 4:40 PM, Toshiaki Katayama wrote: > >> Hi George, > >> > >> Oops, I just realized that I missed to read this thread. Sorry. ;) > >> > >> I'm very surprised and excited to know that you guys will organize > >> a BioHackathon-like event in Kenya. > >> > >> Few hours ago, I finished a Skype meeting with the organizers > >> and learnt about the plan described at > >> > >> http://rsg-ea-bio-sprint-2010.wikispaces.com/ > >> (design of the poster is awesome, good job! :) > >> > >> Please use this mailing list to distill pre-hackathon preparations. > >> > >> We often asked "what can I contribute to the BioRuby project?" but > >> it is usually difficult to assign a target and mentoring on it > >> as the project itself has been self-organized. > >> (The Google Summer of Code will be an exception. Mentors are working > >> really hard and I really appreciate about that.) > >> > >> However, I take this opportunity to suggest several potential targets: > >> (in addition to 1. finishing the newly introduced BioRuby plugin system > >> and 2. supporting Semantic Web technologies on which we have been > >> working on since the 3rd DBCLS BioHackathon http://hackathon3.dbcls.jp/ > >> and Codefest 2010 http://www.open-bio.org/wiki/Codefest_2010) > >> > >> === interfaces to external resources: > >> > >> * API for Ensemble (suggested by Jan Aerts) > >> * API for UCSC (also suggested by Jan) > >> * API for BioMart, InterMine etc. > >> * API for Semantic Web resources (BioGateway, Bio2RDF etc.) 
-- this is what we tried during the last BioHackathon > >> > >> === modern bioinformatics: > >> > >> * handling NGS data - wrappers and parsers for tools and libraries > >> * Proteomics > >> * Immunoinformatics - immunology prediction servers (described by someone during the Skype meeting) > >> > >> === classical bioinformatics: > >> > >> * Do benchmark for existing BioRuby modules to find bottlenecks for improving performance (good example was posted by Martin Hansen http://lists.open-bio.org/pipermail/bioruby/2010-August/001426.html and I think this kind of improvements should be welcomed) > >> > >> * Setting up NCBI's BLAST WWW like interface (with SGE or Cloud backends) is still demanded. People who are working with not-yet-public genome often need to setup this kind of server. How about to create a general Rails plugin using BioRuby which can be easily setup and can perform various sequence similarity search (by BLAST, BLAT, EXONERATE, Bowtie, whatever...) with simple configuration (use DSL to setup target DBs and the computational farm). This project should also target on the downstream processes -- phylogenetic annotations, mapping RNA-Seq data, summarizing statistics, visualization, integration with genome browsers etc. -- and cool UI design, to be chosen as a yet another BLAST+alpha interface. > >> > >> === visualization modules: > >> > >> * BioGraphics (already started by Jan) - genome mapping / comparative genomics? > >> > >> * Interface for Cytoscape - so that we can easily generate beautiful graph visualization within BioRuby (e.g. from Semantic Web data) Note: some samples are already provided by Cytoscape group at http://cytoscape.wodaklab.org/wiki/ScriptingPlugins#Ruby > >> > >> === improving docs: > >> > >> * Writing some tutorials on how you used the BioRuby - this should be done by newbie who will have difficulty with finding solutions on the Internet. 
They can ask mentors how to solve their problems and summarize the result in HOWTO-like tutorials. Some blogs and Wiki pages are the only existing resources as far as I know (and also clearly pointed by Yannick Wurm http://lists.open-bio.org/pipermail/bioruby/2010-July/001373.html) > >> > >> * We should also need to have a document on "how to use GitHub for forking BioRuby", "how to create your own BioRuby plugins" etc. > >> > >> Regards > >> Toshiaki Katayama > >> > >> > >> On 2010/08/10, at 16:37, George Githinji wrote: > >> > >>> Hi all, > >>> The Regional Students Group for Eastern Africa (RSG-EA) is one of the > >>> grass-root level bodies of the International Society for Computational > >>> Biology Student Council (ISCB-SC). The group has membership from ten > >>> countries namely Burundi, Democratic Republic of Congo, Djibouti, > >>> Eritrea, Ethiopia, Kenya, Rwanda, Somalia, Tanzania and Uganda. > >>> Recently we proposed to organize a biohakathon three day event to: > >>> > >>> 1) Learn how to collaborate on bioinformatics programming projects > >>> using open source tools. > >>> 2) Forge an East African bioinformatics programming community. > >>> 3) Contribute a module/code to Bioruby library. > >>> > >>> The event has been sponsored by a grant from ISCB and ILRI/Beca > >>> bioinformatics platform in Nairobi, Kenya. > >>> > >>> We would like to seek for a suitable project work from one of the > >>> developer(s) and the community. The project should ideally be of > >>> beginner to intermediate level difficulty. A third of the participants > >>> will be of intermediate level programming skills with experience from > >>> Java,Python and Perl. while the rest will have beginner level skills. > >>> > >>> We were also wondering whether it would be possible to get one of the > >>> lead contributors to bioruby project to give a short 15-20 minutes > >>> introductory talk to the participants. 
We have excellent video > >>> conferencing facilities at the ILRI/Beca hub. The event is slated to > >>> take place in late September. > >>> > >>> Thank you > >>> > >>> -- > >>> --------------- > >>> Sincerely > >>> George > >>> KEMRI/Wellcome-Trust Research Program > >>> Skype: george_g2 > >>> Blog: http://biorelated.wordpress.com/ > >>> _______________________________________________ > >>> BioRuby Project - http://www.bioruby.org/ > >>> BioRuby mailing list > >>> BioRuby at lists.open-bio.org > >>> http://lists.open-bio.org/mailman/listinfo/bioruby > >> > >> > > > > > > > > -- > > --------------- > > Sincerely > > George > > KEMRI/Wellcome-Trust Research Program > > Skype: george_g2 > > Blog: http://biorelated.wordpress.com/ > > > > _______________________________________________ > > BioRuby Project - http://www.bioruby.org/ > > BioRuby mailing list > > BioRuby at lists.open-bio.org > > http://lists.open-bio.org/mailman/listinfo/bioruby > > > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby From biopython at maubp.freeserve.co.uk Tue Aug 31 07:12:19 2010 From: biopython at maubp.freeserve.co.uk (Peter) Date: Tue, 31 Aug 2010 12:12:19 +0100 Subject: [BioRuby] We have 2 and 1/2 days of spare time. Can we help? In-Reply-To: References: <269DBF19-140B-43B4-A42B-9915350A2C9D@hgc.jp> Message-ID: On Tue, Aug 31, 2010 at 11:42 AM, Anthony Underwood wrote: > > Hi > > Please could I put in my plea for a +1 on (Next Generation sequence) NGS parsing. > This field is becoming huge within bioinformatics and bioruby is lagging when it comes > to tools to parse the date, specifically wrappers around the C functions found in > samtools. I would have a go myself but have no experience in C so am sure others > would do a better job. 
> > Thanks Anthony Anthony - Have you looked at Pjotr's recent work in BioLib to wrap the samtools C API in Ruby (and other languages)? http://lists.open-bio.org/pipermail/biolib-dev/2010-August/000160.html Peter From ngoto at gen-info.osaka-u.ac.jp Tue Aug 31 08:00:47 2010 From: ngoto at gen-info.osaka-u.ac.jp (Naohisa GOTO) Date: Tue, 31 Aug 2010 21:00:47 +0900 Subject: [BioRuby] We have 2 and 1/2 days of spare time. Can we help? In-Reply-To: <20100831105955.GA24001@thebird.nl> References: <269DBF19-140B-43B4-A42B-9915350A2C9D@hgc.jp> <20100831105955.GA24001@thebird.nl> Message-ID: <20100831120048.1FF4C1CBC57E@idnmail.gen-info.osaka-u.ac.jp> Hi, I found samtools-ruby at ISMB 2010 poster session. http://github.com/homonecloco/samtools-ruby The abstract of the presentation is available: http://www.iscb.org/cms_addon/conferences/ismb2010/posterlist.php?cat=J (Poster J52). They are developing GeeFu, Rails based web application for high-throughput genome sequencing, which is using BioRuby. http://github.com/danmaclean/gee_fu Naohisa Goto ngoto at gen-info.osaka-u.ac.jp / ng at bioruby.org On Tue, 31 Aug 2010 12:59:55 +0200 Pjotr Prins wrote: > I Anthony, > > I wrote a Ruby wrapper for SAMtools. See > > http://thebird.nl/biolib/Adding_BioLib_BAM_SAM_Support.html > > If you want to test and use it, we can move it forward. > > Pj. > > On Tue, Aug 31, 2010 at 11:42:12AM +0100, Anthony Underwood wrote: > > Hi > > > > Please could I put in my plea for a +1 on (Next Generation sequence) NGS parsing. This field is becoming huge within bioinformatics and bioruby is lagging when it comes to tools to parse the date, specifically wrappers around the C functions found in samtools. I would have a go myself but have no experience in C so am sure others would do a better job. > > > > Thanks Anthony > > On 17 Aug 2010, at 12:39, George Githinji wrote: > > > > > Thank you very much Toshiaki.
We really appreciated the call and the > > > much advice and helpful conversation that we held. > > > We are distilling on the various ideas and we will update you and the > > > list on what will be most appropriate and achievable for us. > > > > > > > > > On Mon, Aug 16, 2010 at 4:40 PM, Toshiaki Katayama wrote: > > >> Hi George, > > >> > > >> Oops, I just realized that I missed to read this thread. Sorry. ;) > > >> > > >> I'm very surprised and excited to know that you guys will organize > > >> a BioHackathon-like event in Kenya. > > >> > > >> Few hours ago, I finished a Skype meeting with the organizers > > >> and learnt about the plan described at > > >> > > >> http://rsg-ea-bio-sprint-2010.wikispaces.com/ > > >> (design of the poster is awesome, good job! :) > > >> > > >> Please use this mailing list to distill pre-hackathon preparations. > > >> > > >> We often asked "what can I contribute to the BioRuby project?" but > > >> it is usually difficult to assign a target and mentoring on it > > >> as the project itself has been self-organized. > > >> (The Google Summer of Code will be an exception. Mentors are working > > >> really hard and I really appreciate about that.) > > >> > > >> However, I take this opportunity to suggest several potential targets: > > >> (in addition to 1. finishing the newly introduced BioRuby plugin system > > >> and 2. supporting Semantic Web technologies on which we have been > > >> working on since the 3rd DBCLS BioHackathon http://hackathon3.dbcls.jp/ > > >> and Codefest 2010 http://www.open-bio.org/wiki/Codefest_2010) > > >> > > >> === interfaces to external resources: > > >> > > >> * API for Ensemble (suggested by Jan Aerts) > > >> * API for UCSC (also suggested by Jan) > > >> * API for BioMart, InterMine etc. > > >> * API for Semantic Web resources (BioGateway, Bio2RDF etc.) 
-- this is what we tried during the last BioHackathon > > >> > > >> === modern bioinformatics: > > >> > > >> * handling NGS data - wrappers and parsers for tools and libraries > > >> * Proteomics > > >> * Immunoinformatics - immunology prediction servers (described by someone during the Skype meeting) > > >> > > >> === classical bioinformatics: > > >> > > >> * Do benchmark for existing BioRuby modules to find bottlenecks for improving performance (good example was posted by Martin Hansen http://lists.open-bio.org/pipermail/bioruby/2010-August/001426.html and I think this kind of improvements should be welcomed) > > >> > > >> * Setting up NCBI's BLAST WWW like interface (with SGE or Cloud backends) is still demanded. People who are working with not-yet-public genome often need to setup this kind of server. How about to create a general Rails plugin using BioRuby which can be easily setup and can perform various sequence similarity search (by BLAST, BLAT, EXONERATE, Bowtie, whatever...) with simple configuration (use DSL to setup target DBs and the computational farm). This project should also target on the downstream processes -- phylogenetic annotations, mapping RNA-Seq data, summarizing statistics, visualization, integration with genome browsers etc. -- and cool UI design, to be chosen as a yet another BLAST+alpha interface. > > >> > > >> === visualization modules: > > >> > > >> * BioGraphics (already started by Jan) - genome mapping / comparative genomics? > > >> > > >> * Interface for Cytoscape - so that we can easily generate beautiful graph visualization within BioRuby (e.g. from Semantic Web data) Note: some samples are already provided by Cytoscape group at http://cytoscape.wodaklab.org/wiki/ScriptingPlugins#Ruby > > >> > > >> === improving docs: > > >> > > >> * Writing some tutorials on how you used the BioRuby - this should be done by newbie who will have difficulty with finding solutions on the Internet. 
They can ask mentors how to solve their problems and summarize the result in HOWTO-like tutorials. Some blogs and Wiki pages are the only existing resources as far as I know (and also clearly pointed by Yannick Wurm http://lists.open-bio.org/pipermail/bioruby/2010-July/001373.html) > > >> > > >> * We should also need to have a document on "how to use GitHub for forking BioRuby", "how to create your own BioRuby plugins" etc. > > >> > > >> Regards > > >> Toshiaki Katayama > > >> > > >> > > >> On 2010/08/10, at 16:37, George Githinji wrote: > > >> > > >>> Hi all, > > >>> The Regional Students Group for Eastern Africa (RSG-EA) is one of the > > >>> grass-root level bodies of the International Society for Computational > > >>> Biology Student Council (ISCB-SC). The group has membership from ten > > >>> countries namely Burundi, Democratic Republic of Congo, Djibouti, > > >>> Eritrea, Ethiopia, Kenya, Rwanda, Somalia, Tanzania and Uganda. > > >>> Recently we proposed to organize a biohakathon three day event to: > > >>> > > >>> 1) Learn how to collaborate on bioinformatics programming projects > > >>> using open source tools. > > >>> 2) Forge an East African bioinformatics programming community. > > >>> 3) Contribute a module/code to Bioruby library. > > >>> > > >>> The event has been sponsored by a grant from ISCB and ILRI/Beca > > >>> bioinformatics platform in Nairobi, Kenya. > > >>> > > >>> We would like to seek for a suitable project work from one of the > > >>> developer(s) and the community. The project should ideally be of > > >>> beginner to intermediate level difficulty. A third of the participants > > >>> will be of intermediate level programming skills with experience from > > >>> Java,Python and Perl. while the rest will have beginner level skills. > > >>> > > >>> We were also wondering whether it would be possible to get one of the > > >>> lead contributors to bioruby project to give a short 15-20 minutes > > >>> introductory talk to the participants. 
We have excellent video > > >>> conferencing facilities at the ILRI/Beca hub. The event is slated to > > >>> take place in late September. > > >>> > > >>> Thank you > > >>> > > >>> -- > > >>> --------------- > > >>> Sincerely > > >>> George > > >>> KEMRI/Wellcome-Trust Research Program > > >>> Skype: george_g2 > > >>> Blog: http://biorelated.wordpress.com/ > > >>> _______________________________________________ > > >>> BioRuby Project - http://www.bioruby.org/ > > >>> BioRuby mailing list > > >>> BioRuby at lists.open-bio.org > > >>> http://lists.open-bio.org/mailman/listinfo/bioruby > > >> > > >> > > > > > > > > > > > > -- > > > --------------- > > > Sincerely > > > George > > > KEMRI/Wellcome-Trust Research Program > > > Skype: george_g2 > > > Blog: http://biorelated.wordpress.com/ > > > > > > _______________________________________________ > > > BioRuby Project - http://www.bioruby.org/ > > > BioRuby mailing list > > > BioRuby at lists.open-bio.org > > > http://lists.open-bio.org/mailman/listinfo/bioruby > > > > > > _______________________________________________ > > BioRuby Project - http://www.bioruby.org/ > > BioRuby mailing list > > BioRuby at lists.open-bio.org > > http://lists.open-bio.org/mailman/listinfo/bioruby > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby From ralf at ark.in-berlin.de Tue Aug 3 06:58:16 2010 From: ralf at ark.in-berlin.de (Ralf Stephan) Date: Tue, 3 Aug 2010 08:58:16 +0200 Subject: [BioRuby] [PATCH] GO annotations fixes and improvements Message-ID: <40095810-63F1-4919-8946-7CA3802B67B1@ark.in-berlin.de> Hello, seeing the file bio/db/go.rb is seven years old, I have fixed and improved the GO annotations parsing (now GAF1, GAF2, Phenote) and output (GAF1, GAF2) for inclusion in next bioruby version. 
0001-Fix-parsing-of-GAF-1.0-files-preliminary-adaptation.patch 0002-Add-parsing-and-output-of-GAF-2.0-files.patch 0003-Add-documentation-copyright.patch 0004-Add-Phenote-GOA-file-format-parsing-GAF1-output.patch I hope you will accept the patch set. Enjoy, ralf >From 05b435e0e3f791d0fae38a5d76cbc522835bf085 Mon Sep 17 00:00:00 2001 From: R. Stephan Date: Mon, 2 Aug 2010 19:43:58 +0200 Subject: [PATCH] Fix parsing of GAF 1.0 files, preliminary adaptations --- lib/bio/db/go.rb | 42 ++++++++++++++++++++++++++++-------------- 1 files changed, 28 insertions(+), 14 deletions(-) diff --git a/lib/bio/db/go.rb b/lib/bio/db/go.rb index 6b5d539..a8d3f47 100644 --- a/lib/bio/db/go.rb +++ b/lib/bio/db/go.rb @@ -186,6 +186,18 @@ class GO # p [entry.entry_id, entry.evidence, entry.goid] # end # + class ArrayOrString + def initialize(arg) + @var = arg + end + def join(char) + if @var.instance_of? String + then return @var + else return @var.join(char) + end + end + end + class GeneAssociation # < Bio::DB # Delimiter @@ -253,30 +265,34 @@ class GO # attr_reader :assigned_by - + alias entry_id db_object_id - # Parsing an entry (in a line) in the gene_association flatfile. - def initialize(entry) - tmp = entry.chomp.split(/\t/) + # Assign fields of an entry (in a line). + def assign(tmp) @db = tmp[0] @db_object_id = tmp[1] @db_object_symbol = tmp[2] @qualifier = tmp[3] # @goid = tmp[4] - @db_reference = tmp[5].split(/\|/) # + @db_reference = ArrayOrString.new(tmp[5].split(/\|/)) # @evidence = tmp[6] - @with = tmp[7].split(/\|/) # + @with = ArrayOrString.new(tmp[7].split(/\|/)) # @aspect = tmp[8] @db_object_name = tmp[9] # - @db_object_synonym = tmp[10].split(/\|/) # + @db_object_synonym = ArrayOrString.new(tmp[10].split(/\|/)) # @db_object_type = tmp[11] @taxon = tmp[12] # taxon:4932 @date = tmp[13] # 20010118 @assigned_by = tmp[14] end + # Parsing an entry (in a line) in the gene_association flatfile. 
+ def initialize(entry) + tmp = entry.chomp.split(/\t/) + self.assign(tmp) + end # Returns GO_ID in /\d{7}/ format. Giving not nil arg, returns # /GO:\d{7}/ style. @@ -293,17 +309,15 @@ class GO # Bio::GO::GeneAssociation#to_str -> a line of gene_association file. def to_str - return [@db, @db_object_id, @db_object_symbol, @quialifier, @goid, - @qualifier.join("|"), @evidence, @with.join("|"), @aspect, + return [@db, @db_object_id, @db_object_symbol, @qualifier, @goid, + @db_reference.join("|"), @evidence, @with.join("|"), @aspect, @db_object_name, @db_object_synonym.join("|"), @db_object_type, @taxon, @date, @assigned_by].join("\t") end end # class GeneAssociation - - - # = Container class for files in geneontology.org/go/external2go/*2go. +# = Container class for files in geneontology.org/go/external2go/*2go. # # The line syntax is: # @@ -402,8 +416,8 @@ class GO end end # class External2go - -end # class GO + +end end # module Bio -- 1.5.5 >From 1dbca2952239c4028a89a507d1badd5935c9e477 Mon Sep 17 00:00:00 2001 From: R. Stephan Date: Mon, 2 Aug 2010 20:12:36 +0200 Subject: [PATCH] Add parsing and output of GAF 2.0 files --- lib/bio/db/go.rb | 32 ++++++++++++++++++++++++++++++++ 1 files changed, 32 insertions(+), 0 deletions(-) diff --git a/lib/bio/db/go.rb b/lib/bio/db/go.rb index a8d3f47..affbe66 100644 --- a/lib/bio/db/go.rb +++ b/lib/bio/db/go.rb @@ -266,6 +266,11 @@ class GO # attr_reader :assigned_by + attr_reader :annotation_extension + + attr_reader :gene_product_form_id + + alias entry_id db_object_id @@ -286,6 +291,8 @@ class GO @taxon = tmp[12] # taxon:4932 @date = tmp[13] # 20010118 @assigned_by = tmp[14] + @annotation_extension = tmp[15] + @gene_product_form_id = tmp[16] end # Parsing an entry (in a line) in the gene_association flatfile. @@ -317,6 +324,31 @@ class GO end # class GeneAssociation + class GeneAssociation2 < GeneAssociation + + # Iterator through all entries + def self.parser(str) + if block_given? 
+ str.each_line(DELIMITER) {|line| + next if /^!/ =~ line + yield GeneAssociation2.new(line) + } + else + galist = [] + str.each_line(DELIMITER) {|line| + next if /^!/ =~ line + galist << GeneAssociation2.new(line) + } + return galist + end + end + + # Bio::GO::GeneAssociation#to_str -> a line of gene_association file. + def to_str + return [super.to_str, @annotation_extension, @gene_product_form_id].join("\t") + end + end + # = Container class for files in geneontology.org/go/external2go/*2go. # # The line syntax is: -- 1.5.5 >From 4370b2bf3dc53f49334f9fb3948dc2fb584b75e5 Mon Sep 17 00:00:00 2001 From: R. Stephan Date: Mon, 2 Aug 2010 20:28:45 +0200 Subject: [PATCH] Add documentation, copyright --- bin/bioruby | 47 ------ bin/br_biofetch.rb | 47 ------ bin/br_bioflat.rb | 293 ----------------------------------- bin/br_biogetseq.rb | 45 ------ bin/br_pmfetch.rb | 422 --------------------------------------------------- lib/bio/db/go.rb | 21 +++- 6 files changed, 18 insertions(+), 857 deletions(-) delete mode 100755 bin/bioruby delete mode 100755 bin/br_biofetch.rb delete mode 100755 bin/br_bioflat.rb delete mode 100755 bin/br_biogetseq.rb delete mode 100755 bin/br_pmfetch.rb diff --git a/bin/bioruby b/bin/bioruby deleted file mode 100755 index 9980af8..0000000 --- a/bin/bioruby +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env ruby -# -# = BioRuby shell - command line interface for the BioRuby library -# -# Copyright:: Copyright (C) 2005, 2006, 2007 -# Toshiaki Katayama -# License:: The Ruby License -# -# $Id:$ -# - -begin - require 'rubygems' - gem 'bio', '>= 1.1.0' -rescue LoadError - require 'bio' -end -require 'bio/shell' - -# required to run commands (getseq, ls etc.) -include Bio::Shell - -# setup command line options, working directory, and irb configurations -Bio::Shell::Setup.new - -# loading workspace and command history -Bio::Shell.load_session - -# sets default email address for Entrez eUtils. 
-Bio::NCBI.default_email ||= 'staff at bioruby.org' - -# main loop -if Bio::Shell.cache[:rails] - Bio::Shell.cache[:rails].join -else - Signal.trap("SIGINT") do - Bio::Shell.cache[:irb].signal_handle - end - - catch(:IRB_EXIT) do - Bio::Shell.cache[:irb].eval_input - end -end - -# saving workspace, command history and configuration before exit -Bio::Shell.save_session - diff --git a/bin/br_biofetch.rb b/bin/br_biofetch.rb deleted file mode 100755 index 40319cf..0000000 --- a/bin/br_biofetch.rb +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env ruby -# -# = biofetch - BioFetch client -# -# Copyright:: Copyright (C) 2002 -# Toshiaki Katayama -# License:: The Ruby License -# -# $Id: br_biofetch.rb,v 1.4 2007/04/05 23:35:39 trevor Exp $ -# - -require 'bio/io/fetch' - -def usage - default_url = 'http://bioruby.org/cgi-bin/biofetch.rb' - another_url = 'http://www.ebi.ac.uk/cgi-bin/dbfetch' - puts "#{$0} [-s[erver] #{another_url}] db id [style] [format]" - puts " server : URL of the BioFetch CGI (default is #{default_url})" - puts " db : database name (embl, genbank, etc.)" - puts " id : entry id" - puts " style : 'raw' or 'html' (default is 'raw')" - puts " format : change the output format ('default', 'fasta', etc.)" -end - -if ARGV.empty? 
or ARGV[0] =~ /^--?h/ - usage - exit 1 -end - -case ARGV[0] -when /^--?s/ # User specified server - ARGV.shift - serv = Bio::Fetch.new(ARGV.shift) - puts serv.fetch(*ARGV) -when /^--?e/ # EBI server - ARGV.shift - serv = Bio::Fetch.new('http://www.ebi.ac.uk/cgi-bin/dbfetch') - puts serv.fetch(*ARGV) -when /^--?r/ # BioRuby server - ARGV.shift - serv = Bio::Fetch.new('http://bioruby.org/cgi-bin/biofetch.rb') - puts serv.fetch(*ARGV) -else # Default server - puts Bio::Fetch.query(*ARGV) -end - - diff --git a/bin/br_bioflat.rb b/bin/br_bioflat.rb deleted file mode 100755 index 279da9b..0000000 --- a/bin/br_bioflat.rb +++ /dev/null @@ -1,293 +0,0 @@ -#!/usr/bin/env ruby -# -# = bioflat - OBDA flat file indexer (executable) -# -# Copyright:: Copyright (C) 2002 -# Naohisa Goto -# License:: The Ruby License -# -# $Id: br_bioflat.rb,v 1.17 2007/04/05 23:35:39 trevor Exp $ -# - -require 'bio' - -def usage - print <] [options...] [--files] FILES -Update index: - #{$0} --update --location DIR --dbname DBNAME [options...] [--files] FILES - -Create index options: - --primary=UNIQUE set primary namespece to UNIQUE - Default primary/secondary namespaces depend on - each format of flatfiles. - --secondary=KEY set secondary namespaces. - You may use this option many times to specify - more than one namespace. - --add-secondary=KEY add secondary namespaces to default specification. - You can use this option many times. - -Options only valid for --create (or --update) --type flat: - --sort=/path/to/sort use external sort program (e.g. 
/usr/bin/sort) - --sort=BUILTIN use builtin sort routine - (default: /usr/bin/sort or BUILTIN) - --env=/path/to/env use env program to run sort (default: /usr/bin/env) - --env-arg=XXXXXX argument given to the env program (default: LC_ALL=C) - (multiple --env-arg=XXXXXX can be specified) - -Options only valid for --update: - --renew re-read all flatfiles and update whole index - -Backward compatibility: - --makeindex DIR/DBNAME - same as --create --type flat --location DIR --dbname DBNAME - --makeindexBDB DIR/DBNAME - same as --create --type bdb --location DIR --dbname DBNAME - --format=CLASS - instead of genbank|embl|fasta, specifing a class name is allowed - -Show namespaces: - #{$0} --show-namespaces [--location DIR --dbname DBNAME] [DIR/DBNAME] -or - #{$0} --show-namespaces [--format=CLASS] -or - #{$0} --show-namespaces --files file - -EOM - -end - - -def do_index(mode = :create) - case ARGV[0] - when /^\-\-?make/ - dbpath = ARGV[1] - args = ARGV[2..-1] - is_bdb = nil - when /^\-\-?make.*bdb/i - dbname = ARGV[1] - args = ARGV[2..-1] - is_bdb = Bio::FlatFileIndex::MAGIC_BDB - when /^\-\-create/, /^\-\-update/ - args = ARGV[1..-1] - else - usage - end - - options = {} - - while args.first =~ /^\-/ - case x = args.shift - - # OBDA stuff - - when /^\-\-?format$/ - args.shift - format = nil # throw this f*ckin' mess for auto detect :) - when /^\-\-?location/ - location = args.shift.chomp('/') - when /^\-\-?dbname/ - dbname = args.shift - when /^\-\-?(index)?type/ - indextype = args.shift - case indextype - when /bdb/ - is_bdb = Bio::FlatFileIndex::MAGIC_BDB - when /flat/ - is_bdb = nil - else - usage - end - - # BioRuby extension - - when /^\-\-?files/i - break - - when /^\-\-?format\=(.*)/i - format = $1 - - when /^\-\-?sort\=(.*)/i - options['sort_program'] = $1 - options['onmemory'] = nil - when /^\-\-?no\-?te?mp/i - options['onmemory'] = true - - when /^\-\-?env\=(.*)/i - options['env_program'] = $1 - - when /^\-\-?env-arg(?:ument)?\=(.*)/i - 
options['env_program_arguments'] ||= [] - options['env_program_arguments'].push $1 - - when /^\-\-?primary.*\=(.*)/i - options['primary_namespace'] = $1 - - when /^\-\-?add-secondary.*\=(.*)/i - unless options['additional_secondary_namespaces'] then - options['additional_secondary_namespaces'] = [] - end - options['additional_secondary_namespaces'] << $1 if $1.length > 0 - - when /^\-\-?secondary.*\=(.*)/i - unless options['secondary_namespaces'] then - options['secondary_namespaces'] = [] - end - options['secondary_namespaces'] << $1 if $1.length > 0 - - when /^\-\-?renew/ - options['renew'] = true - - else - $stderr.print "Warning: ignoring invalid option #{x.inspect}\n" - end - end - - dbpath = File.join(location, dbname) unless dbpath - if mode == :update then - Bio::FlatFileIndex::update_index(dbpath, format, options, *args) - else - Bio::FlatFileIndex::makeindex(is_bdb, dbpath, format, options, *args) - end -end - - -def do_search - dbname = nil - location = nil - names = [] - while x = ARGV.shift - case x - when /\A\-\-?search/i - #do nothing - when /\A\-\-?location/i - location = ARGV.shift.to_s.chomp('/') - when /\A\-\-?dbname/i - dbname = ARGV.shift - when /\A\-\-?name(?:space)?(?:\=(.+))?/i - if $1 then - names << $1 - elsif x = ARGV.shift - names << x - end - else - ARGV.unshift x - break - end - end - dbname = ARGV.shift unless dbname - dbname = File.join(location, dbname) unless location.to_s.empty? - db = Bio::FlatFileIndex.open(dbname) - ARGV.each do |key| - $stderr.print "Searching for \'#{key}\'...\n" - #r = db.search(key) - #$stderr.print "OK, #{r.size} entry found\n" - #if r.size > 0 then - # print r - #end - begin - if names.empty? 
then - r = db.include?(key) - else - r = db.include_in_namespaces?(key, *names) - end - rescue RuntimeError - $stderr.print "ERROR: #{$!}\n" - next - end - r = [] unless r - $stderr.print "OK, #{r.size} entry found\n" - r.each do |i| - print db.search_primary(i) - end - end - db.close -end - - -def do_show_namespaces - dbname = nil - location = nil - files = nil - format = nil - names = [] - while x = ARGV.shift - case x - when /\A\-\-?(show\-)?name(space)?s/i - #do nothing - when /\A\-\-?location/i - location = ARGV.shift.to_s.chomp('/') - when /\A\-\-?dbname/i - dbname = ARGV.shift - when /\A\-\-?format(?:\=(.+))?/i - if $1 then - format = $1 - elsif x = ARGV.shift - format = x - end - when /\A\-\-?files/i - files = ARGV - break - else - ARGV.unshift x - break - end - end - if files then - k = nil - files.each do |x| - k = Bio::FlatFile.autodetect_file(x) - break if k - end - if k then - $stderr.print "Format: #{k.to_s}\n" - format = k - else - $stderr.print "ERROR: couldn't determine file format\n" - return - end - end - $stderr.print "Namespaces: (first line: primary namespace)\n" - if format then - parser = Bio::FlatFileIndex::Indexer::Parser.new(format) - print parser.primary.name, "\n" - puts parser.secondary.keys - else - dbname = ARGV.shift unless dbname - dbname = File.join(location, dbname) unless location.to_s.empty? 
- db = Bio::FlatFileIndex.open(dbname) - puts db.namespaces - db.close - end -end - -if ARGV.size > 1 - case ARGV[0] - when /--make/, /--create/ - Bio::FlatFileIndex::DEBUG.out = true - do_index - when /--update/ - Bio::FlatFileIndex::DEBUG.out = true - do_index(:update) - when /\A\-\-?(show\-)?name(space)?s/i - do_show_namespaces - when /--search/ - do_search - else #default is search - do_search - end -else - usage -end - diff --git a/bin/br_biogetseq.rb b/bin/br_biogetseq.rb deleted file mode 100755 index 76c94de..0000000 --- a/bin/br_biogetseq.rb +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env ruby -# -# = biogetseq - OBDA sequence data retrieval (executable) -# -# Copyright:: Copyright (C) 2003 -# Toshiaki Katayama -# License:: The Ruby License -# -# $Id: br_biogetseq.rb,v 1.4 2007/04/05 23:35:39 trevor Exp $ -# - -require 'bio' - -def usage - print < [--namespace ] entry_id [entry_id] -END - exit 1 -end - -if ARGV.size < 3 - usage -end - -while ARGV.first =~ /^-/ - case ARGV.shift - when /^\-\-format/ - ARGV.shift - raise NotImplementedError - when /^\-\-dbname/ - dbname = ARGV.shift - when /^\-\-namespace/ - namespace = ARGV.shift - end -end - -reg = Bio::Registry.new -db = reg.get_database(dbname) -if namespace - db['namespace'] = namespace -end -ARGV.each do |entry| - puts db.get_by_id(entry) -end - diff --git a/bin/br_pmfetch.rb b/bin/br_pmfetch.rb deleted file mode 100755 index eb0f4ed..0000000 --- a/bin/br_pmfetch.rb +++ /dev/null @@ -1,422 +0,0 @@ -#!/usr/bin/env ruby -# -# = pmfetch - PubMed client -# -# Copyright:: Copyright (C) 2004, 2005 -# Toshiaki Katayama -# License:: The Ruby License -# -# $Id:$ -# - -require 'bio' - -PROG_VER = "Powered by BioRuby #{Bio::BIORUBY_VERSION_ID}" -PROG_NAME = File.basename($0) - - -require 'getoptlong' - - -### formatting - -class String - def fill(fill_column = 80, prefix = '', separater = ' ') - prefix = ' ' * prefix if prefix.is_a?(Integer) - maxlen = fill_column - prefix.length - raise "prefix is longer than 
fill_column" if maxlen <= 0 - - cursor = pos = 0 - lines = [] - while cursor < self.length - line = self[cursor, maxlen] - pos = line.rindex(separater) - pos = nil if line.length < maxlen - if pos - len = pos + separater.length - lines << self[cursor, len] - cursor += len - else - lines << self[cursor, maxlen] - cursor += maxlen - end - end - return lines.join("\n#{prefix}") - end -end - - -module Bio - class Reference - def report - if (num = @authors.size) > 10 - authors = "#{@authors[0]} et al. (#{num} authors)" - elsif num > 4 - sep = ',' * (num - 1) - authors = "#{@authors[0]}#{sep} #{@authors[-1]}" - else - authors = authors_join(' & ') - end - journal = "#{@journal} #{@year} #{@volume}(#{@issue}):#{@pages}" - - indent = 8 - prefix = ' ' * indent - [ - "#{@pages[/\d+/]}".ljust(indent) + "#{@title}".fill(78, indent), - authors, - "#{journal} [PMID:#{@pubmed}]", - ].join("\n#{prefix}") - end - end -end - - -class PMFetch - - class Examples < StandardError; end - class Version < StandardError; end - class Usage < StandardError; end - - ### default options - - def initialize - @format = 'rd' - @search_opts = { - 'retmax' => 20, - } - @query = nil - @query_opts = [] - @pmid_list_only = false - - pmfetch - end - - - ### main - - def pmfetch - begin - set_options - parse_options - check_query - rescue PMFetch::Examples - puts examples - exit - rescue PMFetch::Version - puts version - exit - rescue PMFetch::Usage - puts usage - exit - rescue GetoptLong::MissingArgument, GetoptLong::InvalidOption - puts usage - exit - end - - list = pm_esearch - - if list.empty? - ; - elsif @pmid_list_only - puts list - else - pm_efetch(list) - end - end - - - ### help - - def usage -%Q[ -Usage: #{PROG_NAME} [options...] "query string" - or #{PROG_NAME} --query "query string" [other options...] 
- -Options: - -q --query "genome AND virus" Query string for PubMed search - -t --title "mobile elements" Title of the article to search - -j --journal "genome res" Journal title to search - -v --volume # Journal volume to search - -i --issue # Journal issue to search - -p --page # First page number of the article to search - -a --author "Altschul SF" Author name to search - -m --mesh "SARS virus" MeSH term to search - -f --format bibtex Summary output format - --pmidlist Output only a list of PubMed IDs - -n --retmax # Number of articles to retrieve at the maximum - -N --retstart # Starting number of the articles to retrieve - -s --sort pub+date Sort method for the summary output - --reldate # Search articles published within recent # days - --mindate YYYY/MM/DD Search articles published after the date - --maxdate YYYY/MM/DD Search articles published before the date - --help Output this help, then exit - --examples Output examples, then exit - --version Output version number, then exit - -Formats: - endnote, medline, bibitem, bibtex, report, rd, - nature, science, genome_res, genome_biol, nar, current, trends, cell - -Sort: - author, journal, pub+date, page - -See the following pages for the PubMed search options: - http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html - http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html - -#{version} - -] - end - - def version - PROG_VER - end - - def examples - DATA.read.gsub('PMFetch', PROG_NAME) - end - - - private - - - ### options - - def set_options - @parser = GetoptLong.new - - @parser.set_options( - [ '--query', '-q', GetoptLong::REQUIRED_ARGUMENT ], - [ '--title', '-t', GetoptLong::REQUIRED_ARGUMENT ], - [ '--journal', '-j', GetoptLong::REQUIRED_ARGUMENT ], - [ '--volume', '-v', GetoptLong::REQUIRED_ARGUMENT ], - [ '--issue', '-i', GetoptLong::REQUIRED_ARGUMENT ], - [ '--page', '-p', GetoptLong::REQUIRED_ARGUMENT ], - [ '--author', '-a', GetoptLong::REQUIRED_ARGUMENT ], - [ '--mesh', '-m', 
GetoptLong::REQUIRED_ARGUMENT ], - [ '--format', '-f', GetoptLong::REQUIRED_ARGUMENT ], - [ '--pmidlist', GetoptLong::NO_ARGUMENT ], - [ '--retmax', '-n', GetoptLong::REQUIRED_ARGUMENT ], - [ '--retstart', '-N', GetoptLong::REQUIRED_ARGUMENT ], - [ '--sort', '-s', GetoptLong::REQUIRED_ARGUMENT ], - [ '--reldate', GetoptLong::REQUIRED_ARGUMENT ], - [ '--mindate', GetoptLong::REQUIRED_ARGUMENT ], - [ '--maxdate', GetoptLong::REQUIRED_ARGUMENT ], - [ '--examples', GetoptLong::NO_ARGUMENT ], - [ '--help', GetoptLong::NO_ARGUMENT ], - [ '--version', GetoptLong::NO_ARGUMENT ] - ) - end - - def parse_options - @parser.each_option do |optname, optarg| - case optname - when /--query/ - @query = optarg - when /--title/ - @query_opts << "#{optarg}[ti]" - when /--journal/ - @query_opts << "#{optarg}[ta]" - when /--volume/ - @query_opts << "#{optarg}[vi]" - when /--issue/ - @query_opts << "#{optarg}[ip]" - when /--page/ - @query_opts << "#{optarg}[pg]" - when /--author/ - @query_opts << "#{optarg}[au]" - when /--mesh/ - @query_opts << "#{optarg}[mh]" - when /--format/ - @format = optarg - when /--pmidlist/ - @pmid_list_only = true - when /--examples/ - raise PMFetch::Examples - when /--help/ - raise PMFetch::Usage - when /--version/ - raise PMFetch::Version - when /--sort/ - @sort = optarg - @search_opts["sort"] = @sort unless @sort == "page" - else - optname.delete!('-') - @search_opts[optname] = optarg - end - end - end - - - ### check query - - def check_query - p @query if $DEBUG - @query ||= ARGV.join(" ") unless ARGV.empty? - - p @query if $DEBUG - @query_str = [ @query, @query_opts ].flatten.compact.join(" AND ") - - p @query_str if $DEBUG - if @query_str.empty? 
- raise PMFetch::Usage - end - end - - - ### search - - def pm_esearch - return Bio::PubMed.esearch(@query_str, @search_opts) - end - - def pm_efetch(list) - entries = Bio::PubMed.efetch(list) - - if @format == 'medline' - medline_format(entries) - else - entries = parse_entries(entries) - if @sort == 'page' - entries = sort_entries(entries) - end - if @format == 'report' - report_format(entries) - else - other_format(entries) - end - end - end - - - ### output - - def medline_format(entries) - entries.each do |entry| - puts entry - puts '//' - end - end - - def parse_entries(entries) - entries.map { |entry| Bio::MEDLINE.new(entry) } - end - - def sort_entries(entries) - if RUBY_VERSION > "1.8.0" - entries.sort_by { |x| - [ x.journal, x.volume.to_i, x.issue.to_i, x.pages.to_i ] - } - else - entries.map { |x| - [ x.journal, x.volume.to_i, x.issue.to_i, x.pages.to_i, x ] - }.sort { |a, b| - a[0..3] <=> b[0..3] - }.map { |y| - y.pop - } - end - end - - def report_format(entries) - entries.each do |entry| - puts entry.reference.report - puts - end - end - - def other_format(entries) - entries.each do |entry| - puts entry.reference.format(@format) - puts - end - end - -end - - -PMFetch.new - - -__END__ - -= Examples : PubMed search - -These four lines will do the same job. - - % PMFetch transcription factor - % PMFetch "transcription factor" - % PMFetch --query "transcription factor" - % PMFetch -q "transcription factor" - - -Retrieve max 100 artiecles (20 is a NCBI's default) at a time, use --retmax as - - % PMFetch -q "transcription factor" --retmax 100 - -and, to retrieve next 100 articles, use --retstart as - - % PMFetch -q "transcription factor" --retmax 100 --retstart 100 - - -You can narrow the search target for an issue of the journal. - - % PMFetch --journal development --volume 131 --issue 3 transcription factor - - -Short options are also available. 
- - % PMFetch -j development -v 131 -i 3 transcription factor - - -Search articles indexed in PubMed within these 90 days. - - % PMFetch -q "transcription factor" --reldate 90 - - -Search articles indexed in PubMed during the period of 2001/04/01 to 2001/08/31 - - % PMFetch -q "transcription factor" --mindate 2001/04/01 --maxdate 2001/08/31 - - -Output format can be changed by --format option. - - % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f report - % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f rd - % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f endnote - % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f medline - % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f bibitem - % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f bibtex - % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f nature - % PMFetch -q "transcription factor" -j development -v 131 -i 3 -f science - - -Generate title listings for the journal report meeting (don't forget -to inclease the number of --retmax for fetching all titles). - - % PMFetch -f report -j development -v 131 -i 3 -n 100 - - -Search by author name. - - % PMFetch -a "Karlin S" - % PMFetch -a "Koonin EV" - - -Search by MeSH term. - - % PMFetch -m "computational biology" - % PMFetch -m "SARS virus" - - -Search by PubMed ID (PMID). - - % PMFetch 12345 - - -Output PMID only. - - % PMFetch --pmidlist tardigrada - - diff --git a/lib/bio/db/go.rb b/lib/bio/db/go.rb index affbe66..62f78ba 100644 --- a/lib/bio/db/go.rb +++ b/lib/bio/db/go.rb @@ -1,8 +1,9 @@ # # = bio/db/go.rb - Classes for Gene Ontology # -# Copyright:: Copyright (C) 2003 +# Copyright:: Copyright (C) 2003, 2010 # Mitsuteru C. Nakao +# R. Stephan # License:: The Ruby License # # $Id:$ @@ -174,8 +175,8 @@ class GO # = Bio::GO::GeneAssociation # $CVSROOT/go/gene-associations/gene_association.* # - # Data parser for the gene_association go annotation. 
- # See also the file format http://www.geneontology.org/doc/GO.annotation.html#file + # Data parser for the gene_association go annotation 1.0. + # See also the file format http://www.geneontology.org/GO.format.gaf-1_0.shtml # # == Example # @@ -324,6 +325,20 @@ class GO end # class GeneAssociation + # = Bio::GO::GeneAssociation2 + # + # Data parser for the gene_association go annotation 2.0. + # See also the file format http://www.geneontology.org/GO.format.gaf-2_0.shtml + # + # == Example + # + # mgi_data = File.open('gene_association.mgi').read + # mgi = Bio::GO::GeneAssociation2.parser(mgi_data) + # + # Bio::GO::GeneAssociation.parser(mgi_data) do |entry| + # p [entry.entry_id, entry.evidence, entry.goid] + # end + # class GeneAssociation2 < GeneAssociation # Iterator through all entries -- 1.5.5 >From c6729520a9faf985975fb7f5b93128cdbe31b0e8 Mon Sep 17 00:00:00 2001 From: R. Stephan Date: Tue, 3 Aug 2010 08:47:31 +0200 Subject: [PATCH] Add Phenote GOA file format parsing, GAF1 output --- lib/bio/db/go.rb | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 files changed, 60 insertions(+), 1 deletions(-) diff --git a/lib/bio/db/go.rb b/lib/bio/db/go.rb index 62f78ba..b265c7e 100644 --- a/lib/bio/db/go.rb +++ b/lib/bio/db/go.rb @@ -358,12 +358,71 @@ class GO end end - # Bio::GO::GeneAssociation#to_str -> a line of gene_association file. + # Bio::GO::GeneAssociation2#to_str -> a line of gene_association file. def to_str return [super.to_str, @annotation_extension, @gene_product_form_id].join("\t") end end + # = Bio::GO::Phenote_GOA + # + # Data parser for the Phenote file format which is similar to GAF1. + # We serialize to GAF1 format (to_str). 
+ # See http://www.phenote.org + # See also the file format http://www.geneontology.org/GO.format.gaf-1_0.shtml + # + # == Example + # + # mgi_data = File.open('gene_association.mgi').read + # mgi = Bio::GO::Phenote_GOA.parser(mgi_data) + # + # Bio::GO::Phenote_GOA.parser(mgi_data) do |entry| + # p.to_str + # end + + class Phenote_GOA < GeneAssociation + + # Retruns an Array of parsed Phenote file. + # Block is acceptable. + def self.parser(str) + if block_given? + str.each_line(DELIMITER) {|line| + next if /^DB\t/ =~ line + yield Phenote_GOA.new(line) + } + else + galist = [] + str.each_line(DELIMITER) {|line| + next if /^DB\t/ =~ line + galist << Phenote_GOA.new(line) + } + return galist + end + end + + # Assign fields of an entry (in a line) in Phenote format. + def assign(tmp) + @db = tmp[0] + @db_object_id = tmp[1] + @db_object_symbol = tmp[2] + @qualifier = tmp[3] # + @goid = tmp[4] + # We ignore Phenote's tmp[5] + @db_reference = ArrayOrString.new(tmp[6].split(/\|/)) # + @evidence = tmp[7] + @with = ArrayOrString.new(tmp[8].split(/\|/)) # + @aspect = tmp[9] + @db_object_name = tmp[10] # + @db_object_synonym = ArrayOrString.new(tmp[11].split(/\|/)) # + @db_object_type = tmp[12] + @taxon = tmp[13] # taxon:4932 + @date = tmp[14] # 20010118 + @assigned_by = tmp[15] + # We ignore Phenote's tmp[16-18] + end + end + + # # = Container class for files in geneontology.org/go/external2go/*2go. 
# # The line syntax is: -- 1.5.5 Ralf Stephan http://www.ark.in-berlin.de pub 1024D/C5114CB2 2009-06-07 [expires: 2011-06-06] Key fingerprint = 76AE 0D21 C06C CBF9 24F8 7835 1809 DE97 C511 4CB2 From ralf at ark.in-berlin.de Tue Aug 3 07:13:31 2010 From: ralf at ark.in-berlin.de (Ralf Stephan) Date: Tue, 3 Aug 2010 09:13:31 +0200 Subject: [BioRuby] [PATCH] GO annotations fixes and improvements In-Reply-To: <40095810-63F1-4919-8946-7CA3802B67B1@ark.in-berlin.de> References: <40095810-63F1-4919-8946-7CA3802B67B1@ark.in-berlin.de> Message-ID: <2CA96C77-E1BC-4958-AA2A-9CC97F346917@ark.in-berlin.de> Please ignore changes to bin/* in patch 0003 Sorry, ralf From ngoto at gen-info.osaka-u.ac.jp Tue Aug 3 16:13:27 2010 From: ngoto at gen-info.osaka-u.ac.jp (Naohisa GOTO) Date: Wed, 4 Aug 2010 01:13:27 +0900 Subject: [BioRuby] [PATCH] GO annotations fixes and improvements In-Reply-To: <40095810-63F1-4919-8946-7CA3802B67B1@ark.in-berlin.de> References: <40095810-63F1-4919-8946-7CA3802B67B1@ark.in-berlin.de> Message-ID: <20100803161327.BF9DE1CBC410@idnmail.gen-info.osaka-u.ac.jp> Hi Ralf, Thank you to send patches. I reviewed the patch. Please see the comments below. Some part of the patches will be merged soon, and some would be later, and some will not be merged. On Tue, 3 Aug 2010 08:58:16 +0200 Ralf Stephan wrote: > --- a/lib/bio/db/go.rb > +++ b/lib/bio/db/go.rb > @@ -186,6 +186,18 @@ class GO > # p [entry.entry_id, entry.evidence, entry.goid] > # end > # > + class ArrayOrString > + def initialize(arg) > + @var = arg > + end > + def join(char) > + if @var.instance_of? String > + then return @var > + else return @var.join(char) > + end > + end > + end I disagree with the class. For GAF, there is no need to introduce such new wrapper class. > @@ -253,30 +265,34 @@ class GO > > # > attr_reader :assigned_by > - > + > alias entry_id db_object_id > > > - # Parsing an entry (in a line) in the gene_association flatfile. 
> - def initialize(entry) > - tmp = entry.chomp.split(/\t/) > + # Assign fields of an entry (in a line). > + def assign(tmp) I don't like the method name. The word "assign" is used in the context of Gene Ontology Annotation, and it is better not to use the word for the class internal use to avoid confusion. > @@ -293,17 +309,15 @@ class GO > > # Bio::GO::GeneAssociation#to_str -> a line of gene_association file. > def to_str > - return [@db, @db_object_id, @db_object_symbol, @quialifier, @goid, > - @qualifier.join("|"), @evidence, @with.join("|"), @aspect, > + return [@db, @db_object_id, @db_object_symbol, @qualifier, @goid, > + @db_reference.join("|"), @evidence, @with.join("|"), @aspect, > @db_object_name, @db_object_synonym.join("|"), @db_object_type, > @taxon, @date, @assigned_by].join("\t") > end This seems bug fix. Thanks! By the way, I think it is good to change to_str to to_s, because the GeneAssociation class do not need to behave like a string. > --- a/lib/bio/db/go.rb > +++ b/lib/bio/db/go.rb > @@ -266,6 +266,11 @@ class GO > # > attr_reader :assigned_by > > + attr_reader :annotation_extension > + > + attr_reader :gene_product_form_id > + > + If you want to add GeneAssociation2 class, these new attributes should only be added in the GeneAssociation2 class. Alternatively, it is also good to support both GAF 1.0 and 2.0 in the GeneAssociation class. > alias entry_id db_object_id > > > @@ -286,6 +291,8 @@ class GO > @taxon = tmp[12] # taxon:4932 > @date = tmp[13] # 20010118 > @assigned_by = tmp[14] > + @annotation_extension = tmp[15] > + @gene_product_form_id = tmp[16] > end > > # Parsing an entry (in a line) in the gene_association flatfile. > @@ -317,6 +324,31 @@ class GO > > end # class GeneAssociation > > + class GeneAssociation2 < GeneAssociation > + > + # Iterator through all entries > + def self.parser(str) > + if block_given? 
> + str.each_line(DELIMITER) {|line| > + next if /^!/ =~ line > + yield GeneAssociation2.new(line) > + } > + else > + galist = [] > + str.each_line(DELIMITER) {|line| > + next if /^!/ =~ line > + galist << GeneAssociation2.new(line) > + } > + return galist > + end > + end > + > + # Bio::GO::GeneAssociation#to_str -> a line of gene_association file. > + def to_str > + return [super.to_str, @annotation_extension, @gene_product_form_id].join("\t") > + end > + end > + The role of the GeneAssociation2 class will be carefully considered. It might be merged to the GeneAssociation class. The method name "parser" may be changed, or the method might not be merged. > + class Phenote_GOA < GeneAssociation The name of the class would be changed, based on the format name used in the Phenote community. > + # Assign fields of an entry (in a line) in Phenote format. > + def assign(tmp) > + @db = tmp[0] > + @db_object_id = tmp[1] > + @db_object_symbol = tmp[2] > + @qualifier = tmp[3] # > + @goid = tmp[4] > + # We ignore Phenote's tmp[5] Please do not ignore. When supporting a new data format, all data should be parsed and stored unless it is technically very difficult. Naohisa Goto ngoto at gen-info.osaka-u.ac.jp / ng at bioruby.org From jimktrainslists at gmail.com Wed Aug 4 19:32:02 2010 From: jimktrainslists at gmail.com (James Keener) Date: Wed, 4 Aug 2010 15:32:02 -0400 Subject: [BioRuby] Consensus sequence In-Reply-To: References: Message-ID: At alignment.rb:118 there is this function: # Returns consensus character of the site. # If consensus is found, eturns a single-letter string. # If not, returns nil. def consensus_string(threshold = 1.0) return nil if self.size <= 0 return self[0] if self.sort.uniq.size == 1 h = Hash.new(0) self.each { |x| h[x] += 1 } total = self.size b = h.to_a.sort do |x,y| z = (y[1] <=> x[1]) z = (self.index(x[0]) <=> self.index(y[0])) if z == 0 z end if total * threshold <= b[0][1] then b[0][0] else nil end end Now, I have 2 questions about it. 
1) Why is it sorting? Shouldn't it use a linear search? 2) How can the count of the greatest residue (b[0][1]) be larger than or equal to the total number of residues? Also, there is a whole set of functions I am adding (group entropy and some book keeping/housecleaning things) and would like to commit them back. What is the best way to commit them back? Jim From tomoakin at kenroku.kanazawa-u.ac.jp Thu Aug 5 00:28:04 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Thu, 5 Aug 2010 09:28:04 +0900 Subject: [BioRuby] Consensus sequence In-Reply-To: References: Message-ID: Hi, > 2) How can the count of the greatest residue (b[0][1]) be larger > than or equal to the total number of residues? It is obvious that the count of the greatest residue is equal to the total number of residues if all the residues are identical. Presumably, the parameter threshold should be 0 <= threshold <= 1.0 > 1) Why is it sorting? Shouldn't it use a linear search? I really don't know. So this is just my feeling, but this could be simplicity, readability, and extensibility... Did you compare the performance with linear search? -- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan From ngoto at gen-info.osaka-u.ac.jp Thu Aug 5 06:16:41 2010 From: ngoto at gen-info.osaka-u.ac.jp (Naohisa GOTO) Date: Thu, 5 Aug 2010 15:16:41 +0900 Subject: [BioRuby] Consensus sequence In-Reply-To: References: Message-ID: <20100805061641.B9C2D1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> Hi, On Thu, 5 Aug 2010 09:28:04 +0900 Tomoaki NISHIYAMA wrote: > Hi, > > > 2) How can the count of the greatest residue (b[0][1]) be larger > > than or equal to the total number of residues? > > > It is obvious that the count of the greatest residue is equal to the > total number of residues > if all the residues are identical. 
> > Presumably, the parameter threshold should be > 0 <= threshold <= 1.0 In addition, the behavior is undefined when the threshold is out of the range. > > 1) Why is it sorting? Shouldn't it use a linear search? I forget what I was thinking when I wrote it in 2003. > I really don't know. So this is just my feeling, > but this could be simplicity, readability, and extensibility... > Did you compare the performance with linear search? For simplicity and readability, using Enumerable#max (Array#max) seems to be the straightforward way, though I don't know much about the performance. On Wed, 4 Aug 2010 15:32:02 -0400 James Keener wrote: > Also, there is a whole set of functions I am adding (group entropy and some book keeping/housecleaning things) and would like to commit them back. What is the best way to commit them back? Please create your fork on GitHub and push them to the GitHub fork. It seems http://github.com/fredrikj/bioruby also makes some modifications to the alignment classes (in lib/bio/appl/seala.rb in the repository). -- Naohisa Goto ngoto at gen-info.osaka-u.ac.jp / ng at bioruby.org From sararayburn at gmail.com Thu Aug 5 17:33:14 2010 From: sararayburn at gmail.com (Sara Rayburn) Date: Thu, 5 Aug 2010 12:33:14 -0500 Subject: [BioRuby] GSoC project update Message-ID: Hi all, This week I've implemented the generalized version of the speciation/duplication inference algorithm. This extension allows the algorithm to be used on trees with non-binary nodes, however it is based on the unverified implementation found in the java Forester package. My remaining goals for the rest of the project time are 1) preparing the code for merging with the main bioruby repository. 2) trying to construct an informal proof of correctness of the generalized sdi algorithm.
Thanks, Sara Rayburn From missy at be.to Fri Aug 6 02:53:15 2010 From: missy at be.to (MISHIMA, Hiroyuki) Date: Fri, 06 Aug 2010 11:53:15 +0900 Subject: [BioRuby] Indexing fasta file with Ruby 1.9.1 In-Reply-To: <30c617b1-dcc4-42b9-9ffe-498fc663708b@ingm.it> References: <30c617b1-dcc4-42b9-9ffe-498fc663708b@ingm.it> Message-ID: <4C5B791B.6070307@be.to> Hi Raoul and all, Raoul Bonnal wrote (2010/07/30 20:28): > Caught error: # "bta-miR-3596":String> in "mature.fa" position 1178667 The following patch seems to work... --- ./indexer-orig.rb 2010-08-06 11:40:52.000000000 +0900 +++ /usr/local/lib/ruby/gems/1.9.1/gems/bio-1.4.0/lib/bio/io/flatfile/indexer.rb 2010-08-06 11:38:53.000000000 +0900 @@ -155,8 +155,15 @@ def parse_secondary self.secondary.each do |x| p = x.proc.call(@entry) - p.each do |y| - yield x.name, y if y.length > 0 + + if p.respond_to? :each + p.each do |y| + yield x.name, y if y.length > 0 + end + else + p.each_line do |y| + yield x.name, y if y.length > 0 + end end end end This is typical incompatibility between Ruby-1.8 and -1.9. In Ruby-1.9, String#each should be replaced by String#each_line. irb-1.8> "abc\ndef".each {|l| p l} "abc\n" "def" => "abc\ndef" irb-1.9> "abc\ndef".each {|l| p l} NoMethodError: undefined method `each' for "abc\ndef":String from (irb):1 from /usr/local/bin/irb-1.9:12:in `
' irb-1.9> "abc\ndef".each_line {|l| p l} "abc\n" "def" => "abc\ndef" Because the "p" variable can be String or Array in the "parse_secondary" method, I used "respond_to?". I do not know whether this instant patch is the right way or not. Sincerely yours, Hiro -- MISHIMA, Hiroyuki, DDS, Ph.D. COE Research Fellow Department of Human Genetics Nagasaki University Graduate School of Biomedical Sciences From anurag08priyam at gmail.com Sun Aug 8 15:32:57 2010 From: anurag08priyam at gmail.com (Anurag Priyam) Date: Sun, 8 Aug 2010 21:02:57 +0530 Subject: [BioRuby] development updates? Message-ID: Today I had a chat with Jan when I found that a manuscript on BioRuby is in progress. I do not remember it being discussed on the list. Perhaps BioRuby's development process needs to be a little more transparent, in the sense that everybody should be kept in the loop for a more synergic development process. Also, the chance of new contributors chiming in increases with a more active (updated) list. I would suggest that the list be constantly updated : 1. short and long term goals - targets for minor and major releases, prioritizing bugs or feature requests or design decisions. 2. what is cooking - each developer could update the list on what he/she is working at, and/or a fortnightly or a monthly update on the cumulative development status (how much of the target has been achieved and stuff) 3. important decisions and changes. Perhaps, a development specific list can be set up to keep the user and the developer space segregated. A development list can also be attached to the issue tracker so that developers are automatically updated on new bugs and feature requests. Or, a blog can be set up where regular committers have posting access. P.S : The idea behind this mail is to spark some discussion on a more efficient software development culture and hopefully adopt it :). -- Anurag Priyam, 3rd Year Undergraduate, Department of Mechanical Engineering, IIT Kharagpur.
+91-9775550642 From anurag08priyam at gmail.com Sun Aug 8 16:09:56 2010 From: anurag08priyam at gmail.com (Anurag Priyam) Date: Sun, 8 Aug 2010 21:39:56 +0530 Subject: [BioRuby] ohloh Message-ID: I just added http://github.com/bioruby/bioruby.git master as an enlistment to Ohloh's BioRuby page. -- Anurag Priyam, 3rd Year Undergraduate, Department of Mechanical Engineering, IIT Kharagpur. +91-9775550642 From anurag08priyam at gmail.com Mon Aug 9 05:31:13 2010 From: anurag08priyam at gmail.com (Anurag Priyam) Date: Mon, 9 Aug 2010 11:01:13 +0530 Subject: [BioRuby] development updates? In-Reply-To: <20100808222310.GA15100@thebird.nl> References: <20100808222310.GA15100@thebird.nl> Message-ID: > > What does the fact that *you* are not aware of a manuscript have to do > with the development process? I miss the connection. > I just thought it would be better if *all* the developers( important or less important ) are updated on what is happening. I see it happening on the few other lists I have been subscribed to all the time. > As BioRuby is an OSS project, feel free to take the lead. You > understand OSS development, right? > Well to be very frank this has been my very first attempt at OSS contribution. So, may be I really do not understand it very well. > Talk is cheap. Making stuff happen - that is what counts. > FYI, that is exactly what I am trying to do. I see very different culture in other development teams and I find it better. I was trying to bring some of it in. If it is unwelcome, I have no issues. Out of the things I suggested, there is only one thing that I can do in my capacity : update the list on what I have been doing, and I do that. -- Anurag Priyam, 3rd Year Undergraduate, Department of Mechanical Engineering, IIT Kharagpur. +91-9775550642 From pjotr.public14 at thebird.nl Mon Aug 9 06:43:32 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Mon, 9 Aug 2010 08:43:32 +0200 Subject: [BioRuby] development updates?
In-Reply-To: <20100809060025.GA17390@thebird.nl> References: <20100808222310.GA15100@thebird.nl> <20100809060025.GA17390@thebird.nl> Message-ID: <20100809064332.GA19790@thebird.nl> Sorry, that was a bit rabid. Basically you think we should be organised differently, and/or share information differently. On the mailing list you'll find a history of that. Every OSS project is different. There is no single way. If you want it differently, take initiative - rather than talking about 'culture' in other projects. BioRuby has grown this way, and it works for us. As I see it, all major developments have been discussed via the list. The latest developments are GSoC, plugins and BioLib BAM/SAM support (which is on the biolib mailing list). Again, I don't mind you want it different. But the way to change things is to act. With actions gain you respect. And most respect is gained by writing code. That is true in every OSS project I know. Everyone can criticise, few really change things. Pj. On Mon, Aug 09, 2010 at 08:00:25AM +0200, Pjotr Prins wrote: > First you steal our announcement of a paper - I am sure Jan did not > intend you to blurt that out on the list. Second you breach my trust > by quoting a private message on the list. Third, you assume our > development process is not transparent. Based on what? > > Maybe there is a simple answer: we don't need to communicate that > much. > > BioRuby is not a centrally run project, business or organisation. > BioRuby is OSS. Feel free to run with the project. Fork, code, > document, organise, whatever. > > So far, I see talk and hand waving. If you want to organise something, > be factual and concrete. Actions speak louder than words. You also may > want to read the history of the mailing list. > > I suggest you earn your stripes with getting your code accepted to > BioRuby, first. That is a pretty steep hill anyway. Last time I > checked your code. > > And I suggest at least two apologies, if you want further responses > from me. 
> > Pj > > On Mon, Aug 09, 2010 at 11:01:13AM +0530, Anurag Priyam wrote: > > > > > > What does the fact that *you* are not aware of a manuscript have to do > > > with the development process? I miss the connection. > > > > > > > I just thought it would be better if *all* the developers( important > > or less important ) are updated on what is happening. I see it > > happening on the few other lists I have been subscribed to all the > > time. > > > > > As BioRuby is an OSS project, feel free to take the lead. You > > > understand OSS development, right? > > > > > > > Well to be very frank this has been my very first attempt at OSS > > contribution. So, may be I really do not understand it very well. > > > > > Talk is cheap. Making stuff happen - that is what counts. > > > > > > > FYI, that is exactly what I am trying to do. I see very different > > culture in other development teams and I find it better. I was trying > > to bring some of it in. If it is unwelcome, I have no issues. > > > > Out of the things I suggested, there is only one thing that I can do > > in my capacity : update the list on what I have been doing, and I do > > that. > > > > -- > > Anurag Priyam, > > 3rd Year Undergraduate, > > Department of Mechanical Engineering, > > IIT Kharagpur. > > +91-9775550642 From andrew.j.grimm at gmail.com Mon Aug 9 07:17:17 2010 From: andrew.j.grimm at gmail.com (Andrew Grimm) Date: Mon, 9 Aug 2010 17:17:17 +1000 Subject: [BioRuby] development updates? In-Reply-To: References: <20100808222310.GA15100@thebird.nl> <20100809060025.GA17390@thebird.nl> <20100809064332.GA19790@thebird.nl> Message-ID: While there may be a difference between making suggestions and doing stuff, there's also a difference between making suggestions and personally criticising people. I think that personal criticisms should be avoided when possible. 
I am no longer surprised when Rails people behave in an incivil way, but I hoped that non-Rails ruby people would still follow "Matz is nice, so we are nice". Andrew Grimm On Mon, Aug 9, 2010 at 4:43 PM, Pjotr Prins wrote: > Sorry, that was a bit rabid. Basically you think we should be > organised differently, and/or share information differently. On the > mailing list you'll find a history of that. > > Every OSS project is different. There is no single way. If you want > it differently, take initiative - rather than talking about 'culture' > in other projects. > > BioRuby has grown this way, and it works for us. As I see it, all > major developments have been discussed via the list. The latest > developments are GSoC, plugins and BioLib BAM/SAM support (which is > on the biolib mailing list). > > Again, I don't mind you want it different. But the way to change > things is to act. With actions gain you respect. And most respect is > gained by writing code. That is true in every OSS project I know. > > Everyone can criticise, few really change things. > > Pj. > > On Mon, Aug 09, 2010 at 08:00:25AM +0200, Pjotr Prins wrote: >> First you steal our announcement of a paper - I am sure Jan did not >> intend you to blurt that out on the list. Second you breach my trust >> by quoting a private message on the list. Third, you assume our >> development process is not transparent. Based on what? >> >> Maybe there is a simple answer: we don't need to communicate that >> much. >> >> BioRuby is not a centrally run project, business or organisation. >> BioRuby is OSS. Feel free to run with the project. Fork, code, >> document, organise, whatever. >> >> So far, I see talk and hand waving. If you want to organise something, >> be factual and concrete. Actions speak louder than words. You also may >> want to read the history of the mailing list. >> >> I suggest you earn your stripes with getting your code accepted to >> BioRuby, first. That is a pretty steep hill anyway. 
Last time I >> checked your code. >> >> And I suggest at least two apologies, if you want further responses >> from me. >> >> Pj >> >> On Mon, Aug 09, 2010 at 11:01:13AM +0530, Anurag Priyam wrote: >> > > >> > > What does the fact that *you* are not aware of a manuscript have to do >> > > with the development process? I miss the connection. >> > > >> > >> > I just thought it would be better if *all* the developers( important >> > or less important ) are updated on what is happening. I see it >> > happening on the few other lists I have been subscribed to all the >> > time. >> > >> > > As BioRuby is an OSS project, feel free to take the lead. You >> > > understand OSS development, right? >> > > >> > >> > Well to be very frank this has been my very first attempt at OSS >> > contribution. So, may be I really do not understand it very well. >> > >> > > Talk is cheap. Making stuff happen - that is what counts. >> > > >> > >> > FYI, that is exactly what I am trying to do. I see very different >> > culture in other development teams and I find it better. I was trying >> > to bring some of it in. If it is unwelcome, I have no issues. >> > >> > Out of the things I suggested, there is only one thing that I can do >> > in my capacity : update the list on what I have been doing, and I do >> > that. >> > >> > -- >> > Anurag Priyam, >> > 3rd Year Undergraduate, >> > Department of Mechanical Engineering, >> > IIT Kharagpur. >> > +91-9775550642 > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby > From anurag08priyam at gmail.com Mon Aug 9 07:49:19 2010 From: anurag08priyam at gmail.com (Anurag Priyam) Date: Mon, 9 Aug 2010 13:19:19 +0530 Subject: [BioRuby] development updates? 
In-Reply-To: <20100809060025.GA17390@thebird.nl> References: <20100808222310.GA15100@thebird.nl> <20100809060025.GA17390@thebird.nl> Message-ID: > First you steal our announcement of a paper - I am sure Jan did not > intend you to blurt that out on the list. Oops. Extremely sorry about that. > Second you breach my trust > by quoting a private message on the list. I had intended my mail to be *open* so it perfectly seemed apt to me that others know what you think about it. I am extremely sorry that you took it otherwise. I did not mean any offense. > Third, you assume our > development process is not transparent. Based on what? As I said, I like things to be more open. Again, I was just suggesting that doing something *might* be better. I am not forcing anyone to adopt it, not that I can. > > Maybe there is a simple answer: we don't need to communicate that > much. Maybe. > > So far, I see talk and hand waving. If you want to organise something, > be factual and concrete. Actions speak louder than words. You also may > want to read the history of the mailing list. > > I suggest you earn your stripes with getting your code accepted to > BioRuby, first. That is a pretty steep hill anyway. Last time I > checked your code. > Well, I am definitely embarrassed that you think so. I can only hope that it changes as I contribute more. Last time you checked my code it definitely worked. I see no wrong in first writing something that works and then make it more correct or elegant. -- Anurag Priyam, 3rd Year Undergraduate, Department of Mechanical Engineering, IIT Kharagpur. +91-9775550642 From pjotr.public14 at thebird.nl Mon Aug 9 07:50:05 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Mon, 9 Aug 2010 09:50:05 +0200 Subject: [BioRuby] development updates? 
In-Reply-To: References: <20100808222310.GA15100@thebird.nl> <20100809060025.GA17390@thebird.nl> <20100809064332.GA19790@thebird.nl> Message-ID: <20100809075005.GA21017@thebird.nl> On Mon, Aug 09, 2010 at 05:17:17PM +1000, Andrew Grimm wrote: > While there may be a difference between making suggestions and doing > stuff, there's also a difference between making suggestions and > personally criticising people. I think that personal criticisms should > be avoided when possible. I did not send a personal criticism reply to the list originally. I sent Anurag a private response. *He* quoted it on the mailing list. I am allowed to be critical on that. Anyway, no worries. Chapter closed, as far as I am concerned. Anurag can reply privately, if he wants. Pj. From pjotr.public14 at thebird.nl Mon Aug 9 07:53:52 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Mon, 9 Aug 2010 09:53:52 +0200 Subject: [BioRuby] development updates? In-Reply-To: References: <20100808222310.GA15100@thebird.nl> <20100809060025.GA17390@thebird.nl> Message-ID: <20100809075352.GC21017@thebird.nl> Apologies accepted. Pj. On Mon, Aug 09, 2010 at 01:19:19PM +0530, Anurag Priyam wrote: > > First you steal our announcement of a paper - I am sure Jan did not > > intend you to blurt that out on the list. > > Oops. Extremely sorry about that. > > > Second you breach my trust > > by quoting a private message on the list. > > I had intended my mail to be *open* so it perfectly seemed apt to me > that others know what you think about it. I am extremely sorry that > you took it otherwise. I did not mean any offense. > > > Third, you assume our > > development process is not transparent. Based on what? > > As I said, I like things to be more open. Again, I was just > suggesting that doing something *might* be better. I am not forcing > anyone to adopt it, not that I can. > > > > > Maybe there is a simple answer: we don't need to communicate that > > much. > > Maybe.
> > > > > So far, I see talk and hand waving. If you want to organise something, > > be factual and concrete. Actions speak louder than words. You also may > > want to read the history of the mailing list. > > > > I suggest you earn your stripes with getting your code accepted to > > BioRuby, first. That is a pretty steep hill anyway. Last time I > > checked your code. > > > > Well, I am definitely embarrassed that you think so. I can only hope > that it changes as I contribute more. Last time you checked my code it > definitely worked. I see no wrong in first writing something that > works and then make it more correct or elegant. > > -- > Anurag Priyam, > 3rd Year Undergraduate, > Department of Mechanical Engineering, > IIT Kharagpur. > +91-9775550642 From anurag08priyam at gmail.com Mon Aug 9 07:57:34 2010 From: anurag08priyam at gmail.com (Anurag Priyam) Date: Mon, 9 Aug 2010 13:27:34 +0530 Subject: [BioRuby] development updates? In-Reply-To: <20100809064332.GA19790@thebird.nl> References: <20100808222310.GA15100@thebird.nl> <20100809060025.GA17390@thebird.nl> <20100809064332.GA19790@thebird.nl> Message-ID: > Sorry, that was a bit rabid. No issues :). -- Anurag Priyam, 3rd Year Undergraduate, Department of Mechanical Engineering, IIT Kharagpur. +91-9775550642 From andrew.j.grimm at gmail.com Mon Aug 9 11:05:32 2010 From: andrew.j.grimm at gmail.com (Andrew Grimm) Date: Mon, 9 Aug 2010 21:05:32 +1000 Subject: [BioRuby] development updates? In-Reply-To: <20100809075005.GA21017@thebird.nl> References: <20100808222310.GA15100@thebird.nl> <20100809060025.GA17390@thebird.nl> <20100809064332.GA19790@thebird.nl> <20100809075005.GA21017@thebird.nl> Message-ID: No worries, mate! ;) Andrew On Mon, Aug 9, 2010 at 5:50 PM, Pjotr Prins wrote: > On Mon, Aug 09, 2010 at 05:17:17PM +1000, Andrew Grimm wrote: >> While there may be a difference between making suggestions and doing >> stuff, there's also a difference between making suggestions and >> personally criticising people. 
I think that personal criticisms should >> be avoided when possible. > > I did not sent a personal criticism reply to the list originally. I > sent Anurag a private response. ?*He* quoted it on the mailing list. I > am allowed to be critical on that. > > Anyway, no worries. Chapter closed, as far as I am concerned. Anurag > can reply privately, if he wants. > > Pj. > From rutgeraldo at gmail.com Mon Aug 9 13:45:26 2010 From: rutgeraldo at gmail.com (Rutger Vos) Date: Mon, 9 Aug 2010 21:45:26 +0800 Subject: [BioRuby] development updates? In-Reply-To: References: Message-ID: I certainly appreciate Pjotr's point that whoever writes the code calls the shots - and since I have written zero code my opinion carries no weight. That said, now that this thread was started there is probably no harm in trying to address the points in the original post, if only for posterity's sake. I would start by saying that I commend Anurag for his (youthful, if he doesn't mind) enthusiasm. It seems to me that the points he raises are already being addressed in a way that evidently suits the bioruby community: > I would suggest that the list be constantly updated : > 1. short and long term goals - targets for minor and major releases, > prioritizing bugs or feature requests or design decisions. > There is currently on the bioruby homepage a link to a bug tracker and a feature request tracker. There is also one on the github page. Adding another technological fix (some tracker tool) will do nothing but fragment things. As far as design decisions are concerned, none of the OBF(-like) projects I follow are really designed by committee, so in general there are no formal decisions that need to be communicated to lower echelons. > 2. 
what is cooking - each developer could update the list on what he/she is > working at, or/and a fortnightly or a monthly update on the cummulative > development status( how much of the target has been achieved and stuff ) > On the bioruby homepage are links to a number of blogs by people who very generously take the time to record what they do so that others can learn from that and use it. That is probably about as good as it's gonna get given the time constraints that researcher/programmers are under. > 3. important decisions and changes. > I doubt that this is how things work. People use bioruby to get their work done, and they add things if they are deemed useful. This isn't the kind of open source project where the user base vastly outnumbers the developer community (e.g. apache, firefox, and so on) so that there would need to be impressive "milestones" and cool sounding code names sent through formal lines of communication. > Perhaps, a development specific list can be setup to keep the user and the > developer space segregated. A development list can also be attached to the > issue tracker so that developers are automatically updated on new bugs and > feature requests. > I have been subscribed to a number of -guts at example.org mailing lists to which bugs and commits are automatically piped. They have been of dubious value - I filter the messages automatically from my inbox to some folder which I then never visit, it turns out. In any case, to say that a list can be setup is to say that a real person should spend time setting up a list. Maybe that is not necessary given that source code repositories and bug trackers have RSS feeds. > Or, a blog can be setup where regular commiters have posting access. > Again, this is something that a real person would need to do. 
To the extent that I know the people in the bioruby core (I only "know" a couple) I know that they are all people with heavy academic work loads - perhaps even (heaven forbid) with departmental duties on top of that. They do what they can, I assume they already pretty much exhaust their copious amounts of spare time as it is :) > P.S : The idea behind this mail is to spark some discussion on a more > efficient software development culture and hopefully adopt it :). > I don't think cultures are ever adopted unless there is very, very forceful management that imposes it (which of course there isn't for OBF projects) or unless it grows around certain key people with long term involvement in a project. But this is probably just a roundabout way of repeating Pjotr's point that the way to change things is to act. Rutger -- Dr. Rutger A. Vos School of Biological Sciences Philip Lyle Building, Level 4 University of Reading Reading RG6 6BX United Kingdom Tel: +44 (0) 118 378 7535 http://www.nexml.org http://rutgervos.blogspot.com From cjfields at illinois.edu Mon Aug 9 15:59:15 2010 From: cjfields at illinois.edu (Chris Fields) Date: Mon, 9 Aug 2010 10:59:15 -0500 Subject: [BioRuby] development updates? In-Reply-To: References: Message-ID: Just want to add my 2c, from the bioperl perspective. The vast majority of bioperl devs have other jobs or obligations, so when they develop for whatever Bio* it tends to be to scratch a particular itch (fulfill something they need), and not much beyond that. With BioPerl the tendency is that whoever codes first wins; talking about design leads to bikeshedding and tends to go nowhere unless you have something to point to (just drop in on the threads on the perl6 mail list sometime for many examples of this). 
Yes, sometimes we have conflicts of opinion and get into spats, sometimes new code clobbers tests (git/github helps here), and sometime we even get a pretty decent design out of it (sometimes not :), but regardless every time we have code to work with or something to point to for our efforts. It's good to discuss things, but you have to produce something of value at the end of the day. So, just to reiterate what Rutger and Pjotr are saying, actions speak volumes. Take up the reins on something, get involved, and actually do something that benefits the project you are interested in. chris On Aug 9, 2010, at 8:45 AM, Rutger Vos wrote: > I certainly appreciate Pjotr's point that whoever writes the code calls the > shots - and since I have written zero code my opinion carries no weight. > That said, now that this thread was started there is probably no harm in > trying to address the points in the original post, if only for posterity's > sake. I would start by saying that I commend Anurag for his (youthful, if he > doesn't mind) enthusiasm. It seems to me that the points he raises are > already being addressed in a way that evidently suits the bioruby community: > > >> I would suggest that the list be constantly updated : >> 1. short and long term goals - targets for minor and major releases, >> prioritizing bugs or feature requests or design decisions. >> > > There is currently on the bioruby homepage a link to a bug tracker and a > feature request tracker. There is also one on the github page. Adding > another technological fix (some tracker tool) will do nothing but fragment > things. > > As far as design decisions are concerned, none of the OBF(-like) projects I > follow are really designed by committee, so in general there are no formal > decisions that need to be communicated to lower echelons. > > >> 2. 
what is cooking - each developer could update the list on what he/she is >> working at, or/and a fortnightly or a monthly update on the cummulative >> development status( how much of the target has been achieved and stuff ) >> > > On the bioruby homepage are links to a number of blogs by people who very > generously take the time to record what they do so that others can learn > from that and use it. That is probably about as good as it's gonna get given > the time constraints that researcher/programmers are under. > > >> 3. important decisions and changes. >> > > I doubt that this is how things work. People use bioruby to get their work > done, and they add things if they are deemed useful. This isn't the kind of > open source project where the user base vastly outnumbers the developer > community (e.g. apache, firefox, and so on) so that there would need to be > impressive "milestones" and cool sounding code names sent through formal > lines of communication. > > >> Perhaps, a development specific list can be setup to keep the user and the >> developer space segregated. A development list can also be attached to the >> issue tracker so that developers are automatically updated on new bugs and >> feature requests. >> > > I have been subscribed to a number of -guts at example.org mailing > lists to which bugs and commits are automatically piped. They have been of > dubious value - I filter the messages automatically from my inbox to some > folder which I then never visit, it turns out. In any case, to say that a > list can be setup is to say that a real person should spend time setting up > a list. Maybe that is not necessary given that source code repositories and > bug trackers have RSS feeds. > > >> Or, a blog can be setup where regular commiters have posting access. >> > > Again, this is something that a real person would need to do. 
To the extent > that I know the people in the bioruby core (I only "know" a couple) I know > that they are all people with heavy academic work loads - perhaps even > (heaven forbid) with departmental duties on top of that. They do what they > can, I assume they already pretty much exhaust their copious amounts of > spare time as it is :) > > >> P.S : The idea behind this mail is to spark some discussion on a more >> efficient software development culture and hopefully adopt it :). >> > > I don't think cultures are ever adopted unless there is very, very forceful > management that imposes it (which of course there isn't for OBF projects) or > unless it grows around certain key people with long term involvement in a > project. But this is probably just a roundabout way of repeating Pjotr's > point that the way to change things is to act. > > Rutger > > -- > Dr. Rutger A. Vos > School of Biological Sciences > Philip Lyle Building, Level 4 > University of Reading > Reading > RG6 6BX > United Kingdom > Tel: +44 (0) 118 378 7535 > http://www.nexml.org > http://rutgervos.blogspot.com > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby From pjotr.public14 at thebird.nl Mon Aug 9 17:28:51 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Mon, 9 Aug 2010 19:28:51 +0200 Subject: [BioRuby] development updates? In-Reply-To: References: Message-ID: <20100809172851.GA28959@thebird.nl> On Mon, Aug 09, 2010 at 10:59:15AM -0500, Chris Fields wrote: > Just want to add my 2c, from the bioperl perspective. The vast > majority of bioperl devs have other jobs or obligations, so when > they develop for whatever Bio* it tends to be to scratch a > particular itch (fulfill something they need), and not much beyond > that. 
The few people that go beyond scratching an itch, are the ones we should really treasure as they get little in return (apart from criticism). In particular people like Naohisa (BioRuby) and Chris (BioPerl), who work on testing and integration, get really very little recognition for their efforts. Likewise people like Hilmar, Rutger and Christian, who put in effort guiding students in GSoC and get very little recognition for their work. This is a good place to thank you. I notice what you do :). And I think you are making the world a better place. Thank you for going beyond your remit and spending your free time. Pj. From hlapp at drycafe.net Tue Aug 10 01:45:51 2010 From: hlapp at drycafe.net (Hilmar Lapp) Date: Mon, 9 Aug 2010 21:45:51 -0400 Subject: [BioRuby] development updates? In-Reply-To: References: Message-ID: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> Hi Anurag: On Aug 8, 2010, at 11:32 AM, Anurag Priyam wrote: > Today I had a chat with Jan when I found that a manuscript on > BioRuby is > under progress. I do not remember it being discussed on the list. I'm curious - why do you think should it have been discussed on the list (assuming that your point here is that you think that's what should have happened)? -hilmar -- =========================================================== : Hilmar Lapp -:- Durham, NC -:- hlapp at drycafe dot net : =========================================================== From anurag08priyam at gmail.com Tue Aug 10 04:02:37 2010 From: anurag08priyam at gmail.com (Anurag Priyam) Date: Tue, 10 Aug 2010 09:32:37 +0530 Subject: [BioRuby] development updates? In-Reply-To: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> References: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> Message-ID: > I'm curious - why do you think should it have been discussed on the list > (assuming that your point here is that you think that's what should have > happened)? 
Well, I have been lurking around in Debian, and Git mailing list for a while and nothing happens off the list there. I kind of like that. Actually I was expecting something like that here too. -- Anurag Priyam, 3rd Year Undergraduate, Department of Mechanical Engineering, IIT Kharagpur. +91-9775550642 From anurag08priyam at gmail.com Tue Aug 10 04:07:58 2010 From: anurag08priyam at gmail.com (Anurag Priyam) Date: Tue, 10 Aug 2010 09:37:58 +0530 Subject: [BioRuby] development updates? In-Reply-To: References: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> Message-ID: >> I'm curious - why do you think should it have been discussed on the list >> (assuming that your point here is that you think that's what should have >> happened)? > > Well, I have been lurking around in Debian, and Git mailing list for a > while and nothing happens off the list there. I kind of like that. > Actually I was expecting something like that here too. That actually was the basis of my entire mail. But after Rutger's response I kind of understand what I was not seeing before. So, it is fine :). -- Anurag Priyam, 3rd Year Undergraduate, Department of Mechanical Engineering, IIT Kharagpur. +91-9775550642 From georgkam at gmail.com Tue Aug 10 07:37:11 2010 From: georgkam at gmail.com (George Githinji) Date: Tue, 10 Aug 2010 10:37:11 +0300 Subject: [BioRuby] We have 2 and 1/2 days of spare time. Can we help? Message-ID: Hi all, The Regional Students Group for Eastern Africa (RSG-EA) is one of the grass-root level bodies of the International Society for Computational Biology Student Council (ISCB-SC). The group has membership from ten countries namely Burundi, Democratic Republic of Congo, Djibouti, Eritrea, Ethiopia, Kenya, Rwanda, Somalia, Tanzania and Uganda. Recently we proposed to organize a biohakathon three day event to: 1) Learn how to collaborate on bioinformatics programming projects using open source tools. 2) Forge an East African bioinformatics programming community. 
3) Contribute a module/code to Bioruby library. The event has been sponsored by a grant from ISCB and ILRI/Beca bioinformatics platform in Nairobi, Kenya. We would like to seek a suitable project from one of the developer(s) and the community. The project should ideally be of beginner to intermediate level difficulty. A third of the participants will be of intermediate level programming skills with experience from Java, Python and Perl, while the rest will have beginner level skills. We were also wondering whether it would be possible to get one of the lead contributors to bioruby project to give a short 15-20 minute introductory talk to the participants. We have excellent video conferencing facilities at the ILRI/Beca hub. The event is slated to take place in late September. Thank you -- --------------- Sincerely George KEMRI/Wellcome-Trust Research Program Skype: george_g2 Blog: http://biorelated.wordpress.com/ From ralf at ark.in-berlin.de Tue Aug 10 09:37:35 2010 From: ralf at ark.in-berlin.de (Ralf Stephan) Date: Tue, 10 Aug 2010 11:37:35 +0200 Subject: [BioRuby] We have 2 and 1/2 days of spare time. Can we help? In-Reply-To: References: Message-ID: <0F6E98FF-4C66-4B5C-87AC-016D56B5A96D@ark.in-berlin.de> On Aug 10, 2010, at 9:37 AM, George Githinji wrote: > 1) Learn how to collaborate on bioinformatics programming projects > using open source tools. You might consider the realistic approach: 1A) Use bioruby for an interesting project 1B) Find bug (there is always one!) 1C) Fix bug 1D) Send patch (or use github, but why) This is exactly how your students will later be confronted with the possibility of Open Source collaboration.
Regards, Ralf Stephan http://www.ark.in-berlin.de pub 1024D/C5114CB2 2009-06-07 [expires: 2011-06-06] Key fingerprint = 76AE 0D21 C06C CBF9 24F8 7835 1809 DE97 C511 4CB2 From hlapp at drycafe.net Tue Aug 10 17:18:29 2010 From: hlapp at drycafe.net (Hilmar Lapp) Date: Tue, 10 Aug 2010 13:18:29 -0400 Subject: [BioRuby] development updates? In-Reply-To: References: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> Message-ID: <9C65C5A7-5105-4BE2-A395-BED6AEDF8847@drycafe.net> On Aug 10, 2010, at 12:02 AM, Anurag Priyam wrote: > I have been lurking around in Debian, and Git mailing list for a > while and nothing happens off the list there. How do you know that? -hilmar -- =========================================================== : Hilmar Lapp -:- Durham, NC -:- hlapp at drycafe dot net : =========================================================== From anurag08priyam at gmail.com Tue Aug 10 17:25:41 2010 From: anurag08priyam at gmail.com (Anurag Priyam) Date: Tue, 10 Aug 2010 22:55:41 +0530 Subject: [BioRuby] development updates? In-Reply-To: <9C65C5A7-5105-4BE2-A395-BED6AEDF8847@drycafe.net> References: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> <9C65C5A7-5105-4BE2-A395-BED6AEDF8847@drycafe.net> Message-ID: >> I have been lurking around in Debian, and Git mailing list for a while and >> nothing happens off the list there. > > > How do you know that? Not literally. Just be around on the git list and you will know what I mean. -- Anurag Priyam, 3rd Year Undergraduate, Department of Mechanical Engineering, IIT Kharagpur. +91-9775550642 From cjfields at illinois.edu Tue Aug 10 17:24:12 2010 From: cjfields at illinois.edu (Chris Fields) Date: Tue, 10 Aug 2010 12:24:12 -0500 Subject: [BioRuby] development updates? 
In-Reply-To: <9C65C5A7-5105-4BE2-A395-BED6AEDF8847@drycafe.net> References: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> <9C65C5A7-5105-4BE2-A395-BED6AEDF8847@drycafe.net> Message-ID: <991C4D70-3B10-4F78-8889-B522E358658B@illinois.edu> On Aug 10, 2010, at 12:18 PM, Hilmar Lapp wrote: > On Aug 10, 2010, at 12:02 AM, Anurag Priyam wrote: > >> I have been lurking around in Debian, and Git mailing list for a while and nothing happens off the list there. > > > How do you know that? > > -hilmar The mail list version of Schrödinger's cat? :> chris From pjotr.public14 at thebird.nl Tue Aug 10 18:00:24 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Tue, 10 Aug 2010 20:00:24 +0200 Subject: [BioRuby] development updates? In-Reply-To: References: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> <9C65C5A7-5105-4BE2-A395-BED6AEDF8847@drycafe.net> Message-ID: <20100810180024.GB9182@thebird.nl> On Tue, Aug 10, 2010 at 10:55:41PM +0530, Anurag Priyam wrote: > >> I have been lurking around in Debian, and Git mailing list for a while and > >> nothing happens off the list there. Debian Bio-Med is off-list. Debian has a board that discusses (partly) off-list. Ubuntu, a derivative of Debian, is off-list. Just examples. I am certain help pages and 'manuscripts' are not fully discussed on the list. Personal criticism would be off-list (normally). Especially with a gigantic project like Debian, it would be suicide to do everything via the list. Pj. From hlapp at drycafe.net Tue Aug 10 18:28:22 2010 From: hlapp at drycafe.net (Hilmar Lapp) Date: Tue, 10 Aug 2010 14:28:22 -0400 Subject: [BioRuby] development updates? In-Reply-To: References: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> <9C65C5A7-5105-4BE2-A395-BED6AEDF8847@drycafe.net> Message-ID: Hi Anurag: On Aug 10, 2010, at 1:25 PM, Anurag Priyam wrote: >>> I have been lurking around in Debian, and Git mailing list for a >>> while and >>> nothing happens off the list there. >> >> >> How do you know that?
> > Not literally. Just be around on the git list and you will know what > I mean. I am trying to make you take a step back and think. Please don't be evasive. You do not know what happens offline on that list, nor do you know for any other mailing list or community - it's by the definition of the word "off-line" that you don't know. Just because you see a lot happening on-list doesn't mean that a lot can't also happen off-list. What you encountered is that the nature of things discussed online and those discussed offline is not identical between, say, the bioruby list and the git list. But that's not a surprise - pick any two lists or communities and you will find a difference. That is because a community's practices are defined by the people who populate them, and different communities are populated by different people. Those differences are not bad; rather, I suggest you try to appreciate them. What should ultimately count is the success of the community in fostering coherence, and in creating useful software. If you feel that Bioruby is falling short on those counts, and that shifting the balance of things discussed onlist and offlist in a certain direction would help rectify that, then I think most if not everyone here would be curious to hear your specific thoughts. But just pointing to another community and saying it is different isn't very productive - it's just pointing out the expected. Cheers, -hilmar -- =========================================================== : Hilmar Lapp -:- Durham, NC -:- hlapp at drycafe dot net : =========================================================== From anurag08priyam at gmail.com Tue Aug 10 20:00:06 2010 From: anurag08priyam at gmail.com (Anurag Priyam) Date: Wed, 11 Aug 2010 01:30:06 +0530 Subject: [BioRuby] development updates? In-Reply-To: References: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> <9C65C5A7-5105-4BE2-A395-BED6AEDF8847@drycafe.net> Message-ID: > I am trying to make you take a step back and think. 
Please don't be evasive. > You do not know what happens offline on that list, nor do you know for any > other mailing list or community - it's by the definition of the word > "off-line" that you don't know. Just because you see a lot happening on-list > doesn't mean that a lot can't also happen off-list. > > What you encountered is that the nature of things discussed online and those > discussed offline is not identical between, say, the bioruby list and the > git list. But that's not a surprise - pick any two lists or communities and > you will find a difference. That is because a community's practices are > defined by the people who populate them, and different communities are > populated by different people. > > > Those differences are not bad; rather, I suggest you try to appreciate them. > What should ultimately count is the success of the community in fostering > coherence, and in creating useful software. > Right. I understand your point and I am not being critical of the differences. I have tried to explain my point below. > If you feel that Bioruby is falling short on those counts, and that shifting > the balance of things discussed onlist and offlist in a certain direction > would help rectify that, then I think most if not everyone here would be > curious to hear your specific thoughts. But just pointing to another > community and saying it is different isn't very productive - it's just > pointing out the expected. I do not feel that BioRuby is falling short on any counts. The gist of my mail was only this : "Hey, guys on this list they do this X. I think that X is cool. *Maybe* you guys will like it too. *Maybe* we could do it too." I was just being enthusiastic about that X, and wanted to *share* it. However things have gone totally awry. Maybe the way I essayed it down was wrong, or maybe I failed to realize that I should be a little more formal here. Or, maybe just that I should grow up a little more. I tend to be overly enthusiastic about things.
In any case, it is my mistake and will you all please forgive me about it? -- Anurag Priyam, 3rd Year Undergraduate, Department of Mechanical Engineering, IIT Kharagpur. +91-9775550642 From pjotr.public14 at thebird.nl Tue Aug 10 20:15:24 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Tue, 10 Aug 2010 22:15:24 +0200 Subject: [BioRuby] development updates? In-Reply-To: References: <424B26DB-D7B4-48DB-8A76-F6029998ABAB@drycafe.net> <9C65C5A7-5105-4BE2-A395-BED6AEDF8847@drycafe.net> Message-ID: <20100810201524.GA12462@thebird.nl> > I have been lurking around in Debian, and Git mailing list for a > while and nothing happens off the list there. Ultimately code talks. As Hilmar points out. In OSS a successful project is one that remains relevant. Refrain: We are all here to keep BioRuby relevant. That is the point of this mailing list. > I was just being enthusiastic about that X, and wanted to *share* > it. However things have gone totally awry. Maybe the way I essayed > it down was wrong, or maybe I failed to realize that I should be a > little more formal here. Or, maybe just that I should grow up a > little more. I tend to be overly enthusiastic about things. In any > case, it is my mistake and will you all please forgive me about it? No need to forgive you for being enthusiastic. Enthusiasm is cool. It would be great if you use your energy to run with an aspect of your ideas. For one, documentation needs work. Prioritizing work and distribution of work would be good too (if you can get people to agree). The finding of synergy in a project is laudable. Some projects benefit from strong leadership - like the Linux kernel. Note that strong leadership comes with respect - and in OSS that is based (again) on code. Meanwhile, a problem, as Chris pointed out, is that everyone is really busy. Programming happens in short bursts of activity, and tends to be kinda 'ad hoc'. 
I think it is actually pretty amazing that we have good individuals who modify 'ad hoc' code for broader use, so it becomes available to more people. Refrain: We are all here to keep BioRuby relevant. That is the point of this mailing list. Pj. From georgkam at gmail.com Wed Aug 11 05:53:12 2010 From: georgkam at gmail.com (George Githinji) Date: Wed, 11 Aug 2010 08:53:12 +0300 Subject: [BioRuby] We have 2 and 1/2 days of spare time. Can we help? In-Reply-To: <0F6E98FF-4C66-4B5C-87AC-016D56B5A96D@ark.in-berlin.de> References: <0F6E98FF-4C66-4B5C-87AC-016D56B5A96D@ark.in-berlin.de> Message-ID: Thanks Ralf. I appreciate your advice. On Tue, Aug 10, 2010 at 12:37 PM, Ralf Stephan wrote: > > On Aug 10, 2010, at 9:37 AM, George Githinji wrote: >> 1) Learn how to collaborate on bioinformatics programming projects >> using open source tools. > > You might consider the realistic approach: > 1A) Use bioruby for an interesting project > 1B) Find bug (there is always one!) > 1C) Fix bug > 1D) Send patch (or use github, but why) > > This is exactly how your students will later be confronted > with the possibility of Open Source collaboration. > > Regards, > > Ralf Stephan > http://www.ark.in-berlin.de > pub 1024D/C5114CB2 2009-06-07 [expires: 2011-06-06] > Key fingerprint = 76AE 0D21 C06C CBF9 24F8 7835 1809 DE97 C511 4CB2 > > > > > -- --------------- Sincerely George KEMRI/Wellcome-Trust Research Program Skype: george_g2 Blog: http://biorelated.wordpress.com/ From pjotr.public14 at thebird.nl Thu Aug 12 14:30:12 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Thu, 12 Aug 2010 16:30:12 +0200 Subject: [BioRuby] GFF3 Message-ID: <20100812143012.GA31206@thebird.nl> I intend to use GFF3 and document its use. In my gff3 github branch (see http://github.com/pjotrp/bioruby/tree/gff3) I have just added a first example for fetching sequence data from GFF3.
First I took an example from Lincoln Stein (in his BioPerl repository) and stuck that in ./test/data/gff/test.gff3. This data contains empty lines - so I modified the GFF3 parser to ignore those. Before I continue, I also wonder about the wisdom of including a Bio::FastaFormat record *inside* a Bio::Sequence record. This duplicates the @definition with @entry_id. Not only that, the sequence contains white space, which does not match GFF's positioning data: #> Now, to print FASTA I now do: gff3.sequences.each do | item | print item.to_fasta(item.entry_id, 70) end (to_fasta is being deprecated) To get a FASTA sequence I would like to do the sane: gff3.sequences.each do | item | rec = Bio::FastaFormat.new('> '+item.definition.strip+"\n"+item.data) print rec end where item.data is just the clean sequence. The current implementation is rather uninituitive. I realise GFF3 contains FASTA, but there is no reason to store it like that. How about removing the contained Bio::FastaFormat and just use a sequence string? And remove the white space by default? It does also away with FASTA formatting - the to_fasta in GFF3. I can make the changes, if you agree. Pj. From mh6 at sanger.ac.uk Thu Aug 12 14:42:23 2010 From: mh6 at sanger.ac.uk (Michael Paulini) Date: Thu, 12 Aug 2010 15:42:23 +0100 Subject: [BioRuby] GFF3 In-Reply-To: <20100812143012.GA31206@thebird.nl> References: <20100812143012.GA31206@thebird.nl> Message-ID: <4C64084F.7090905@sanger.ac.uk> Pjotr, are you coming over to the GMOD meeting in Cambridge? Because if we need/want to make changes to teh GFF3 specifications, we could discuss it there, as some from Lincoln's group will also be there. ... and yes, the inlined fasta at the end is not a perfect solution. Michael On 12/08/10 15:30, Pjotr Prins wrote: > I intend to use GFF3 and document its use. > > In my gff3 github branch (see http://github.com/pjotrp/bioruby/tree/gff3) I > have just added a first example for fetching sequence data from GFF3. 
First I > took an example from Lincoln Stein (in his BioPerl repository) and stuck that > in ./test/data/gff/test.gff3. This data contains empty lines - so I modified > the GFF3 parser to ignore those. > > Before I continue, I also wonder about the wisdom of including a > Bio::FastaFormat record *inside* a Bio::Sequence record. This duplicates the > @definition with @entry_id. Not only that, the sequence contains white space, > which does not match GFF's positioning data: > > # @source_data=# @data="\nACGAAGATTTGTATGACTGATTTATCCTGGACAGGCATTGGTCAGATGTCTCCTTCCGTATCGTCGTTTA\nGTTGCAAATCCGAGTGTTCGGGGGTATTGCTATTTGCCACCTAGAAGCGCAACATGCCCAGCTTCACACA\nCCATAGCGAACACGCCGCCCCGGTGGCGACTATCGGTCGAAGTTAAGACAATTCATGGGCGAAACGAGAT\nAATGGGTACTGCACCCCTCGTCCTGTAGAGACGTCACAGCCAACGTGCCTTCTTATCTTGATACATTAGT\nGCCCAAGAATGCGATCCCAGAAGTCTTGGTTCTAAAGTCGTCGGAAAGATTTGAGGAACTGCCATACAGC\nCCGTGGGTGAAACTGTCGACATCCATTGTGCGAATAGGCCTGCTAGTGAC\n\n", > @definition="test01">> > > Now, to print FASTA I now do: > > gff3.sequences.each do | item | > print item.to_fasta(item.entry_id, 70) > end > > (to_fasta is being deprecated) > > To get a FASTA sequence I would like to do the sane: > > gff3.sequences.each do | item | > rec = Bio::FastaFormat.new('> '+item.definition.strip+"\n"+item.data) > print rec > end > > where item.data is just the clean sequence. > > The current implementation is rather uninituitive. I realise GFF3 contains > FASTA, but there is no reason to store it like that. How about removing the > contained Bio::FastaFormat and just use a sequence string? And remove the white > space by default? > > It does also away with FASTA formatting - the to_fasta in GFF3. > > I can make the changes, if you agree. > > Pj. 
> > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby -- The Wellcome Trust Sanger Institute is operated by Genome Research Limited, a charity registered in England with number 1021457 and a company registered in England with number 2742969, whose registered office is 215 Euston Road, London, NW1 2BE. From ngoto at gen-info.osaka-u.ac.jp Thu Aug 12 15:12:05 2010 From: ngoto at gen-info.osaka-u.ac.jp (Naohisa GOTO) Date: Fri, 13 Aug 2010 00:12:05 +0900 Subject: [BioRuby] GFF3 In-Reply-To: <20100812143012.GA31206@thebird.nl> References: <20100812143012.GA31206@thebird.nl> Message-ID: <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> Hi, On Thu, 12 Aug 2010 16:30:12 +0200 Pjotr Prins wrote: > I intend to use GFF3 and document its use. > > In my gff3 github branch (see http://github.com/pjotrp/bioruby/tree/gff3) I > have just added a first example for fetching sequence data from GFF3. First I > took an example from Lincoln Stein (in his BioPerl repository) and stuck that > in ./test/data/gff/test.gff3. Could you please tell me the complete URL of the Lincoln's test data? Why I'd like to know the origin is: I submitted the test.gff3 to the GFF3 Validator, (http://modencode.oicr.on.ca/cgi-bin/validate_gff3_online ) and it is reported as "Invalid". So, I'd like to know if this is intended or not, and that best way to know that is seeing the file's development history. > This data contains empty lines - so I modified > the GFF3 parser to ignore those. How to treat empty lines is undefined in the GFF3 spec. (http://www.sequenceontology.org/gff3.shtml) It may be good to ignore empty lines. > Before I continue, I also wonder about the wisdom of including a > Bio::FastaFormat record *inside* a Bio::Sequence record. This duplicates the > @definition with @entry_id. 
Not only that, the sequence contains white space, > which does not match GFF's positioning data: > > # @source_data=# @data="\nACGAAGATTTGTATGACTGATTTATCCTGGACAGGCATTGGTCAGATGTCTCCTTCCGTATCGTCGTTTA\nGTTGCAAATCCGAGTGTTCGGGGGTATTGCTATTTGCCACCTAGAAGCGCAACATGCCCAGCTTCACACA\nCCATAGCGAACACGCCGCCCCGGTGGCGACTATCGGTCGAAGTTAAGACAATTCATGGGCGAAACGAGAT\nAATGGGTACTGCACCCCTCGTCCTGTAGAGACGTCACAGCCAACGTGCCTTCTTATCTTGATACATTAGT\nGCCCAAGAATGCGATCCCAGAAGTCTTGGTTCTAAAGTCGTCGGAAAGATTTGAGGAACTGCCATACAGC\nCCGTGGGTGAAACTGTCGACATCCATTGTGCGAATAGGCCTGCTAGTGAC\n\n", > @definition="test01">> You can see that FastaFormat object is stored in the @source_data. It will be parsed only when the sequence is really needed. This is a kind of lazy evaluation. Please execute puts gff3.sequences[0][0..100] and report what sequence is shown. > Now, to print FASTA I now do: > > gff3.sequences.each do | item | > print item.to_fasta(item.entry_id, 70) > end gff3.sequences.each do | item | print item.output(:fasta) end -- Naohisa Goto ngoto at gen-info.osaka-u.ac.jp / ng at bioruby.org From pjotr.public14 at thebird.nl Thu Aug 12 16:10:16 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Thu, 12 Aug 2010 18:10:16 +0200 Subject: [BioRuby] GFF3 In-Reply-To: <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> Message-ID: <20100812161016.GA32552@thebird.nl> On Fri, Aug 13, 2010 at 12:12:05AM +0900, Naohisa GOTO wrote: > Could you please tell me the complete URL of the Lincoln's > test data? Why I'd like to know the origin is: > I submitted the test.gff3 to the GFF3 Validator, > (http://modencode.oicr.on.ca/cgi-bin/validate_gff3_online ) > and it is reported as "Invalid". So, I'd like to know if this > is intended or not, and that best way to know that is seeing > the file's development history. 
proper test data for the module by Lincoln: http://github.com/bioperl/bioperl-live/blob/master/t/data/biodbgff/test.gff3 > > This data contains empty lines - so I modified > > the GFF3 parser to ignore those. > > How to treat empty lines is undefined in the GFF3 spec. > (http://www.sequenceontology.org/gff3.shtml) > It may be good to ignore empty lines. I think so. > > Before I continue, I also wonder about the wisdom of including a > > Bio::FastaFormat record *inside* a Bio::Sequence record. This duplicates the > > @definition with @entry_id. Not only that, the sequence contains white space, > > which does not match GFF's positioning data: > > > > # > @source_data=# > @data="\nACGAAGATTTGTATGACTGATTTATCCTGGACAGGCATTGGTCAGATGTCTCCTTCCGTATCGTCGTTTA\nGTTGCAAATCCGAGTGTTCGGGGGTATTGCTATTTGCCACCTAGAAGCGCAACATGCCCAGCTTCACACA\nCCATAGCGAACACGCCGCCCCGGTGGCGACTATCGGTCGAAGTTAAGACAATTCATGGGCGAAACGAGAT\nAATGGGTACTGCACCCCTCGTCCTGTAGAGACGTCACAGCCAACGTGCCTTCTTATCTTGATACATTAGT\nGCCCAAGAATGCGATCCCAGAAGTCTTGGTTCTAAAGTCGTCGGAAAGATTTGAGGAACTGCCATACAGC\nCCGTGGGTGAAACTGTCGACATCCATTGTGCGAATAGGCCTGCTAGTGAC\n\n", > > @definition="test01">> > > You can see that FastaFormat object is stored in the @source_data. > It will be parsed only when the sequence is really needed. > This is a kind of lazy evaluation. Very lazy ;) But duplication of ID and containment of extraneous information. Not so efficient with space. We may want to change that. The main problem is that it is not intuitive to have a FastaFormat inside a Sequence object. But that could be just me. > Please execute > puts gff3.sequences[0][0..100] > and report what sequence is shown. > > > Now, to print FASTA I now do: > > > > gff3.sequences.each do | item | > > print item.to_fasta(item.entry_id, 70) > > end > > gff3.sequences.each do | item | > print item.output(:fasta) > end I should have known ;) Pj. 
From pjotr.public14 at thebird.nl Thu Aug 12 16:12:13 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Thu, 12 Aug 2010 18:12:13 +0200 Subject: [BioRuby] GFF3 In-Reply-To: <4C64084F.7090905@sanger.ac.uk> References: <20100812143012.GA31206@thebird.nl> <4C64084F.7090905@sanger.ac.uk> Message-ID: <20100812161213.GB32552@thebird.nl> On Thu, Aug 12, 2010 at 03:42:23PM +0100, Michael Paulini wrote: > are you coming over to the GMOD meeting in Cambridge? I can't make it. Maybe next time. Pj. From mail at maasha.dk Fri Aug 13 12:25:46 2010 From: mail at maasha.dk (Martin Asser Hansen) Date: Fri, 13 Aug 2010 14:25:46 +0200 Subject: [BioRuby] Benchmarking FASTA file parsing Message-ID: Hello, I am new to Ruby and was testing bioruby (1.4.0) for parsing FASTA files. A rough comparison with Perl indicated that the bioruby parser was slow. Now I have hacked a parser of my own in Ruby in order to benchmark the bioruby parser. The result is disappointing -> my hack is roughly 3x faster. Admittedly, my hack should probably do a bit of format consistency checking, but that will only take a few % off the speed. Could someone explain why the bioruby parser is so slow? Is it possible to optimize the code without major rewriting? Here is the benchmark result: user system total real Hack 5.440000 0.010000 5.450000 ( 5.494207) Bio 18.410000 0.020000 18.430000 ( 18.579867) The code is shown below. Cheers, Martin #!/usr/bin/env ruby require 'stringio' require 'bio' require 'benchmark' class Fasta include Enumerable def initialize(io) @io = io end def each while entry = get_entry do yield entry end end def get_entry block = @io.gets("\n>") return nil if block.nil? 
block.chomp!("\n>") block.sub!( /^\s|^>/, "") (seq_name, seq) = block.split("\n", 2) seq.gsub!(/\s/, "") entry = {} entry[:seq_name] = seq_name entry[:seq] = seq entry end end data = <5_gECOjxwXsN1/1 AACGNTACTATCGTGACATGCGTGCAGGATTACAC >3_8ICOjxwXsN1/1 ACTCNAGGGTTCGATTCCCTTCAACCGCCCCATAA >3_GUCOjxwXsN1/1 TTGCNTCCTTCTTCTGCCTTCGTTGGCTCAGATTG >5_BWCOjxwXsN1/1 TATATACAGGAATCCATTGTTGTTTAGATTCAGTT >7_NZCOjxwXsN1/1 AGGTGATCCAGCCGCACCTTCCGATACGGCTACCT >3_2VCOjxwXsN1/1 CTTTTCCAGGTGTGTAGACATCTTCACCCATTAAG >5_kVCOjxwXsN1/1 CTACACCTAAGTTACATCGTCCATTATTTTCCAAT >1_GbCOjxwXsN1/1 CCAGACAACTAGGATGTTGGCTTAGAAGCAGCCAT >5_fTCOjxwXsN1/1 TTAGCTTTAACCATTTTCTTTTTGTCTAAAGCAAA >3_VWCOjxwXsN1/1 TTATGATGCGCGTGGCGAACGTGAACGCGTTAAAC DATA io1 = StringIO.new(data) io2 = StringIO.new(data) fasta1 = Fasta.new(io1) fasta2 = Bio::FastaFormat.open(io2) Benchmark.bm(5) do |timer| timer.report('Hack') { 10_000_000.times { fasta1.each { |entry1| } } } timer.report('Bio') { 10_000_000.times { fasta2.each { |entry2| } } } end From tomoakin at kenroku.kanazawa-u.ac.jp Fri Aug 13 13:37:06 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Fri, 13 Aug 2010 22:37:06 +0900 Subject: [BioRuby] Benchmarking FASTA file parsing In-Reply-To: References: Message-ID: <363FADBF-2A69-480B-AA61-AFAA2013FB0D@kenroku.kanazawa-u.ac.jp> Hi, The benchmark is interesting. > Is it possible to optimize the code without major rewriting? Using ruby 1.9.2 (RC2) makes it 2.6 times faster without any rewriting the bioruby parser code :) compared to ruby-1.8 (1.8.7-p299). $ ~/ruby192/bin/ruby benchfasta user system total real Hack 3.800000 0.000000 3.800000 ( 3.800830) Bio 13.090000 0.000000 13.090000 ( 13.095722) $ ~/ruby187/bin/ruby benchfasta user system total real Hack 7.460000 0.000000 7.460000 ( 7.456281) Bio 34.670000 0.000000 34.670000 ( 34.680271) As you stated 3 times faster with the hack, you may be already using ruby 1.9. 
Anyway, I think 13 or 18 seconds for 100 M entries is fast enough and this part will not be the bottleneck of any application. How fast do you need it to be?
------------------------------------------------------------ file = "genes.pep" io1 = File.open(file) io2 = file fasta1 = Fasta.new(io1) fasta2 = Bio::FlatFile.auto(io2) c1 = 0 c2 = 0 Benchmark.bm(5) do |timer| timer.report('Hack') { 1.times { fasta1.each { |entry1| c1 += 1; $stderr.print c1, "\t", entry1[:seq_name][/^\S+/], "\t", entry1[:seq].length, "\n" } } } timer.report('Bio') { 1.times { fasta2.each { |entry2| c2 += 1; $stderr.print c2, "\t", entry2.entry_id, "\t", entry2.length, "\n" } } } end ------------------------------------------------------------ Then, your code took 3 min (sounds great!) and the current BioRuby implementation took 9 min. % ruby-1.8 benchfasta.rb genes.pep user system total real Hack 146.180000 27.820000 174.000000 (191.343770) Bio 480.940000 38.060000 519.000000 (557.216022) It could be painful if you need to deal with more sequences, however, please note that the number of whole protein entries in UniProt (which is believed to contain known protein universe to date) is only twice larger than the KEGG (which covers almost all protein sequences in >1200 completed genomes). http://www.expasy.org/sprot/relnotes/relstat.html http://www.ebi.ac.uk/uniprot/TrEMBLstats/ http://www.genome.jp/en/db_growth.html#genes > Is it possible to optimize the code without major rewriting? Of course, it would be great if you could contribute improved codes or suggest some possible ways to optimize the current implementation. Regards, Toshiaki Katayama, just back from summer vacation ;-) On 2010/08/13, at 21:25, Martin Asser Hansen wrote: > Hello, > > > I am new to Ruby and was testing bioruby (1.4.0) for parsing FASTA files. A > rough comparison with Perl indicated that the bioruby parser was slow. Now I > have hacked a parser of my own in Ruby in order to benchmark the bioruby > parser. The result is disappointing -> my hack is roughly 3x faster. 
> Admittedly, my hack should probably do a bit of format consistency checking, > but that will only take a few % off the speed. > > Could someone explain why the bioruby parser is so slow? > > Is it possible to optimize the code without major rewriting? > > Here is the benchmark result: > > user system total real > Hack 5.440000 0.010000 5.450000 ( 5.494207) > Bio 18.410000 0.020000 18.430000 ( 18.579867) > > > The code is shown below. > > Cheers, > > > Martin > > #!/usr/bin/env ruby > > require 'stringio' > require 'bio' > require 'benchmark' > > class Fasta > include Enumerable > > def initialize(io) > @io = io > end > > def each > while entry = get_entry do > yield entry > end > end > > def get_entry > block = @io.gets("\n>") > return nil if block.nil? > > block.chomp!("\n>") > block.sub!( /^\s|^>/, "") > > (seq_name, seq) = block.split("\n", 2) > seq.gsub!(/\s/, "") > > entry = {} > entry[:seq_name] = seq_name > entry[:seq] = seq > entry > end > end > > data = <> 5_gECOjxwXsN1/1 > AACGNTACTATCGTGACATGCGTGCAGGATTACAC >> 3_8ICOjxwXsN1/1 > ACTCNAGGGTTCGATTCCCTTCAACCGCCCCATAA >> 3_GUCOjxwXsN1/1 > TTGCNTCCTTCTTCTGCCTTCGTTGGCTCAGATTG >> 5_BWCOjxwXsN1/1 > TATATACAGGAATCCATTGTTGTTTAGATTCAGTT >> 7_NZCOjxwXsN1/1 > AGGTGATCCAGCCGCACCTTCCGATACGGCTACCT >> 3_2VCOjxwXsN1/1 > CTTTTCCAGGTGTGTAGACATCTTCACCCATTAAG >> 5_kVCOjxwXsN1/1 > CTACACCTAAGTTACATCGTCCATTATTTTCCAAT >> 1_GbCOjxwXsN1/1 > CCAGACAACTAGGATGTTGGCTTAGAAGCAGCCAT >> 5_fTCOjxwXsN1/1 > TTAGCTTTAACCATTTTCTTTTTGTCTAAAGCAAA >> 3_VWCOjxwXsN1/1 > TTATGATGCGCGTGGCGAACGTGAACGCGTTAAAC > DATA > > io1 = StringIO.new(data) > io2 = StringIO.new(data) > fasta1 = Fasta.new(io1) > fasta2 = Bio::FastaFormat.open(io2) > > Benchmark.bm(5) do |timer| > timer.report('Hack') { 10_000_000.times { fasta1.each { |entry1| } } } > timer.report('Bio') { 10_000_000.times { fasta2.each { |entry2| } } } > end > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at 
lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby From ngoto at gen-info.osaka-u.ac.jp Fri Aug 13 14:47:35 2010 From: ngoto at gen-info.osaka-u.ac.jp (Naohisa GOTO) Date: Fri, 13 Aug 2010 23:47:35 +0900 Subject: [BioRuby] Benchmarking FASTA file parsing In-Reply-To: References: Message-ID: <20100813144735.B60FF1CBC5AA@idnmail.gen-info.osaka-u.ac.jp> Hi, On Fri, 13 Aug 2010 14:25:46 +0200 Martin Asser Hansen wrote: > io1 = StringIO.new(data) > io2 = StringIO.new(data) > fasta1 = Fasta.new(io1) > fasta2 = Bio::FastaFormat.open(io2) > > Benchmark.bm(5) do |timer| > timer.report('Hack') { 10_000_000.times { fasta1.each { |entry1| } } } > timer.report('Bio') { 10_000_000.times { fasta2.each { |entry2| } } } > end To rewind the IO (StringIO or Bio::FlatFile object) every time after reading will be needed during the benchmark. #(snip) Benchmark.bm(5) do |timer| timer.report('Hack') { 10_000_000.times { fasta1.each { |entry1| }; io1.rewind } } timer.report('Bio') { 10_000_000.times { fasta2.each { |entry2| }; fasta2.rewind } } end Why using "fasta2.rewind" instead of "io2.rewind" is that the "fasta2" is an instance of Bio::FlatFile, IO wrapper used in BioRuby, and to keep consistency of information inside the wrapper, it is recommended using fasta2.rewind rather than io2.rewind. I applied above changes, and reduced iteration count to 100,000 times, and get the result with the same tendency. (ruby 1.8.7-p299 (debian Squeeze 1.8.7.299-1)) user system total real Hack 7.240000 0.160000 7.400000 ( 7.390807) Bio 23.250000 0.850000 24.100000 ( 24.100267) (ruby 1.9.1-p243 with env LANG=C) user system total real Hack 5.600000 0.010000 5.610000 ( 5.605175) Bio 15.920000 0.000000 15.920000 ( 15.917899) With E.coli genome ORF data, the difference become smaller, especially in Ruby 1.9.1. 
(snip) # ftp://ftp.ncbi.nih.gov:/genbank/genomes/Bacteria/Escherichia_coli_K_12_substr__MG1655/U00096.ffn io1 = File.open('U00096.ffn') io2 = File.open('U00096.ffn') fasta1 = Fasta.new(io1) fasta2 = Bio::FastaFormat.open(io2) Benchmark.bm(5) do |timer| timer.report('Hack') { 100.times { fasta1.each { |entry1| }; io1.rewind } } timer.report('Bio') { 100.times { fasta2.each { |entry2| }; fasta2.rewind } } end (ruby 1.8.7-p299) user system total real Hack 8.340000 0.140000 8.480000 ( 8.492107) Bio 13.480000 0.520000 14.000000 ( 13.998213) (Ruby 1.9.1-p243 with env LANG=C) user system total real Hack 9.130000 0.140000 9.270000 ( 9.270361) Bio 9.380000 0.180000 9.560000 ( 9.565899) -- Naohisa Goto ngoto at gen-info.osaka-u.ac.jp / ng at bioruby.org From mail at maasha.dk Fri Aug 13 14:51:43 2010 From: mail at maasha.dk (Martin Asser Hansen) Date: Fri, 13 Aug 2010 16:51:43 +0200 Subject: [BioRuby] Benchmarking FASTA file parsing In-Reply-To: <363FADBF-2A69-480B-AA61-AFAA2013FB0D@kenroku.kanazawa-u.ac.jp> References: <363FADBF-2A69-480B-AA61-AFAA2013FB0D@kenroku.kanazawa-u.ac.jp> Message-ID: > > > As you stated 3 times faster with the hack, you may be already using ruby > 1.9. > > I am using ruby 1.9.1, and I am using a fairly fast computer, but I am actually questioning the quality of the code. > Anyway, I think 13 or 18 seconds for 100 M entry is fast enough and this > part will not be the bottle neck of any application. > How fast do you need it be? > Mind you that the Benchmark is performed on StringIO data, and that the script does not touch the disk! In a real test, it will be much slower! I did not test on real data and more speed issues may surface (I have no idea how Ruby's file buffering compares to Perl's, performance-wise). I was contemplating porting some Biopieces (www.biopieces.org) from Perl to Ruby. Biopieces are used for everyday slicing and dicing of all sorts of biological data in a very simple and flexible manner. 
While Biopieces are not as fast as dedicated scripts, they are fast enough for convenient analysis of NGS data, but I will not accept a +300% speed penalty (i.e. read_fasta). I have been trying to get an overview of the code in Bio::FastaFormat, but I find it hard to read (that could be because I am not used to Ruby, or OO for that matter). It strikes me that the FastaFormat class does a number of irrelevant things like subparsing comments when not strictly necessary. In fact, the FASTA format actually don't use comments prefixed with # (semicolon can be used, but I will strongly advice against it since most software don't deal with it). Also, parsing is dependent on the record separator being '\n' - that could be considered a bug. There seem to be an overuse of substitutions, transliterations and regex matching. How about keeping it nice an tight? ala: SEP = $/ FASTA_REGEX = /\s*>?([^#{SEP}]+)#{SEP}(.+)>?$/ def get_entry block = @io.gets(SEP + ">") return nil if block.nil? if block =~ FASTA_REGEX seq_name = $1 seq = $2 else raise "Bad FASTA entry->#{block}" end seq.gsub!(/\s/, "") end Cheers, Martin > -- > Tomoaki NISHIYAMA > > Advanced Science Research Center, > Kanazawa University, > 13-1 Takara-machi, > Kanazawa, 920-0934, Japan > > From tomoakin at kenroku.kanazawa-u.ac.jp Sat Aug 14 03:42:07 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Sat, 14 Aug 2010 12:42:07 +0900 Subject: [BioRuby] Benchmarking FASTA file parsing In-Reply-To: References: <363FADBF-2A69-480B-AA61-AFAA2013FB0D@kenroku.kanazawa-u.ac.jp> Message-ID: <46A4F739-340A-4104-AB45-A9E69281D7B3@kenroku.kanazawa-u.ac.jp> Hi, > Mind you that the Benchmark is performed on StringIO data, and that > the script does not touch the disk! > In a real test, it will be much slower! My initial thought was :- That's true, and therefore the pure parser part which runs fairly fast with O(N) is not the primary problem. 
If you push the entries into a hash it will be much more time consuming. But realized that its two orders slower... (due to the benchmark code as pointed out by goto-san) 20 min for 100 M could be painful. > I have been trying to get an overview of the code in Bio::FastaFormat, > but I find it hard to read (that could be because I am not used to > Ruby, or OO for that matter). For one thing, the Bio::FastaFormat is designed to work with Bio::FlatFile. If you write a dedicated fasta parser that could run much faster. # I would write C codes for a very simple operation on NGS data. # That will run 100 times faster. # When the necessary operation is a bit more complex, I would use ruby. much much more time consuming.... Perhaps the target is to process about 20 ~ 1000 M reads with each of them having 25 to 150 nt for the time being. Thats quite different situation compared to process the ~ 0.1 M entry of 50-10000 aa residues or nucleotides in a genome. The relative cost for the entry separation becomes higher compared with the sequence processing within the entry. So, it may worth to write NGS dedicated parser rather than sticking on FlatFile. Playing around the benchmark, about the half of execution time is for garbage collection, and the order of execution is somewhat relevant to get the number. If you can suppress unnecessary object generation to the minimum and disable GC, that will perhaps make it run much faster. 
$ diff -u benchfasta benchfasta-hash-GC-b --- benchfasta 2010-08-13 21:45:21.000000000 +0900 +++ benchfasta-hash-GC-b 2010-08-14 11:53:20.000000000 +0900 @@ -34,6 +34,9 @@ end end +count = ARGV.shift.to_i +count = 2 if count == nil + data = <5_gECOjxwXsN1/1 AACGNTACTATCGTGACATGCGTGCAGGATTACAC @@ -57,12 +60,23 @@ TTATGATGCGCGTGGCGAACGTGAACGCGTTAAAC DATA -io1 = StringIO.new(data) -io2 = StringIO.new(data) +io0 = StringIO.new(data * count) +io1 = StringIO.new(data * count) +io2 = StringIO.new(data * count) +fasta0 = Fasta.new(io0) fasta1 = Fasta.new(io1) fasta2 = Bio::FastaFormat.open(io2) -Benchmark.bm(5) do |timer| - timer.report('Hack') { 10_000_000.times { fasta1.each { | entry1| } } } - timer.report('Bio') { 10_000_000.times { fasta2.each { | entry2| } } } +hash0=Hash.new +hash1=Hash.new +hash2=Hash.new + +Benchmark.bm(8) do |timer| + GC.enable;GC.start;GC.disable; + timer.report('Bio') { i=0; fasta2.each { |entry2| i+=1; hash2 [entry2.definition + i.to_s] = entry2.seq[2..25]} } + hash2 = nil; GC.enable;GC.start;GC.disable; + timer.report('Hack') { i=0; fasta0.each { |entry1| i+=1; hash0 [entry1[:seq_name] + i.to_s] = entry1[:seq][2..25]} } + hash0 = nil; GC.enable;GC.start;GC.disable; + timer.report('Hack-seq') { i=0; fasta1.each { |entry1| i+=1; hash1 [entry1[:seq_name] + i.to_s] = Bio::Sequence::NA.new(entry1[:seq]) [2..25]} } + hash1 = nil; GC.enable;GC.start;GC.disable; end -- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan On 2010/08/13, at 23:51, Martin Asser Hansen wrote: > > As you stated 3 times faster with the hack, you may be already > using ruby 1.9. > > > I am using ruby 1.9.1, and I am using a fairly fast computer, but I > am actually questioning the quality of the code. > > Anyway, I think 13 or 18 seconds for 100 M entry is fast enough and > this > part will not be the bottle neck of any application. > How fast do you need it be? 
> > Mind you that the Benchmark is performed on StringIO data, and that > the script does not touch the disk! In a real test, it will be much > slower! I did not test on real data and more speed issues may > surface (I have no idea how Ruby's file buffering compares to > Perl's, performance-wise). > > I was contemplating porting some Biopieces (www.biopieces.org) from > Perl to Ruby. Biopieces are used for everyday slicing and dicing of > all sorts of biological data in a very simple and flexible manner. > While Biopieces are not as fast as dedicated scripts, they are fast > enough for convenient analysis of NGS data, but I will not accept a > +300% speed penalty (i.e. read_fasta). > > I have been trying to get an overview of the code in > Bio::FastaFormat, but I find it hard to read (that could be because > I am not used to Ruby, or OO for that matter). It strikes me that > the FastaFormat class does a number of irrelevant things like > subparsing comments when not strictly necessary. In fact, the FASTA > format actually don't use comments prefixed with # (semicolon can > be used, but I will strongly advice against it since most software > don't deal with it). Also, parsing is dependent on the record > separator being '\n' - that could be considered a bug. There seem > to be an overuse of substitutions, transliterations and regex > matching. How about keeping it nice an tight? ala: > > SEP = $/ > FASTA_REGEX = /\s*>?([^#{SEP}]+)#{SEP}(.+)>?$/ > > def get_entry > block = @io.gets(SEP + ">") > return nil if block.nil? 
> > if block =~ FASTA_REGEX > seq_name = $1 > seq = $2 > else > raise "Bad FASTA entry->#{block}" > end > > seq.gsub!(/\s/, "") > end > > > Cheers, > > > Martin > > -- > Tomoaki NISHIYAMA > > Advanced Science Research Center, > Kanazawa University, > 13-1 Takara-machi, > Kanazawa, 920-0934, Japan > > From mail at maasha.dk Sat Aug 14 08:21:39 2010 From: mail at maasha.dk (Martin Asser Hansen) Date: Sat, 14 Aug 2010 10:21:39 +0200 Subject: [BioRuby] Benchmarking FASTA file parsing In-Reply-To: <46A4F739-340A-4104-AB45-A9E69281D7B3@kenroku.kanazawa-u.ac.jp> References: <363FADBF-2A69-480B-AA61-AFAA2013FB0D@kenroku.kanazawa-u.ac.jp> <46A4F739-340A-4104-AB45-A9E69281D7B3@kenroku.kanazawa-u.ac.jp> Message-ID: I was hoping for an easy to use generic FASTA parser in bioruby. I think it would be confusing with two flavors of parsers for short/long entries. Also, I think that with a minor effort the existing parser could be optimized a fair bit. Subparsing of the defline should not be done for generic parsing, but rather when needed. Without any experience I think that disabling GC sounds like a bad idea. Of cause C is always faster, but Ruby is nicer. Cheers, Martin On Sat, Aug 14, 2010 at 5:42 AM, Tomoaki NISHIYAMA < tomoakin at kenroku.kanazawa-u.ac.jp> wrote: > Hi, > > Mind you that the Benchmark is performed on StringIO data, and that the > script does not touch the disk! > > In a real test, it will be much slower! > > > My initial thought was :- That's true, and therefore the pure parser part > which runs fairly fast with O(N) > is not the primary problem. If you push the entries into a hash it will be > much more time consuming. > > But realized that its two orders slower... (due to the benchmark code as > pointed out by goto-san) > 20 min for 100 M could be painful. > > I have been trying to get an overview of the code in Bio::FastaFormat, > > but I find it hard to read (that could be because I am not used to Ruby, or > OO for that matter). 
> > > For one thing, the Bio::FastaFormat is designed to work with Bio::FlatFile. > If you write a dedicated fasta parser that could run much faster. > > > # I would write C codes for a very simple operation on NGS data. > > # That will run 100 times faster. > > # When the necessary operation is a bit more complex, I would use ruby. > much much more time consuming.... > > > Perhaps the target is to process about 20 ~ 1000 M reads with each of > > them having 25 to 150 nt for the time being. > > Thats quite different situation compared to process the > > ~ 0.1 M entry of 50-10000 aa residues or nucleotides in a genome. > > The relative cost for the entry separation becomes higher compared with the > sequence > > processing within the entry. > > > So, it may worth to write NGS dedicated parser rather than sticking on > FlatFile. > > > Playing around the benchmark, about the half of execution time is for > garbage collection, > > and the order of execution is somewhat relevant to get the number. > > If you can suppress unnecessary object generation to the minimum and > disable GC, that will > > perhaps make it run much faster. 
> > > $ diff -u benchfasta benchfasta-hash-GC-b > --- benchfasta 2010-08-13 21:45:21.000000000 +0900 > +++ benchfasta-hash-GC-b 2010-08-14 11:53:20.000000000 +0900 > @@ -34,6 +34,9 @@ > end > end > > +count = ARGV.shift.to_i > +count = 2 if count == nil > + > data = < >5_gECOjxwXsN1/1 > AACGNTACTATCGTGACATGCGTGCAGGATTACAC > @@ -57,12 +60,23 @@ > TTATGATGCGCGTGGCGAACGTGAACGCGTTAAAC > DATA > > -io1 = StringIO.new(data) > -io2 = StringIO.new(data) > +io0 = StringIO.new(data * count) > +io1 = StringIO.new(data * count) > +io2 = StringIO.new(data * count) > +fasta0 = Fasta.new(io0) > fasta1 = Fasta.new(io1) > fasta2 = Bio::FastaFormat.open(io2) > > -Benchmark.bm(5) do |timer| > - timer.report('Hack') { 10_000_000.times { fasta1.each { |entry1| } } } > - timer.report('Bio') { 10_000_000.times { fasta2.each { |entry2| } } } > +hash0=Hash.new > +hash1=Hash.new > +hash2=Hash.new > + > +Benchmark.bm(8) do |timer| > + GC.enable;GC.start;GC.disable; > + timer.report('Bio') { i=0; fasta2.each { |entry2| i+=1; > hash2[entry2.definition + i.to_s] = entry2.seq[2..25]} } > + hash2 = nil; GC.enable;GC.start;GC.disable; > + timer.report('Hack') { i=0; fasta0.each { |entry1| i+=1; > hash0[entry1[:seq_name] + i.to_s] = entry1[:seq][2..25]} } > + hash0 = nil; GC.enable;GC.start;GC.disable; > + timer.report('Hack-seq') { i=0; fasta1.each { |entry1| i+=1; > hash1[entry1[:seq_name] + i.to_s] = > Bio::Sequence::NA.new(entry1[:seq])[2..25]} } > + hash1 = nil; GC.enable;GC.start;GC.disable; > end > > > > > > -- > > Tomoaki NISHIYAMA > > > Advanced Science Research Center, > > Kanazawa University, > > 13-1 Takara-machi, > > Kanazawa, 920-0934, Japan > > > On 2010/08/13, at 23:51, Martin Asser Hansen wrote: > > >> As you stated 3 times faster with the hack, you may be already using ruby >> 1.9. >> >> > I am using ruby 1.9.1, and I am using a fairly fast computer, but I am > actually questioning the quality of the code. 
> > >> Anyway, I think 13 or 18 seconds for 100 M entry is fast enough and this >> part will not be the bottle neck of any application. >> How fast do you need it be? >> > > Mind you that the Benchmark is performed on StringIO data, and that the > script does not touch the disk! In a real test, it will be much slower! I > did not test on real data and more speed issues may surface (I have no idea > how Ruby's file buffering compares to Perl's, performance-wise). > > I was contemplating porting some Biopieces (www.biopieces.org) from Perl > to Ruby. Biopieces are used for everyday slicing and dicing of all sorts of > biological data in a very simple and flexible manner. While Biopieces are > not as fast as dedicated scripts, they are fast enough > for convenient analysis of NGS data, but I will not accept a +300% speed > penalty (i.e. read_fasta). > > I have been trying to get an overview of the code in Bio::FastaFormat, but > I find it hard to read (that could be because I am not used to Ruby, or OO > for that matter). It strikes me that the FastaFormat class does a number of > irrelevant things like subparsing comments when not strictly necessary. In > fact, the FASTA format actually don't use comments prefixed with # > (semicolon can be used, but I will strongly advice against it since most > software don't deal with it). Also, parsing is dependent on the record > separator being '\n' - that could be considered a bug. There seem to be an > overuse of substitutions, transliterations and regex matching. How about > keeping it nice an tight? ala: > > SEP = $/ > FASTA_REGEX = /\s*>?([^#{SEP}]+)#{SEP}(.+)>?$/ > > def get_entry > block = @io.gets(SEP + ">") > return nil if block.nil? 
> > if block =~ FASTA_REGEX > seq_name = $1 > seq = $2 > else > raise "Bad FASTA entry->#{block}" > end > > seq.gsub!(/\s/, "") > end > > > Cheers, > > > Martin > > >> -- >> Tomoaki NISHIYAMA >> >> Advanced Science Research Center, >> Kanazawa University, >> 13-1 Takara-machi, >> Kanazawa, 920-0934, Japan >> >> > > From tomoakin at kenroku.kanazawa-u.ac.jp Sat Aug 14 14:52:57 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Sat, 14 Aug 2010 23:52:57 +0900 Subject: [BioRuby] Benchmarking FASTA file parsing In-Reply-To: References: <363FADBF-2A69-480B-AA61-AFAA2013FB0D@kenroku.kanazawa-u.ac.jp> <46A4F739-340A-4104-AB45-A9E69281D7B3@kenroku.kanazawa-u.ac.jp> Message-ID: <21C908D0-B7B9-4A4A-B47F-13CE26E2B0E6@kenroku.kanazawa-u.ac.jp> Hi, > Subparsing of the defline should not be done for generic parsing, > but rather when needed. To my understanding, the subparsing of the definition occurs only when needed, ie when entry_id, identifiers, gi, etc. is called, in current code. If only definition is called, it is not further parsed. > Without any experience I think that disabling GC sounds like a bad > idea. Yes, completely disabling GC is generally a bad idea. A code running with 6 Gbytes mem could eat 60 Gbytes or more... (Yes it seems two or three-fold faster if there is enough memory, but this trade-off is too extreme). But since the GC dominates the running time, it is an important target for optimization. http://en.wikibooks.org/wiki/Ruby_Programming/Reference/Objects/GC A more moderate reduction of GC frequency will surely speedup the process 30~50%. Admittedly, explicit GC.disable, GC.start make the code ugly. Trial on tweaking the parameters in gc.c did only a minor (~5%) improvement. Careful coding to reduce object creation might contribute to speed up. One of questionable variable is @entry_overrun Is this variable and attr_reader :entry_overrun really required yet or is just a trace of older code? 
> Goto-San Since there is only two other variables, which is apparently essential, this third variable might account significant speed reduction. A tests suggested again removing 3 lines can improve 5%. (Unfortunately not 50%) diff --git a/lib/bio/db/fasta.rb b/lib/bio/db/fasta.rb index 7ea668e..95f3be4 100644 --- a/lib/bio/db/fasta.rb +++ b/lib/bio/db/fasta.rb @@ -111,7 +111,7 @@ module Bio # The seuqnce lines in text. attr_accessor :data - attr_reader :entry_overrun +# attr_reader :entry_overrun # Stores the comment and sequence information from one entry of the # FASTA format string. If the argument contains more than one @@ -119,8 +119,8 @@ module Bio def initialize(str) @definition = str[/.*/].sub(/^>/, '').strip # 1st line @data = str.sub(/.*/, '') # rests - @data.sub!(/^>.*/m, '') # remove trailing entries for sure - @entry_overrun = $& +# @data.sub!(/^>.*/m, '') # remove trailing entries for sure +# @entry_overrun = $& end # Returns the stored one entry as a FASTA format. (same as to_s) -- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan On 2010/08/14, at 17:21, Martin Asser Hansen wrote: > I was hoping for an easy to use generic FASTA parser in bioruby. I > think it would be confusing with two flavors of parsers for short/ > long entries. Also, I think that with a minor effort the existing > parser could be optimized a fair bit. Subparsing of the defline > should not be done for generic parsing, but rather when needed. > Without any experience I think that disabling GC sounds like a bad > idea. Of cause C is always faster, but Ruby is nicer. > > > Cheers, > > > Martin > > > > > On Sat, Aug 14, 2010 at 5:42 AM, Tomoaki NISHIYAMA > wrote: > Hi, > >> Mind you that the Benchmark is performed on StringIO data, and >> that the script does not touch the disk! >> In a real test, it will be much slower! 
> > My initial thought was :- That's true, and therefore the pure > parser part which runs fairly fast with O(N) > is not the primary problem. If you push the entries into a hash it > will be much more time consuming. > > But realized that its two orders slower... (due to the benchmark > code as pointed out by goto-san) > 20 min for 100 M could be painful. > >> I have been trying to get an overview of the code in >> Bio::FastaFormat, >> but I find it hard to read (that could be because I am not used to >> Ruby, or OO for that matter). > > > For one thing, the Bio::FastaFormat is designed to work with > Bio::FlatFile. > If you write a dedicated fasta parser that could run much faster. > > # I would write C codes for a very simple operation on NGS data. > # That will run 100 times faster. > # When the necessary operation is a bit more complex, I would use > ruby. much much more time consuming.... > > Perhaps the target is to process about 20 ~ 1000 M reads with each of > them having 25 to 150 nt for the time being. > Thats quite different situation compared to process the > ~ 0.1 M entry of 50-10000 aa residues or nucleotides in a genome. > The relative cost for the entry separation becomes higher compared > with the sequence > processing within the entry. > > So, it may worth to write NGS dedicated parser rather than sticking > on FlatFile. > > Playing around the benchmark, about the half of execution time is > for garbage collection, > and the order of execution is somewhat relevant to get the number. > If you can suppress unnecessary object generation to the minimum > and disable GC, that will > perhaps make it run much faster. 
> > $ diff -u benchfasta benchfasta-hash-GC-b > --- benchfasta 2010-08-13 21:45:21.000000000 +0900 > +++ benchfasta-hash-GC-b 2010-08-14 11:53:20.000000000 +0900 > @@ -34,6 +34,9 @@ > end > end > > +count = ARGV.shift.to_i > +count = 2 if count == nil > + > data = < >5_gECOjxwXsN1/1 > AACGNTACTATCGTGACATGCGTGCAGGATTACAC > @@ -57,12 +60,23 @@ > TTATGATGCGCGTGGCGAACGTGAACGCGTTAAAC > DATA > > -io1 = StringIO.new(data) > -io2 = StringIO.new(data) > +io0 = StringIO.new(data * count) > +io1 = StringIO.new(data * count) > +io2 = StringIO.new(data * count) > +fasta0 = Fasta.new(io0) > fasta1 = Fasta.new(io1) > fasta2 = Bio::FastaFormat.open(io2) > > -Benchmark.bm(5) do |timer| > - timer.report('Hack') { 10_000_000.times { fasta1.each { | > entry1| } } } > - timer.report('Bio') { 10_000_000.times { fasta2.each { | > entry2| } } } > +hash0=Hash.new > +hash1=Hash.new > +hash2=Hash.new > + > +Benchmark.bm(8) do |timer| > + GC.enable;GC.start;GC.disable; > + timer.report('Bio') { i=0; fasta2.each { |entry2| i+=1; hash2 > [entry2.definition + i.to_s] = entry2.seq[2..25]} } > + hash2 = nil; GC.enable;GC.start;GC.disable; > + timer.report('Hack') { i=0; fasta0.each { |entry1| i+=1; hash0 > [entry1[:seq_name] + i.to_s] = entry1[:seq][2..25]} } > + hash0 = nil; GC.enable;GC.start;GC.disable; > + timer.report('Hack-seq') { i=0; fasta1.each { |entry1| i+=1; > hash1[entry1[:seq_name] + i.to_s] = Bio::Sequence::NA.new(entry1 > [:seq])[2..25]} } > + hash1 = nil; GC.enable;GC.start;GC.disable; > end > > > > > > > -- > Tomoaki NISHIYAMA > > Advanced Science Research Center, > Kanazawa University, > 13-1 Takara-machi, > Kanazawa, 920-0934, Japan > > > On 2010/08/13, at 23:51, Martin Asser Hansen wrote: > >> >> As you stated 3 times faster with the hack, you may be already >> using ruby 1.9. >> >> >> I am using ruby 1.9.1, and I am using a fairly fast computer, but >> I am actually questioning the quality of the code. 
>> >> Anyway, I think 13 or 18 seconds for 100 M entry is fast enough >> and this >> part will not be the bottle neck of any application. >> How fast do you need it be? >> >> Mind you that the Benchmark is performed on StringIO data, and >> that the script does not touch the disk! In a real test, it will >> be much slower! I did not test on real data and more speed issues >> may surface (I have no idea how Ruby's file buffering compares to >> Perl's, performance-wise). >> >> I was contemplating porting some Biopieces (www.biopieces.org) >> from Perl to Ruby. Biopieces are used for everyday slicing and >> dicing of all sorts of biological data in a very simple and >> flexible manner. While Biopieces are not as fast as dedicated >> scripts, they are fast enough for convenient analysis of NGS data, >> but I will not accept a +300% speed penalty (i.e. read_fasta). >> >> I have been trying to get an overview of the code in >> Bio::FastaFormat, but I find it hard to read (that could be >> because I am not used to Ruby, or OO for that matter). It strikes >> me that the FastaFormat class does a number of irrelevant things >> like subparsing comments when not strictly necessary. In fact, the >> FASTA format actually don't use comments prefixed with # >> (semicolon can be used, but I will strongly advice against it >> since most software don't deal with it). Also, parsing is >> dependent on the record separator being '\n' - that could be >> considered a bug. There seem to be an overuse of substitutions, >> transliterations and regex matching. How about keeping it nice an >> tight? ala: >> >> SEP = $/ >> FASTA_REGEX = /\s*>?([^#{SEP}]+)#{SEP}(.+)>?$/ >> >> def get_entry >> block = @io.gets(SEP + ">") >> return nil if block.nil? 
>> >> if block =~ FASTA_REGEX >> seq_name = $1 >> seq = $2 >> else >> raise "Bad FASTA entry->#{block}" >> end >> >> seq.gsub!(/\s/, "") >> end >> >> >> Cheers, >> >> >> Martin >> >> -- >> Tomoaki NISHIYAMA >> >> Advanced Science Research Center, >> Kanazawa University, >> 13-1 Takara-machi, >> Kanazawa, 920-0934, Japan >> >> > > From ngoto at gen-info.osaka-u.ac.jp Sun Aug 15 05:58:35 2010 From: ngoto at gen-info.osaka-u.ac.jp (Naohisa GOTO) Date: Sun, 15 Aug 2010 14:58:35 +0900 Subject: [BioRuby] Benchmarking FASTA file parsing In-Reply-To: <21C908D0-B7B9-4A4A-B47F-13CE26E2B0E6@kenroku.kanazawa-u.ac.jp> References: <363FADBF-2A69-480B-AA61-AFAA2013FB0D@kenroku.kanazawa-u.ac.jp> <46A4F739-340A-4104-AB45-A9E69281D7B3@kenroku.kanazawa-u.ac.jp> <21C908D0-B7B9-4A4A-B47F-13CE26E2B0E6@kenroku.kanazawa-u.ac.jp> Message-ID: <20100815055836.0C8AB1CBC3EC@idnmail.gen-info.osaka-u.ac.jp> On Sat, 14 Aug 2010 23:52:57 +0900 Tomoaki NISHIYAMA wrote: > To my understanding, the subparsing of the definition occurs only > when needed, ie when entry_id, identifiers, gi, etc. is called, in > current code. > If only definition is called, it is not further parsed. Right. > Careful coding to reduce object creation might contribute to speed up. > One of questionable variable is > @entry_overrun > Is this variable and attr_reader :entry_overrun > really required yet or is just a trace of older code? > Goto-San The @entry_overrun has two means. 1. Adjustment of file position. The separator used to read a fasta entry is "\n>", but the ">" should be belonging to the next entry. To adjust this, the last ">" is stored to @entry_overrun. The Bio::FlatFile wrapper will use the content of @entry_overrun in the next time of reading. In addition, it is used to get proper file positions when indexing fasta files. 2. Integrity of data format In Bio::FastaFormat.new(str), if the str contains two or more fasta data, the sequence could be wring with naive parser. 
For example, for ">test1\nATATATAT\n>test2\nGCGCGCGC\n", the sequence could be "ATATAT>test2GCGCGCGC" without the cutting process of the trailing entries. In addition, to store the removed element to @entry_overrun may help debugging of user's code and might prevent data loss. Indeed, in the current code, both 1 and 2 are done at a time with the lines @data.sub!(/^>.*/m, '') # remove trailing entries for sure @entry_overrun = $& The 1 might be skipped when reading all data at a time without file positions. The 2 might be skipped if we can ignore such kind of mistakes to give two or more entries to the Bio::FastaFormat.new. -- Naohisa Goto ngoto at gen-info.osaka-u.ac.jp / ng at bioruby.org From tomoakin at kenroku.kanazawa-u.ac.jp Sun Aug 15 06:19:03 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Sun, 15 Aug 2010 15:19:03 +0900 Subject: [BioRuby] Benchmarking FASTA file parsing In-Reply-To: <20100815055836.0C8AB1CBC3EC@idnmail.gen-info.osaka-u.ac.jp> References: <363FADBF-2A69-480B-AA61-AFAA2013FB0D@kenroku.kanazawa-u.ac.jp> <46A4F739-340A-4104-AB45-A9E69281D7B3@kenroku.kanazawa-u.ac.jp> <21C908D0-B7B9-4A4A-B47F-13CE26E2B0E6@kenroku.kanazawa-u.ac.jp> <20100815055836.0C8AB1CBC3EC@idnmail.gen-info.osaka-u.ac.jp> Message-ID: Hi, > 1. Adjustment of file position. > The separator used to read a fasta entry is "\n>", but the ">" > should be belonging to the next entry. To adjust this, the last > ">" is stored to @entry_overrun. The Bio::FlatFile wrapper will > use the content of @entry_overrun in the next time of reading. I first thought as such, but I could not find the code that actually use it. Could you specify where it is used? I could find only several places defining it. Maybe there was a reformation of Flatfile buffering to use ungets but not entry_overrun? 
#at bioruby/lib/bio/ $ grep entry_overrun * */* */*/* */*/*/* db/fasta.rb:# attr_reader :entry_overrun db/fasta.rb:# @entry_overrun = $& db/fastq.rb: # entry_overrun db/fastq.rb: attr_reader :entry_overrun db/fastq.rb: @entry_overrun = sc.rest db/nbrf.rb: @entry_overrun = $& db/nbrf.rb: attr_reader :entry_overrun db/newick.rb: @entry_overrun = $1 db/newick.rb: attr_reader :entry_overrun appl/blast/format0.rb: @entry_overrun = $1 appl/blast/format0.rb: attr_reader :entry_overrun appl/blast/rpsblast.rb: @entry_overrun = $1 appl/fasta/format10.rb: @entry_overrun = overruns.join('') appl/fasta/format10.rb: attr_reader :entry_overrun appl/spidey/report.rb: @entry_overrun = $1 appl/spidey/report.rb: attr_reader :entry_overrun -- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan From ngoto at gen-info.osaka-u.ac.jp Sun Aug 15 06:39:22 2010 From: ngoto at gen-info.osaka-u.ac.jp (Naohisa GOTO) Date: Sun, 15 Aug 2010 15:39:22 +0900 Subject: [BioRuby] Benchmarking FASTA file parsing In-Reply-To: References: <363FADBF-2A69-480B-AA61-AFAA2013FB0D@kenroku.kanazawa-u.ac.jp> <46A4F739-340A-4104-AB45-A9E69281D7B3@kenroku.kanazawa-u.ac.jp> <21C908D0-B7B9-4A4A-B47F-13CE26E2B0E6@kenroku.kanazawa-u.ac.jp> <20100815055836.0C8AB1CBC3EC@idnmail.gen-info.osaka-u.ac.jp> Message-ID: <20100815063922.843861CBC3C8@idnmail.gen-info.osaka-u.ac.jp> On Sun, 15 Aug 2010 15:19:03 +0900 Tomoaki NISHIYAMA wrote: > Hi, > > > 1. Adjustment of file position. > > The separator used to read a fasta entry is "\n>", but the ">" > > should be belonging to the next entry. To adjust this, the last > > ">" is stored to @entry_overrun. The Bio::FlatFile wrapper will > > use the content of @entry_overrun in the next time of reading. > > I first thought as such, but I could not find the code that actually > use it. Could you specify where it is used? The "adjustment of file position" have already been replaced by a constant DELIMITER_OVERRUN. 
I'm sorry I've forgotten things. So, currently, only the role of the 2 is expected. -- Naohisa Goto ngoto at gen-info.osaka-u.ac.jp / ng at bioruby.org From pjotr.public14 at thebird.nl Mon Aug 16 11:22:56 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Mon, 16 Aug 2010 13:22:56 +0200 Subject: [BioRuby] GFF3 In-Reply-To: <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> Message-ID: <20100816112256.GA4509@thebird.nl> On Fri, Aug 13, 2010 at 12:12:05AM +0900, Naohisa GOTO wrote: > > Now, to print FASTA I now do: > > > > gff3.sequences.each do | item | > > print item.to_fasta(item.entry_id, 70) > > end > > gff3.sequences.each do | item | > print item.output(:fasta) > end As it stands, it is not a direct replacement as the entry_id gets printed twice. Also when I replace line 971: @sequences.collect { |s| s.to_fasta(s.entry_id, 70) }.join('') with the output(:fasta) equivalent, the unit test in BioRuby fails, because the ID becomes 'test01 test01' instead of 'test01'. Does it mean we have to modify 's', to get the proper output? Pj. From pjotr.public14 at thebird.nl Mon Aug 16 12:05:30 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Mon, 16 Aug 2010 14:05:30 +0200 Subject: [BioRuby] GFF3 In-Reply-To: <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> Message-ID: <20100816120530.GA4996@thebird.nl> The GFF3 module parses GFF3 files and stores them in memory. We could do something with that data. Most people will want to fetch mRNA and CDS's. BioPerl has some similar facility. How about adding a module GFF3::Sequence with methods that fetch the mRNA (splicing) variants and CDS's that belong to an ID? Or do you think an implementation would be ambiguous? Or is that already in there? I must admit I can't find it. 
Pj. From ngoto at gen-info.osaka-u.ac.jp Mon Aug 16 12:17:36 2010 From: ngoto at gen-info.osaka-u.ac.jp (Naohisa GOTO) Date: Mon, 16 Aug 2010 21:17:36 +0900 Subject: [BioRuby] GFF3 In-Reply-To: <20100816112256.GA4509@thebird.nl> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816112256.GA4509@thebird.nl> Message-ID: <20100816121736.A60491CBC3C2@idnmail.gen-info.osaka-u.ac.jp> On Mon, 16 Aug 2010 13:22:56 +0200 Pjotr Prins wrote: > On Fri, Aug 13, 2010 at 12:12:05AM +0900, Naohisa GOTO wrote: > > > Now, to print FASTA I now do: > > > > > > gff3.sequences.each do | item | > > > print item.to_fasta(item.entry_id, 70) > > > end > > > > gff3.sequences.each do | item | > > print item.output(:fasta) > > end > > As it stands, it is not a direct replacement as the entry_id gets > printed twice. Also when I replace This is considered to be a bug. The bug was reported previously, but was postponed. http://lists.open-bio.org/pipermail/bioruby/2009-April/000897.html > line 971: @sequences.collect { |s| s.to_fasta(s.entry_id, 70) }.join('') > > with the output(:fasta) equivalent, the unit test in BioRuby fails, > because the ID becomes 'test01 test01' instead of 'test01'. > > Does it mean we have to modify 's', to get the proper output? 
> The workaround is s.output(:fasta, :header=>s.entry_id, :width=>70) Naohisa Goto ngoto at gen-info.osaka-u.ac.jp / ng at bioruby.org From ngoto at gen-info.osaka-u.ac.jp Mon Aug 16 12:40:28 2010 From: ngoto at gen-info.osaka-u.ac.jp (Naohisa GOTO) Date: Mon, 16 Aug 2010 21:40:28 +0900 Subject: [BioRuby] GFF3 In-Reply-To: <20100816120530.GA4996@thebird.nl> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> Message-ID: <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> On Mon, 16 Aug 2010 14:05:30 +0200 Pjotr Prins wrote: > The GFF3 module parses GFF3 files and stores them in memory. We could > do something with that data. Most people will want to fetch mRNA and > CDS's. BioPerl has some similar facility. > > How about adding a module GFF3::Sequence with methods that fetch the > mRNA (splicing) variants and CDS's that belong to an ID? Or do you > think an implementation would be ambiguous? Currently, the GFF parser in BioRuby is currently based on lines. To treat relations in the lines in a GFF3 file will be needed. A simple implementation would be to store all relations into a graph (or graphs) and then extracting information. BTW, for extracting sequence, I prefer GFF3::SequenceCutter or ExtractSequence rather than GFF3::Sequence. > Or is that already in there? I must admit I can't find it. No. Naohisa Goto ngoto at gen-info.osaka-u.ac.jp / ng at bioruby.org From tomoakin at kenroku.kanazawa-u.ac.jp Mon Aug 16 12:38:56 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Mon, 16 Aug 2010 21:38:56 +0900 Subject: [BioRuby] GFF3 In-Reply-To: <20100816120530.GA4996@thebird.nl> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> Message-ID: <3DF42522-7CF4-4AF2-8C84-CBC15EC098E6@kenroku.kanazawa-u.ac.jp> Hi, It sounds me nice to have such feature. 
Maybe first thing to do for the implementation is to make a function to collect exons of a mRNA, sort and write join addresses, and finally pass to Sequence::NA.splicing. One thing needed before implementation is how to specify the sequence. GFF files sometimes come without the sequence part and sometimes with the sequence. When the sequence is accompanied within the file it's simple: just use it. If the sequence is not accompanied, maybe we should pass a Hash of Bio::Sequence::NA? When a hash is supplied for a GFF with Sequence should the hash override the accompanied sequence? Some more exceptional thing would be handling circular genomes with some annotation spanning over the cut site of the genome. -- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan On 2010/08/16, at 21:05, Pjotr Prins wrote: > The GFF3 module parses GFF3 files and stores them in memory. We could > do something with that data. Most people will want to fetch mRNA and > CDS's. BioPerl has some similar facility. > > How about adding a module GFF3::Sequence with methods that fetch the > mRNA (splicing) variants and CDS's that belong to an ID? Or do you > think an implementation would be ambiguous? > > Or is that already in there? I must admit I can't find it. > > Pj. 
> > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby > From tomoakin at kenroku.kanazawa-u.ac.jp Mon Aug 16 12:38:56 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Mon, 16 Aug 2010 21:38:56 +0900 Subject: [BioRuby] GFF3 In-Reply-To: <20100816120530.GA4996@thebird.nl> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> Message-ID: <3DF42522-7CF4-4AF2-8C84-CBC15EC098E6@kenroku.kanazawa-u.ac.jp> Hi, It sounds me nice to have such feature. Maybe first thing to do for the implementation is to make a function to collect exons of a mRNA, sort and write join addresses, and finally pass to Sequence::NA.splicing. One thing needed before implementation is how to specify the sequence. GFF files sometimes come without the sequence part and sometimes with the sequence. When the sequence is accompanied within the file it's simple: just use it. If the sequence is not accompanied, maybe we should pass a Hash of Bio::Sequence::NA? When a hash is supplied for a GFF with Sequence should the hash override the accompanied sequence? Some more exceptional thing would be handling circular genomes with some annotation spanning over the cut site of the genome. -- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan On 2010/08/16, at 21:05, Pjotr Prins wrote: > The GFF3 module parses GFF3 files and stores them in memory. We could > do something with that data. Most people will want to fetch mRNA and > CDS's. BioPerl has some similar facility. > > How about adding a module GFF3::Sequence with methods that fetch the > mRNA (splicing) variants and CDS's that belong to an ID? Or do you > think an implementation would be ambiguous? > > Or is that already in there? 
I must admit I can't find it. > > Pj. > > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby > From ktym at hgc.jp Mon Aug 16 13:40:25 2010 From: ktym at hgc.jp (Toshiaki Katayama) Date: Mon, 16 Aug 2010 22:40:25 +0900 Subject: [BioRuby] We have 2 and 1/2 days of spare time. Can we help? In-Reply-To: References: Message-ID: <269DBF19-140B-43B4-A42B-9915350A2C9D@hgc.jp> Hi George, Oops, I just realized that I missed to read this thread. Sorry. ;) I'm very surprised and excited to know that you guys will organize a BioHackathon-like event in Kenya. Few hours ago, I finished a Skype meeting with the organizers and learnt about the plan described at http://rsg-ea-bio-sprint-2010.wikispaces.com/ (design of the poster is awesome, good job! :) Please use this mailing list to distill pre-hackathon preparations. We often asked "what can I contribute to the BioRuby project?" but it is usually difficult to assign a target and mentoring on it as the project itself has been self-organized. (The Google Summer of Code will be an exception. Mentors are working really hard and I really appreciate about that.) However, I take this opportunity to suggest several potential targets: (in addition to 1. finishing the newly introduced BioRuby plugin system and 2. supporting Semantic Web technologies on which we have been working on since the 3rd DBCLS BioHackathon http://hackathon3.dbcls.jp/ and Codefest 2010 http://www.open-bio.org/wiki/Codefest_2010) === interfaces to external resources: * API for Ensemble (suggested by Jan Aerts) * API for UCSC (also suggested by Jan) * API for BioMart, InterMine etc. * API for Semantic Web resources (BioGateway, Bio2RDF etc.) 
-- this is what we tried during the last BioHackathon === modern bioinformatics: * handling NGS data - wrappers and parsers for tools and libraries * Proteomics * Immunoinformatics - immunology prediction servers (described by someone during the Skype meeting) === classical bioinformatics: * Do benchmark for existing BioRuby modules to find bottlenecks for improving performance (good example was posted by Martin Hansen http://lists.open-bio.org/pipermail/bioruby/2010-August/001426.html and I think this kind of improvements should be welcomed) * Setting up NCBI's BLAST WWW like interface (with SGE or Cloud backends) is still demanded. People who are working with not-yet-public genome often need to setup this kind of server. How about to create a general Rails plugin using BioRuby which can be easily setup and can perform various sequence similarity search (by BLAST, BLAT, EXONERATE, Bowtie, whatever...) with simple configuration (use DSL to setup target DBs and the computational farm). This project should also target on the downstream processes -- phylogenetic annotations, mapping RNA-Seq data, summarizing statistics, visualization, integration with genome browsers etc. -- and cool UI design, to be chosen as a yet another BLAST+alpha interface. === visualization modules: * BioGraphics (already started by Jan) - genome mapping / comparative genomics? * Interface for Cytoscape - so that we can easily generate beautiful graph visualization within BioRuby (e.g. from Semantic Web data) Note: some samples are already provided by Cytoscape group at http://cytoscape.wodaklab.org/wiki/ScriptingPlugins#Ruby === improving docs: * Writing some tutorials on how you used the BioRuby - this should be done by newbie who will have difficulty with finding solutions on the Internet. They can ask mentors how to solve their problems and summarize the result in HOWTO-like tutorials. 
Some blogs and Wiki pages are the only existing resources as far as I know (and also clearly pointed by Yannick Wurm http://lists.open-bio.org/pipermail/bioruby/2010-July/001373.html) * We should also need to have a document on "how to use GitHub for forking BioRuby", "how to create your own BioRuby plugins" etc. Regards Toshiaki Katayama On 2010/08/10, at 16:37, George Githinji wrote: > Hi all, > The Regional Students Group for Eastern Africa (RSG-EA) is one of the > grass-root level bodies of the International Society for Computational > Biology Student Council (ISCB-SC). The group has membership from ten > countries namely Burundi, Democratic Republic of Congo, Djibouti, > Eritrea, Ethiopia, Kenya, Rwanda, Somalia, Tanzania and Uganda. > Recently we proposed to organize a biohakathon three day event to: > > 1) Learn how to collaborate on bioinformatics programming projects > using open source tools. > 2) Forge an East African bioinformatics programming community. > 3) Contribute a module/code to Bioruby library. > > The event has been sponsored by a grant from ISCB and ILRI/Beca > bioinformatics platform in Nairobi, Kenya. > > We would like to seek for a suitable project work from one of the > developer(s) and the community. The project should ideally be of > beginner to intermediate level difficulty. A third of the participants > will be of intermediate level programming skills with experience from > Java,Python and Perl. while the rest will have beginner level skills. > > We were also wondering whether it would be possible to get one of the > lead contributors to bioruby project to give a short 15-20 minutes > introductory talk to the participants. We have excellent video > conferencing facilities at the ILRI/Beca hub. The event is slated to > take place in late September. 
> > Thank you > > -- > --------------- > Sincerely > George > KEMRI/Wellcome-Trust Research Program > Skype: george_g2 > Blog: http://biorelated.wordpress.com/ > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby From biopython at maubp.freeserve.co.uk Mon Aug 16 13:40:48 2010 From: biopython at maubp.freeserve.co.uk (Peter) Date: Mon, 16 Aug 2010 14:40:48 +0100 Subject: [BioRuby] GFF3 In-Reply-To: <3DF42522-7CF4-4AF2-8C84-CBC15EC098E6@kenroku.kanazawa-u.ac.jp> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <3DF42522-7CF4-4AF2-8C84-CBC15EC098E6@kenroku.kanazawa-u.ac.jp> Message-ID: On Mon, Aug 16, 2010 at 1:38 PM, Tomoaki NISHIYAMA wrote: > > Some more exceptional thing would be handling circular genomes with some > annotation spanning over the cut site of the genome. > Hi all, In case you were not aware, the GFF3 specification was recently (July 2010) updated to explicitly support circular genomes via a new Is_circular flag in the GFF3 attributes field. This also defines how the co-ordinates of features spanning the origin should be defined. 
http://lists.open-bio.org/pipermail/biopython-dev/2010-July/008003.html http://sourceforge.net/mailarchive/message.php?msg_name=5B028E4D-30B2-4DCA-B41A-FF59ABDC4898%40mac.com Regards, Peter From tomoakin at kenroku.kanazawa-u.ac.jp Mon Aug 16 13:52:02 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Mon, 16 Aug 2010 22:52:02 +0900 Subject: [BioRuby] GFF3 In-Reply-To: <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> Message-ID: <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> Hi, > A simple implementation would be to store all relations into a > graph (or graphs) and then extracting information. I recently wrote a program to extract all the mRNAs, but up to the addresses and not to the sequences. http://github.com/tomoakin/Bioruby-use/blob/master/src/gff2easytrack.rb This is not designed to be very general, but might be useful as a starting point. -- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan On 2010/08/16, at 21:40, Naohisa GOTO wrote: > On Mon, 16 Aug 2010 14:05:30 +0200 > Pjotr Prins wrote: > >> The GFF3 module parses GFF3 files and stores them in memory. We could >> do something with that data. Most people will want to fetch mRNA and >> CDS's. BioPerl has some similar facility. >> >> How about adding a module GFF3::Sequence with methods that fetch the >> mRNA (splicing) variants and CDS's that belong to an ID? Or do you >> think an implementation would be ambiguous? > > Currently, the GFF parser in BioRuby is currently based on lines. > To treat relations in the lines in a GFF3 file will be needed. > A simple implementation would be to store all relations into a > graph (or graphs) and then extracting information. 
> > BTW, for extracting sequence, I prefer GFF3::SequenceCutter or > ExtractSequence rather than GFF3::Sequence. > > >> Or is that already in there? I must admit I can't find it. > > No. > > > Naohisa Goto > ngoto at gen-info.osaka-u.ac.jp / ng at bioruby.org > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby > From tomoakin at kenroku.kanazawa-u.ac.jp Mon Aug 16 14:08:09 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Mon, 16 Aug 2010 23:08:09 +0900 Subject: [BioRuby] csfasta parser Message-ID: <5EAE938B-D406-46D1-B274-086F82F25D05@kenroku.kanazawa-u.ac.jp> Hi, I modified fasta.rb to parse csfasta format a modified version of fasta to handle color sequence produced by SOLiD sequencers by Lifetechnologies (Formally Applied Biosystems). The most important difference is that the sequence is a nucleotide followed by colors specified by numbers [0-3]. When the sequencer fail to assign a color it may be represented by a dot ".". The other difference is that mapping location may be added to the definition line without space but separated with comma ",". Thus the entry_id extraction should be based on comma rather than space. In some case, more interest is for the mapping location or entry id itself, and the data is not touched at all. So, I made it to store the entry and definition, but the data is not extracted at initialization but left for lazy evaluation. The code can be found at http://github.com/tomoakin/bioruby/blob/master/lib/bio/db/csfasta.rb Note that naseq etc. is not tested. 
-- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan From sararayburn at gmail.com Mon Aug 16 19:09:18 2010 From: sararayburn at gmail.com (Sara Rayburn) Date: Mon, 16 Aug 2010 14:09:18 -0500 Subject: [BioRuby] [GSoC] final project status Message-ID: Hi everyone, Well, GSoC is finished for this summer. Thanks for a great experience working with Bioruby (especially thanks to Christian and Diana for mentoring). It's been a fun and challenging experience and I'm looking forward to continuing to work on the project beyond the scope of GSoC. Here's a quick rundown of the final status of my project: -- The speciation/duplication inference algorithm is implemented in Bio::Algorithm::SDI for fully binary gene & species trees. There is also an alternative algorithm that will reroot the gene tree to minimize the number of duplications. This is in Bio::Algorithm::SDIR. The more generalized algorithm is implemented in Bio::Algorithm::GSDI, but is unverified and not recommended for general use yet. There are things that I'd like to further work on, including refactoring some of the code and improving my unit tests. Also, I'm going to continue working on verifying the generalized algorithm. Again, thanks for a great summer and a great opportunity! Sara Rayburn From georgkam at gmail.com Tue Aug 17 11:39:45 2010 From: georgkam at gmail.com (George Githinji) Date: Tue, 17 Aug 2010 14:39:45 +0300 Subject: [BioRuby] We have 2 and 1/2 days of spare time. Can we help? In-Reply-To: <269DBF19-140B-43B4-A42B-9915350A2C9D@hgc.jp> References: <269DBF19-140B-43B4-A42B-9915350A2C9D@hgc.jp> Message-ID: Thank you very much Toshiaki. We really appreciated the call and the much advice and helpful conversation that we held. We are distilling on the various ideas and we will update you and the list on what will be most appropriate and achievable for us.
On Mon, Aug 16, 2010 at 4:40 PM, Toshiaki Katayama wrote: > Hi George, > > Oops, I just realized that I missed to read this thread. Sorry. ;) > > I'm very surprised and excited to know that you guys will organize > a BioHackathon-like event in Kenya. > > Few hours ago, I finished a Skype meeting with the organizers > and learnt about the plan described at > > http://rsg-ea-bio-sprint-2010.wikispaces.com/ > (design of the poster is awesome, good job! :) > > Please use this mailing list to distill pre-hackathon preparations. > > We often asked "what can I contribute to the BioRuby project?" but > it is usually difficult to assign a target and mentoring on it > as the project itself has been self-organized. > (The Google Summer of Code will be an exception. Mentors are working > really hard and I really appreciate about that.) > > However, I take this opportunity to suggest several potential targets: > (in addition to 1. finishing the newly introduced BioRuby plugin system > and 2. supporting Semantic Web technologies on which we have been > working on since the 3rd DBCLS BioHackathon http://hackathon3.dbcls.jp/ > and Codefest 2010 http://www.open-bio.org/wiki/Codefest_2010) > > === interfaces to external resources: > > * API for Ensemble (suggested by Jan Aerts) > * API for UCSC (also suggested by Jan) > * API for BioMart, InterMine etc. > * API for Semantic Web resources (BioGateway, Bio2RDF etc.) 
-- this is what we tried during the last BioHackathon > > === modern bioinformatics: > > * handling NGS data - wrappers and parsers for tools and libraries > * Proteomics > * Immunoinformatics - immunology prediction servers (described by someone during the Skype meeting) > > === classical bioinformatics: > > * Do benchmark for existing BioRuby modules to find bottlenecks for improving performance (good example was posted by Martin Hansen http://lists.open-bio.org/pipermail/bioruby/2010-August/001426.html and I think this kind of improvements should be welcomed) > > * Setting up NCBI's BLAST WWW like interface (with SGE or Cloud backends) is still demanded. People who are working with not-yet-public genome often need to setup this kind of server. How about to create a general Rails plugin using BioRuby which can be easily setup and can perform various sequence similarity search (by BLAST, BLAT, EXONERATE, Bowtie, whatever...) with simple configuration (use DSL to setup target DBs and the computational farm). This project should also target on the downstream processes -- phylogenetic annotations, mapping RNA-Seq data, summarizing statistics, visualization, integration with genome browsers etc. -- and cool UI design, to be chosen as a yet another BLAST+alpha interface. > > === visualization modules: > > * BioGraphics (already started by Jan) - genome mapping / comparative genomics? > > * Interface for Cytoscape - so that we can easily generate beautiful graph visualization within BioRuby (e.g. from Semantic Web data) Note: some samples are already provided by Cytoscape group at http://cytoscape.wodaklab.org/wiki/ScriptingPlugins#Ruby > > === improving docs: > > * Writing some tutorials on how you used the BioRuby - this should be done by newbie who will have difficulty with finding solutions on the Internet. They can ask mentors how to solve their problems and summarize the result in HOWTO-like tutorials. 
Some blogs and Wiki pages are the only existing resources as far as I know (and also clearly pointed by Yannick Wurm http://lists.open-bio.org/pipermail/bioruby/2010-July/001373.html) > > * We should also need to have a document on "how to use GitHub for forking BioRuby", "how to create your own BioRuby plugins" etc. > > Regards > Toshiaki Katayama > > > On 2010/08/10, at 16:37, George Githinji wrote: > >> Hi all, >> The Regional Students Group for Eastern Africa (RSG-EA) is one of the >> grass-root level bodies of the International Society for Computational >> Biology Student Council (ISCB-SC). The group has membership from ten >> countries namely Burundi, Democratic Republic of Congo, Djibouti, >> Eritrea, Ethiopia, Kenya, Rwanda, Somalia, Tanzania and Uganda. >> Recently we proposed to organize a biohakathon three day event to: >> >> 1) Learn how to collaborate on bioinformatics programming projects >> using open source tools. >> 2) Forge an East African bioinformatics programming community. >> 3) Contribute a module/code to Bioruby library. >> >> The event has been sponsored by a grant from ISCB and ILRI/Beca >> bioinformatics platform in Nairobi, Kenya. >> >> We would like to seek for a suitable project work from one of the >> developer(s) and the community. The project should ideally be of >> beginner to intermediate level difficulty. A third of the participants >> will be of intermediate level programming skills with experience from >> Java,Python and Perl. while the rest will have beginner level skills. >> >> We were also wondering whether it would be possible to get one of the >> lead contributors to bioruby project to give a short 15-20 minutes >> introductory talk to the participants. We have excellent video >> conferencing facilities at the ILRI/Beca hub. The event is slated to >> take place in late September.
>> >> Thank you >> >> -- >> --------------- >> Sincerely >> George >> KEMRI/Wellcome-Trust Research Program >> Skype: george_g2 >> Blog: http://biorelated.wordpress.com/ >> _______________________________________________ >> BioRuby Project - http://www.bioruby.org/ >> BioRuby mailing list >> BioRuby at lists.open-bio.org >> http://lists.open-bio.org/mailman/listinfo/bioruby > > -- --------------- Sincerely George KEMRI/Wellcome-Trust Research Program Skype: george_g2 Blog: http://biorelated.wordpress.com/ From pjotr.public14 at thebird.nl Tue Aug 17 16:38:37 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Tue, 17 Aug 2010 18:38:37 +0200 Subject: [BioRuby] GFF3 In-Reply-To: <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> Message-ID: <20100817163837.GA15726@thebird.nl> On Mon, Aug 16, 2010 at 10:52:02PM +0900, Tomoaki NISHIYAMA wrote: > Hi, > >> A simple implementation would be to store all relations into a >> graph (or graphs) and then extracting information. > > I recently wrote a program to extract all the mRNAs, but up to the > addresses > and not to the sequences. > > http://github.com/tomoakin/Bioruby-use/blob/master/src/gff2easytrack.rb > > This is not designed to be very general, but might be useful as a > starting point. Thanks for the nice example. It shows how you can filter GFF without storing everything in memory. Naturally that does not work for extracting all transcripts as GFF does not guarantee ordered data. Still, a good example. What I also like is that there is almost no coupling with other BioRuby modules (other than embedded Fasta). We should keep it that way. Question, have we ever seen GFF files that are not ordered? 
It makes so much sense to keep genes and their components together. I think it is somewhere argued that you can share parts between genes, but how often does that happen - and would they be far apart in the file? Even Lincoln states that you can split GFF files. That would not work if data is not together. I am thinking we can assume that related data comes with each other. This means we only have to cache a limited number records in memory to resolve dependencies. I'll probably write something in the coming week, as I need it. I'll design it to be a BioRuby plugin. For the time being. Pj. From tomoakin at kenroku.kanazawa-u.ac.jp Wed Aug 18 01:09:06 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Wed, 18 Aug 2010 10:09:06 +0900 Subject: [BioRuby] GFF3 In-Reply-To: <20100817163837.GA15726@thebird.nl> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> <20100817163837.GA15726@thebird.nl> Message-ID: Hi, > Thanks for the nice example. It shows how you can filter GFF without > storing everything in memory. Naturally that does not work for > extracting all transcripts as GFF does not guarantee ordered data. I think the code is not dependent on the order of the GFF file. All the exon is stored in an array holding the exons that belong to the mRNA. The output order of the exons is dependent on the input GFF, but in this case the output order is not required to be specified. I could insert exonary.sort{ some ordering rule } before exonary.each{} if the output order matters. (Since this program was not to persist a long time and there was sufficient memory, I didn't care anything to keep the memory usage low). > I am thinking we can assume that related data comes with each other. The nature of gene/genome is not so simple. 
You can read on trans-splicing. So, unlinked parts of the genome can form a mature mRNA and protein thereof. If these parts are collected close in GFF file, then positional order is not preserved. If the GFF is sorted by the position, the parts are in distant position. > share parts between genes, For, shared parts between genes, it is frequent that micro RNA genes are on introns or exons of other genes. Also, for compact genomes, there is quite a number of genes having overlapping UTRs. On chloroplast genomes, even overlapped CDS are known. > Question, have we ever seen GFF files that are not ordered? I've never seen an unordered GFF file, but there could be different orders. 1. The lines are just sorted according to the location. 2. genes are ordered and the parts of the gene comes together. For example the arabidopsis GFF file looks like this and you can see that the feature itself is not ordered that protein 3760 comes earlier than exon 3631. Chr1 TAIR9 gene 3631 5899 . + . ID=AT1G01010;Note=protein_coding_gene;Name=AT1G01010 Chr1 TAIR9 mRNA 3631 5899 . + . ID=AT1G01010.1;Parent=AT1G01010;Name=AT1G01010.1;Index=1 Chr1 TAIR9 protein 3760 5630 . + . ID=AT1G01010.1-Protein;Name=AT1G01010.1;Derives_from=AT1G01010.1 Chr1 TAIR9 exon 3631 3913 . + . Parent=AT1G01010.1 Chr1 TAIR9 five_prime_UTR 3631 3759 . + . Parent=AT1G01010.1 Chr1 TAIR9 CDS 3760 3913 . + 0 Parent=AT1G01010.1,AT1G01010.1-Protein; Chr1 TAIR9 exon 3996 4276 . + . Parent=AT1G01010.1 Chr1 TAIR9 CDS 3996 4276 . + 2 Parent=AT1G01010.1,AT1G01010.1-Protein; > It makes so much sense to keep genes and their components together. I think GFF is an exchange format rather than to work directly with part of it. The data can be relatively easily stored into a RDB and extracted from it. Index on RDB will allow a fast identification of all feature in a specified region or a gene. That subset is good to work with. 
-- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan On 2010/08/18, at 1:38, Pjotr Prins wrote: > On Mon, Aug 16, 2010 at 10:52:02PM +0900, Tomoaki NISHIYAMA wrote: >> Hi, >> >>> A simple implementation would be to store all relations into a >>> graph (or graphs) and then extracting information. >> >> I recently wrote a program to extract all the mRNAs, but up to the >> addresses >> and not to the sequences. >> >> http://github.com/tomoakin/Bioruby-use/blob/master/src/ >> gff2easytrack.rb >> >> This is not designed to be very general, but might be useful as a >> starting point. > > Thanks for the nice example. It shows how you can filter GFF without > storing everything in memory. Naturally that does not work for > extracting all transcripts as GFF does not guarantee ordered data. > > Still, a good example. What I also like is that there is almost no > coupling with other BioRuby modules (other than embedded Fasta). We > should keep it that way. > > Question, have we ever seen GFF files that are not ordered? It makes > so much sense to keep genes and their components together. I think it > is somewhere argued that you can share parts between genes, but how > often does that happen - and would they be far apart in the file? > Even Lincoln states that you can split GFF files. That would not work > if data is not together. > > I am thinking we can assume that related data comes with each other. > This means we only have to cache a limited number records in memory > to resolve dependencies. > > I'll probably write something in the coming week, as I need it. I'll > design it to be a BioRuby plugin. For the time being. > > Pj. 
> From pjotr.public14 at thebird.nl Wed Aug 18 06:12:11 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Wed, 18 Aug 2010 08:12:11 +0200 Subject: [BioRuby] GFF3 In-Reply-To: References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> <20100817163837.GA15726@thebird.nl> Message-ID: <20100818061211.GA18479@thebird.nl> On Wed, Aug 18, 2010 at 10:09:06AM +0900, Tomoaki NISHIYAMA wrote: >> Thanks for the nice example. It shows how you can filter GFF without >> storing everything in memory. Naturally that does not work for >> extracting all transcripts as GFF does not guarantee ordered data. > > I think the code is not dependent on the order of the GFF file. Sorry, I was not talking about your script. I merely stated your example shows *how* it is possible to filter data. My sentence was ambiguous. > I've never seen an unordered GFF file, but there could be different > orders. > 1. The lines are just sorted according to the location. > 2. genes are ordered and the parts of the gene comes together. > For example the arabidopsis GFF file looks like this and you can see > that the > feature itself is not ordered that protein 3760 comes earlier than exon > 3631. Thanks for that. In that case I can store the seekpos of every gene/location and use disk access instead. The way GFF is normally orgainized would hardly incur a penalty. I do the same with my BigBio FASTA reader. I want to get away from loading everything in memory. We can not assume that memory expansion keeps up with data load. It is fine as an 'optimization', but we should not take it for granted. > I think GFF is an exchange format rather than to work directly with > part of it. The data can be relatively easily stored into a RDB and > extracted from it. 
Index on RDB will allow a fast identification of > all feature in a specified region or a gene. That subset is good to > work with. I avoid RDB (assuming you mean RDBMS, and not the Rwanda Development Board), until BioRuby comes with an RDBMS that can be used in a transparent fashion. You can not assume every user has an RDBMS readily available. Pj. From tomoakin at kenroku.kanazawa-u.ac.jp Wed Aug 18 08:21:24 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Wed, 18 Aug 2010 17:21:24 +0900 Subject: [BioRuby] GFF3 In-Reply-To: <20100818061211.GA18479@thebird.nl> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> <20100817163837.GA15726@thebird.nl> <20100818061211.GA18479@thebird.nl> Message-ID: <60EE922B-709C-485A-9268-0D943145E40C@kenroku.kanazawa-u.ac.jp> Hi, Here is how the trans-splicing gene rps12 looks like in the genomic context. http://www.ncbi.nlm.nih.gov/nuccore/7525012?report=graph&v=60000:170000 > In that case I can store the seekpos of every > gene/location and use disk access instead. It should be safe if you scan the data and store the position in the GFF file of first and last record of every gene. > We can not assume that memory expansion keeps up with data load. > It is fine as an 'optimization', but we should not take it for > granted. The gene number within a genome doesn't grow so much. So, the memory becomes problematic only if you are dealing with multiple genomes or more fine features. Saving memory is another kind of optimization. It's good if we can achieve to do with less memory. I just don't care much as far as the problem fit in the memory I can use and run in a reasonable time. 
> I avoid RDB (assuming you mean RDBMS, and not the Rwanda Development > Board), until BioRuby comes with an RDBMS that can be used in a > transparent fashion. You can not assume every user has an RDBMS > readily > available. Oh, I meant relational database. It is for flexibility. Its just easier for me to use a RDBMS than to think of a new way to do without it. So, its just expression of my way. If you are always to query from the gene name, then gene name to seekpos index will be sufficient. But, then I would rather consider to store the parsed data object in PStore than to parse the GFF file again. -- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan From pjotr.public14 at thebird.nl Wed Aug 18 09:59:37 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Wed, 18 Aug 2010 11:59:37 +0200 Subject: [BioRuby] GFF3 In-Reply-To: <60EE922B-709C-485A-9268-0D943145E40C@kenroku.kanazawa-u.ac.jp> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> <20100817163837.GA15726@thebird.nl> <20100818061211.GA18479@thebird.nl> <60EE922B-709C-485A-9268-0D943145E40C@kenroku.kanazawa-u.ac.jp> Message-ID: <20100818095937.GA23171@thebird.nl> On Wed, Aug 18, 2010 at 05:21:24PM +0900, Tomoaki NISHIYAMA wrote: > Hi, > > Here is how the trans-splicing gene rps12 looks like in the genomic > context. > http://www.ncbi.nlm.nih.gov/nuccore/7525012?report=graph&v=60000:170000 Cool, huh :) > The gene number within a genome doesn't grow so much. So, the > memory becomes problematic only if you are dealing with multiple > genomes or more fine features. Yup. That is where we are heading. 100K people project, for example. > Saving memory is another kind of optimization. 
It's good if we can > achieve to do with less memory. I just don't care much as far as > the problem fit in the memory I can use and run in a reasonable > time. Sure, but I think it is short sighted to load everything in RAM when we think in more general BioRuby terms. > Oh, I meant relational database. It is for flexibility. > Its just easier for me to use a RDBMS than to think of a new way > to do without it. So, its just expression of my way. Sure, feel free to use an RDBMS. Just don't expect everyone to. > If you are always to query from the gene name, then gene name to > seekpos index will be sufficient. But, then I would rather consider > to store the parsed data object in PStore than to parse the GFF file > again. PStore is cool too. Pj. From rob.syme at gmail.com Sun Aug 22 06:22:18 2010 From: rob.syme at gmail.com (Rob Syme) Date: Sun, 22 Aug 2010 14:22:18 +0800 Subject: [BioRuby] BioSQL development Message-ID: Is there a particular person who has taken charge of the BioSQL part of Bioruby? I just want confirmation that I'm not using it in completely the wrong way. Are the classes designed so that you generate models for an app (a rails app, for example) that inherit from the Bio::SQL::whatever? eg: $ rails g model Biodatabase name:string authority:string description:text and then in app/model/biodatabase.rb you change: class Biodatabase < ActiveRecord::Base end into: class Biodatabase < Bio::SQL::Biodatabase # which inherits ActiveRecord::Base end If I get a handle on this, I'd be happy to write it up for http://bioruby.open-bio.org/wiki/Tutorial#BioSQL Thanks for all the work by the dev team. Much appreciated - I use bioruby almost every day. 
-r Rob Syme From ktym at hgc.jp Sun Aug 22 07:44:33 2010 From: ktym at hgc.jp (Toshiaki Katayama) Date: Sun, 22 Aug 2010 16:44:33 +0900 Subject: [BioRuby] BioSQL development In-Reply-To: References: Message-ID: <2967C144-A2CF-4FF7-81AA-F0B25E55B9C0@hgc.jp> Hi Rob, Thank you for your will to volunteer for the documentation. Raoul is the current maintainer of the BioSQL module in BioRuby, but I heard that he is on vacation for now. Cheers, Toshiaki Katayama On 2010/08/22, at 15:22, Rob Syme wrote: > Is there a particular person who has taken charge of the BioSQL part of > Bioruby? > I just want confirmation that I'm not using it in completely the wrong way. > > Are the classes designed so that you generate models for an app (a rails > app, for example) that inherit from the Bio::SQL::whatever? eg: > > $ rails g model Biodatabase name:string authority:string description:text > > and then in app/model/biodatabase.rb you change: > class Biodatabase < ActiveRecord::Base > end > > into: > class Biodatabase < Bio::SQL::Biodatabase # which inherits > ActiveRecord::Base > end > > If I get a handle on this, I'd be happy to write it up for > http://bioruby.open-bio.org/wiki/Tutorial#BioSQL > > Thanks for all the work by the dev team. Much appreciated - I use bioruby > almost every day. > -r > > Rob Syme > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby From ju at ncoffee.de Sun Aug 22 08:50:07 2010 From: ju at ncoffee.de (Julian Nordt) Date: Sun, 22 Aug 2010 10:50:07 +0200 Subject: [BioRuby] BioSQL development In-Reply-To: References: Message-ID: Hi Rob, I tried to use the BioSQL part of Bioruby for a webapplication based on rails. With the idea of giving users the capability to upload genomes to the application. 
However the mapping between BioSQL <-> GFF3 is(?)/were not figured out completely, which was one of the reasons why I changed my implementation to CHADO scheme. In regard to your question I did use BioSQL in the following way: def openBioConnection Bio::SQL.establish_connection(:adapter => "mysql", :host => "localhost", :username => "xxx", :password => "xxx", :database => "biosql_development") end def get_sequence(bio_entry_id) openBioConnection() if !bio_entry_id.blank? return Bio::SQL::Biosequence.find_by_bioentry_id(bio_entry_id) end end def setSeqFeatName(id,name) seq_feat = Bio::SQL::Seqfeature.find_by_seqfeature_id(id) Bio::SQL::Seqfeature.update(seq_feat.id, :display_name => name) end (! This does not mean that it is the correct way or intended way of usage as I'm new to ruby/bioruby !) I had to make some small modifications to bio-1.4.0\lib\bio\io\biosql\ar-biosql.rb and biosql.rb, unfortunately I do not remember where exactly. Most likely some changes in regard to the connection adapter and some changes to tablenames or similar stuff. As you can see in ar-biosql.rb all the "classes" for the biosql orm are already defined, so one does not have to define the classes himself. Hope this helps, Julian Nordt On Sun, 22 Aug 2010 08:22:18 +0200, Rob Syme wrote: > Is there a particular person who has taken charge of the BioSQL part of > Bioruby? > I just want confirmation that I'm not using it in completely the wrong > way. > > Are the classes designed so that you generate models for an app (a rails > app, for example) that inherit from the Bio::SQL::whatever?
eg: > > $ rails g model Biodatabase name:string authority:string description:text > > and then in app/model/biodatabase.rb you change: > class Biodatabase < ActiveRecord::Base > end > > into: > class Biodatabase < Bio::SQL::Biodatabase # which inherits > ActiveRecord::Base > end > > If I get a handle on this, I'd be happy to write it up for > http://bioruby.open-bio.org/wiki/Tutorial#BioSQL > > Thanks for all the work by the dev team. Much appreciated - I use bioruby > almost every day. > -r > > Rob Syme > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby > -- Using Opera's revolutionary email client: http://www.opera.com/mail/ From rob.syme at gmail.com Sun Aug 22 09:23:52 2010 From: rob.syme at gmail.com (Rob Syme) Date: Sun, 22 Aug 2010 17:23:52 +0800 Subject: [BioRuby] BioSQL development In-Reply-To: References: Message-ID: Thanks Toshiaki and Julian, Mapping features from gff may well become an issue. Until Raoul gets back, I might try and keep the data under the Chado schema as suggested by Julian. *If* I can get it clean enough, I'll offer it up for incorporation into bioruby. -r On Sun, Aug 22, 2010 at 4:50 PM, Julian Nordt wrote: > Hi Rob, > > I tried to use the BioSQL part of Bioruby for a webapplication based on > rails. With the idea of giving users the capability to upload genomes to the > application. However the mapping between BioSQL <-> GFF3 is(?)/were not > figured out completly, which was one of the reasons why I changed my > implementation to CHADO scheme. > > > In regard to your question I did use BioSQL in the following way: > > > def openBioConnection > Bio::SQL.establish_connection(:adapter => "mysql", > :host => "localhost", > :username => "xxx", > :password => "xxx", > :database => "biosql_development") > end > > > def get_sequence(bio_entry_id) > openBioConnection() > if !bio_entry_id.blank? 
> return Bio::SQL::Biosequence.find_by_bioentry_id(bio_entry_id) > end > end > > > def setSeqFeatName(id,name) > seq_feat = Bio::SQL::Seqfeature.find_by_seqfeature_id(id) > Bio::SQL::Seqfeature.update(seq_feat.id, :display_name => name) > end > > (! This does not mean that it is the correct way or intended way of usage > as I'm new to ruby/bioruby !) > > I had to make some small modifications to > bio-1.4.0\lib\bio\io\biosql\ar-biosql.rb and biosql.rb, unfortunately I do > not remember where exactly. Mostlikely some changes in regard to the > connection adapter and some changes to tablenames or similiar stuff. > > As you can see in ar-biosql.rb all the "classes" for the biosql orm are > already defined, so one does not have to define the classes himself. > > Hope this helps, > > Julian Nordt > > > > > > > On Sun, 22 Aug 2010 08:22:18 +0200, Rob Syme wrote: > > Is there a particular person who has taken charge of the BioSQL part of >> Bioruby? >> I just want confirmation that I'm not using it in completely the wrong >> way. >> >> Are the classes designed so that you generate models for an app (a rails >> app, for example) that inherit from the Bio::SQL::whatever? eg: >> >> $ rails g model Biodatabase name:string authority:string description:text >> >> and then in app/model/biodatabase.rb you change: >> class Biodatabase < ActiveRecord::Base >> end >> >> into: >> class Biodatabase < Bio::SQL::Biodatabase # which inherits >> ActiveRecord::Base >> end >> >> If I get a handle on this, I'd be happy to write it up for >> http://bioruby.open-bio.org/wiki/Tutorial#BioSQL >> >> Thanks for all the work by the dev team. Much appreciated - I use bioruby >> almost every day. 
>> -r >> >> Rob Syme >> _______________________________________________ >> BioRuby Project - http://www.bioruby.org/ >> BioRuby mailing list >> BioRuby at lists.open-bio.org >> http://lists.open-bio.org/mailman/listinfo/bioruby >> >> > > -- > Using Opera's revolutionary email client: http://www.opera.com/mail/ > From ju at ncoffee.de Sun Aug 22 10:30:03 2010 From: ju at ncoffee.de (Julian Nordt) Date: Sun, 22 Aug 2010 12:30:03 +0200 Subject: [BioRuby] Fwd: Re: BioSQL development In-Reply-To: References: Message-ID: Hi Rob, I just wanted to point that there are for sure people that have a greater experience in regard to the discussed db-schemas and might give you better advice on this topic than I'm able to do. As pointed out I have just recently started to work with bioruby. Hence it might be a good idea to consider further opinions on this topic. However I did wanted to reply to you request, as your scenario sounded somewhat similar to one of the projects I have to work on. -- Julian On Sun, 22 Aug 2010 11:23:52 +0200, Rob Syme wrote: > Thanks Toshiaki and Julian, > > Mapping features from gff may well become an issue. Until Raoul gets > back, I > might try and keep the data under the Chado schema as suggested by > Julian. > *If* I can get it clean enough, I'll offer it up for incorporation into > bioruby. > > -r > > > On Sun, Aug 22, 2010 at 4:50 PM, Julian Nordt wrote: > >> Hi Rob, >> >> I tried to use the BioSQL part of Bioruby for a webapplication based on >> rails. With the idea of giving users the capability to upload genomes >> to the >> application. However the mapping between BioSQL <-> GFF3 is(?)/were not >> figured out completly, which was one of the reasons why I changed my >> implementation to CHADO scheme. 
>> >> >> In regard to your question I did use BioSQL in the following way: >> >> >> def openBioConnection >> Bio::SQL.establish_connection(:adapter => "mysql", >> :host => "localhost", >> :username => "xxx", >> :password => "xxx", >> :database => "biosql_development") >> end >> >> >> def get_sequence(bio_entry_id) >> openBioConnection() >> if !bio_entry_id.blank? >> return Bio::SQL::Biosequence.find_by_bioentry_id(bio_entry_id) >> end >> end >> >> >> def setSeqFeatName(id,name) >> seq_feat = Bio::SQL::Seqfeature.find_by_seqfeature_id(id) >> Bio::SQL::Seqfeature.update(seq_feat.id, :display_name => name) >> end >> >> (! This does not mean that it is the correct way or intended way of >> usage >> as I'm new to ruby/bioruby !) >> >> I had to make some small modifications to >> bio-1.4.0\lib\bio\io\biosql\ar-biosql.rb and biosql.rb, unfortunately I >> do >> not remember where exactly. Mostlikely some changes in regard to the >> connection adapter and some changes to tablenames or similiar stuff. >> >> As you can see in ar-biosql.rb all the "classes" for the biosql orm are >> already defined, so one does not have to define the classes himself. >> >> Hope this helps, >> >> Julian Nordt >> >> >> >> >> >> >> On Sun, 22 Aug 2010 08:22:18 +0200, Rob Syme wrote: >> >> Is there a particular person who has taken charge of the BioSQL part of >>> Bioruby? >>> I just want confirmation that I'm not using it in completely the wrong >>> way. >>> >>> Are the classes designed so that you generate models for an app (a >>> rails >>> app, for example) that inherit from the Bio::SQL::whatever? 
eg: >>> >>> $ rails g model Biodatabase name:string authority:string >>> description:text >>> >>> and then in app/model/biodatabase.rb you change: >>> class Biodatabase < ActiveRecord::Base >>> end >>> >>> into: >>> class Biodatabase < Bio::SQL::Biodatabase # which inherits >>> ActiveRecord::Base >>> end >>> >>> If I get a handle on this, I'd be happy to write it up for >>> http://bioruby.open-bio.org/wiki/Tutorial#BioSQL >>> >>> Thanks for all the work by the dev team. Much appreciated - I use >>> bioruby >>> almost every day. >>> -r >>> >>> Rob Syme >>> _______________________________________________ >>> BioRuby Project - http://www.bioruby.org/ >>> BioRuby mailing list >>> BioRuby at lists.open-bio.org >>> http://lists.open-bio.org/mailman/listinfo/bioruby >>> >>> >> >> -- >> Using Opera's revolutionary email client: http://www.opera.com/mail/ >> -- Using Opera's revolutionary email client: http://www.opera.com/mail/ From hlapp at drycafe.net Sun Aug 22 14:02:01 2010 From: hlapp at drycafe.net (Hilmar Lapp) Date: Sun, 22 Aug 2010 10:02:01 -0400 Subject: [BioRuby] Fwd: Re: BioSQL development In-Reply-To: References: Message-ID: Is the issue with GFF3 in the Bioruby to BioSQL mapping, or is somehow in the BioSQL schema? I recall there was a thread on GFF recently which I wasn't able to follow, so if the answer is in that thread and isn't easy to sum up here, just point me there. -hilmar On Aug 22, 2010, at 6:30 AM, Julian Nordt wrote: > Hi Rob, > > I just wanted to point that there are for sure people that have a > greater > experience in regard to the discussed db-schemas and might give you > better > advice on this topic than I'm able to do. As pointed out I have just > recently started to work with bioruby. Hence it might be a good idea > to > consider further opinions on this topic. > > However I did wanted to reply to you request, as your scenario sounded > somewhat similar to one of the projects I have to work on. 
> > -- Julian > > On Sun, 22 Aug 2010 11:23:52 +0200, Rob Syme > wrote: > >> Thanks Toshiaki and Julian, >> >> Mapping features from gff may well become an issue. Until Raoul >> gets back, I >> might try and keep the data under the Chado schema as suggested by >> Julian. >> *If* I can get it clean enough, I'll offer it up for incorporation >> into >> bioruby. >> >> -r >> >> >> On Sun, Aug 22, 2010 at 4:50 PM, Julian Nordt wrote: >> >>> Hi Rob, >>> >>> I tried to use the BioSQL part of Bioruby for a webapplication >>> based on >>> rails. With the idea of giving users the capability to upload >>> genomes to the >>> application. However the mapping between BioSQL <-> GFF3 is(?)/ >>> were not >>> figured out completly, which was one of the reasons why I changed my >>> implementation to CHADO scheme. >>> >>> >>> In regard to your question I did use BioSQL in the following way: >>> >>> >>> def openBioConnection >>> Bio::SQL.establish_connection(:adapter => "mysql", >>> :host => "localhost", >>> :username => "xxx", >>> :password => "xxx", >>> :database => "biosql_development") >>> end >>> >>> >>> def get_sequence(bio_entry_id) >>> openBioConnection() >>> if !bio_entry_id.blank? >>> return Bio::SQL::Biosequence.find_by_bioentry_id(bio_entry_id) >>> end >>> end >>> >>> >>> def setSeqFeatName(id,name) >>> seq_feat = Bio::SQL::Seqfeature.find_by_seqfeature_id(id) >>> Bio::SQL::Seqfeature.update(seq_feat.id, :display_name => name) >>> end >>> >>> (! This does not mean that it is the correct way or intended way >>> of usage >>> as I'm new to ruby/bioruby !) >>> >>> I had to make some small modifications to >>> bio-1.4.0\lib\bio\io\biosql\ar-biosql.rb and biosql.rb, >>> unfortunately I do >>> not remember where exactly. Mostlikely some changes in regard to the >>> connection adapter and some changes to tablenames or similiar stuff. 
>>> >>> As you can see in ar-biosql.rb all the "classes" for the biosql >>> orm are >>> already defined, so one does not have to define the classes himself. >>> >>> Hope this helps, >>> >>> Julian Nordt >>> >>> >>> >>> >>> >>> >>> On Sun, 22 Aug 2010 08:22:18 +0200, Rob Syme >>> wrote: >>> >>> Is there a particular person who has taken charge of the BioSQL >>> part of >>>> Bioruby? >>>> I just want confirmation that I'm not using it in completely the >>>> wrong >>>> way. >>>> >>>> Are the classes designed so that you generate models for an app >>>> (a rails >>>> app, for example) that inherit from the Bio::SQL::whatever? eg: >>>> >>>> $ rails g model Biodatabase name:string authority:string >>>> description:text >>>> >>>> and then in app/model/biodatabase.rb you change: >>>> class Biodatabase < ActiveRecord::Base >>>> end >>>> >>>> into: >>>> class Biodatabase < Bio::SQL::Biodatabase # which inherits >>>> ActiveRecord::Base >>>> end >>>> >>>> If I get a handle on this, I'd be happy to write it up for >>>> http://bioruby.open-bio.org/wiki/Tutorial#BioSQL >>>> >>>> Thanks for all the work by the dev team. Much appreciated - I use >>>> bioruby >>>> almost every day. 
>>>> -r >>>> >>>> Rob Syme >>>> _______________________________________________ >>>> BioRuby Project - http://www.bioruby.org/ >>>> BioRuby mailing list >>>> BioRuby at lists.open-bio.org >>>> http://lists.open-bio.org/mailman/listinfo/bioruby >>>> >>>> >>> >>> -- >>> Using Opera's revolutionary email client: http://www.opera.com/mail/ >>> > > > -- > Using Opera's revolutionary email client: http://www.opera.com/mail/ > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby -- =========================================================== : Hilmar Lapp -:- Durham, NC -:- hlapp at drycafe dot net : =========================================================== From rob.syme at gmail.com Sun Aug 22 14:17:45 2010 From: rob.syme at gmail.com (Rob Syme) Date: Sun, 22 Aug 2010 22:17:45 +0800 Subject: [BioRuby] Fwd: Re: BioSQL development In-Reply-To: References: Message-ID: I've had a look around and a pretty solid mapping seems to be available: http://www.biosql.org/wiki/Annotation_Mapping#GFF3 Blue collar bioinformatics gave it a shot here: http://bcbio.wordpress.com/2009/02/22/exploring-bioperl-genbank-to-gff-mapping/ -r On 22 Aug 2010 22:02, "Hilmar Lapp" wrote: Is the issue with GFF3 in the Bioruby to BioSQL mapping, or is somehow in the BioSQL schema? I recall there was a thread on GFF recently which I wasn't able to follow, so if the answer is in that thread and isn't easy to sum up here, just point me there. -hilmar On Aug 22, 2010, at 6:30 AM, Julian Nordt wrote: > Hi Rob, > > I just wanted to point that there ... -- =========================================================== : Hilmar Lapp -:- Durham, NC -:- hlapp at drycafe dot net : =========================================================== _______________________________________________ BioRuby Project - http://www.bioruby.org/ BioRu... 
From ju at ncoffee.de Sun Aug 22 15:17:44 2010 From: ju at ncoffee.de (Julian Nordt) Date: Sun, 22 Aug 2010 17:17:44 +0200 Subject: [BioRuby] Fwd: Re: BioSQL development In-Reply-To: References: Message-ID: One more thing in regard to the mapping between BioSQL and GFF3: I tried to follow the mapping given by the biosql wiki and blue collar bioinformatics. The mapping is acceptable in the sense that you can store *most* or even all (?) of the features that GFF3 offers. The further I got within the development, though, the less clear things became to me, especially in terms of the "attribute" column. If you compare the table at the biosql wiki (for the attribute column) with the one at blue collar bioinformatics, you will notice that there are keywords that occur in one table but not in the other. Not to mention the todos on the wiki regarding the "standard" columns. I haven't looked through blue collar's code in that much detail, though; maybe the answer is given there. However I wrote a small library that managed to store most - but not all - of the given information of the GFF3-files correctly to BioSQL. There were some points where the mapping has been unclear to me and where I stored the given information where I thought it would fit best. The fact that I had chosen a standard db schema in order to avoid any ambiguity, together with the performance issues I experienced with MYSQL+Rails (not related to BioSQL) in the project, was enough for me to switch to CHADO backed by POSTGRES. The documentation regarding CHADO is in my opinion richer and, most importantly, one can follow gmod_bulk_load_gff3.pl for the mapping relatively easily, since it is well documented. I would very much welcome other opinions on the topic, especially in combination with the use of web applications.
-- Julian On Sun, 22 Aug 2010 16:17:45 +0200, Rob Syme wrote: > I've had a look around and a pretty solid mapping seems to be available: > http://www.biosql.org/wiki/Annotation_Mapping#GFF3 > > Blue collar bioinformatics gave it a shot here: > http://bcbio.wordpress.com/2009/02/22/exploring-bioperl-genbank-to-gff-mapping/ > > -r > > On 22 Aug 2010 22:02, "Hilmar Lapp" wrote: > Is the issue with GFF3 in the Bioruby to BioSQL mapping, or is somehow in > the BioSQL schema? > > I recall there was a thread on GFF recently which I wasn't able to > follow, > so if the answer is in that thread and isn't easy to sum up here, just > point > me there. > > -hilmar > > > > On Aug 22, 2010, at 6:30 AM, Julian Nordt wrote: > >> Hi Rob, >> >> I just wanted to point that there ... -- Using Opera's revolutionary email client: http://www.opera.com/mail/ From pjotr.public14 at thebird.nl Mon Aug 23 12:16:16 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Mon, 23 Aug 2010 14:16:16 +0200 Subject: [BioRuby] GFF3 In-Reply-To: <20100818095937.GA23171@thebird.nl> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> <20100817163837.GA15726@thebird.nl> <20100818061211.GA18479@thebird.nl> <60EE922B-709C-485A-9268-0D943145E40C@kenroku.kanazawa-u.ac.jp> <20100818095937.GA23171@thebird.nl> Message-ID: <20100823121616.GC2223@thebird.nl> Hi, I am in the process of providing GFF3 support for extracting mRNA, exons and CDSs from a GFF file. The BioRuby plugin is at: http://github.com/pjotrp/bioruby-gff3-plugin and the writeup is at: http://thebird.nl/bioruby/BioRuby_GFF3.html You are invited to comment on its contents. The first genome I am trying has over 600Mb of data, which, sadly, won't fit in a 2Gb RAM Thinkpad. 
I could use a large memory server or database, but that I consider cheating ;). BTW I am not surprised GFF3 support in, for example, BioSQL is patchy. The GFF3 standard is somewhat loosely defined, and open to interpretation. Not that it necessarily is a bad thing, though it is probably impossible to write the all encompassing parser. See the writeup. Pj. From ktym at hgc.jp Thu Aug 26 06:04:04 2010 From: ktym at hgc.jp (Toshiaki Katayama) Date: Thu, 26 Aug 2010 15:04:04 +0900 Subject: [BioRuby] BioRuby paper is just published Message-ID: Dear all, After 10 years of development, the BioRuby paper is finally published in the Bioinformatics journal. The article is open access, so please take a look. BioRuby: Bioinformatics software for the Ruby programming language Naohisa Goto, Pjotr Prins, Mitsuteru Nakao, Raoul Bonnal, Jan Aerts and Toshiaki Katayama Bioinformatics 2010; doi: 10.1093/bioinformatics/btq475 Abstract: http://bioinformatics.oxfordjournals.org/cgi/content/abstract/btq475 PDF: http://bioinformatics.oxfordjournals.org/cgi/reprint/btq475 For the future publication, please cite this paper when you use the BioRuby library for your work. :-) We sincerely thank all contributors (http://bioruby.open-bio.org/wiki/Contributors) so far. We are very sorry that we could not include all of your names in the manuscript due to the space limitation. I'd like to take this opportunity to thank Pjotr Prins who has been lead this happen by hosting regular Skype meetings and worked very hard for drafting and editing the manuscript, as a joint first author. I also thank DBCLS (Database Center for Life Science, Japan) for giving us several chances to meet each other by hosting the DBCLS BioHackathons (http://hackathon3.dbcls.jp http://www.ncbi.nlm.nih.gov/pubmed/20727200). Draft of the bioruby paper was emerged during the hackathons, and that's why we have only 6 authors in this publication. I ask your kind understanding on this.
The BioRuby project was originally started at the KEGG laboratory in Kyoto University and some resources are now hosted by Human Genome Center in Tokyo University, so I'd like to express my appreciation to these two institutes as well. Additionally, I also thank IPA (Information-technology Promotion Agency Japan) for 1 year grant in 2005 which greatly extended our motivation for further developments in coming years. Best Regards, Toshiaki Katayama From biopython at maubp.freeserve.co.uk Thu Aug 26 08:24:57 2010 From: biopython at maubp.freeserve.co.uk (Peter) Date: Thu, 26 Aug 2010 09:24:57 +0100 Subject: [BioRuby] BioRuby paper is just published In-Reply-To: References: Message-ID: On Thu, Aug 26, 2010 at 7:04 AM, Toshiaki Katayama wrote: > > Dear all, > > After 10 years of development, the BioRuby paper is finally published in the > Bioinformatics journal. The article is open access, so please take a look. > > BioRuby: Bioinformatics software for the Ruby programming language > Naohisa Goto, Pjotr Prins, Mitsuteru Nakao, Raoul Bonnal, Jan Aerts > and Toshiaki Katayama > Bioinformatics 2010; doi: 10.1093/bioinformatics/btq475 > > Abstract: > http://bioinformatics.oxfordjournals.org/cgi/content/abstract/btq475 > > PDF: > http://bioinformatics.oxfordjournals.org/cgi/reprint/btq475 > > For the future publication, please cite this paper when you use the BioRuby > library for your work. :-) > > We sincerely thank all contributors (http://bioruby.open-bio.org/wiki/Contributors) > so far. We are very sorry that we could not include all of your names in the > manuscript due to the space limitation. > > I'd like to take this opportunity to thank Pjotr Prins who has been lead this happen > by hosting regular Skype meetings and worked very hard for drafting and editing > the manuscript, as a joint first author. 
> > I also thank DBCLS (Database Center for Life Science, Japan) for giving us > several chances to meet each other by hosting the DBCLS BioHackathons > (http://hackathon3.dbcls.jp http://www.ncbi.nlm.nih.gov/pubmed/20727200). > Draft of the bioruby paper was emerged during the hackathons, and that's > why we have only 6 authors in this publication. I ask your kind understanding > on this. > > The BioRuby project was originally started at the KEGG laboratory in Kyoto > University and some resources are now hosted by Human Genome Center in > Tokyo University, so I'd like to express my appreciation to these two institutes > as well. Additionally, I also thank IPA (Information-technology Promotion > Agency Japan) for 1 year grant in 2005 which greatly extended our > motivation for further developments in coming years. > > Best Regards, > Toshiaki Katayama > Congratulation Katayama-san, Pjotr, and the rest of the team. This is excellent news. Peter @ Biopython P.S. Do either of you have an account on the OBF news server? Posting an announcement there (even just this email) would be great: http://news.open-bio.org/news/category/obf-projects/bioruby/ Please email me (or OBF support) if you need help with access. From pjotr.public14 at thebird.nl Thu Aug 26 17:53:44 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Thu, 26 Aug 2010 19:53:44 +0200 Subject: [BioRuby] BioRuby paper is just published In-Reply-To: References: Message-ID: <20100826175344.GA10231@thebird.nl> Thanks Peter! I think this paper is an important milestone for BioRuby, and the Bio* projects in general. I think we took care to write the paper in such a way that it will help popularise the concept of OSS Bio* projects, and attract new developers to BioRuby, BioPython, BioJava and others. Just to show what it means, during this year's GSoC running up, someone (I won't mention names) had the gall to say that BioRuby was nowhere and not worth supporting, since we had no peer reviewed publication. 
Doh! Someone needed OSS explained. I guess that argument is buried now. What shines on any Bio* project shines on the others. We look forward to many cross Bio* collaborations. Thanks Peter, Brad, Hilmar, Chris and all others, for being above project interests, and supporting all Bio* projects. In true sportsmanship. I think we have a pretty and attractive paper, which anyone can take to his or her supervisor. That was the goal. Hut hut hut, go OBF! And go BioRuby, BioPython, BioPerl, and BioJava! Pj. On Thu, Aug 26, 2010 at 09:24:57AM +0100, Peter wrote: > On Thu, Aug 26, 2010 at 7:04 AM, Toshiaki Katayama wrote: > > After 10 years of development, the BioRuby paper is finally published in the > > Bioinformatics journal. The article is open access, so please take a look. From rutgeraldo at gmail.com Thu Aug 26 19:23:45 2010 From: rutgeraldo at gmail.com (Rutger Vos) Date: Thu, 26 Aug 2010 20:23:45 +0100 Subject: [BioRuby] BioRuby paper is just published In-Reply-To: <20100826175344.GA10231@thebird.nl> References: <20100826175344.GA10231@thebird.nl> Message-ID: > What shines on any Bio* project shines on the others. We look forward > to many cross Bio* collaborations. Hear, hear. -- Dr. Rutger A. Vos School of Biological Sciences Philip Lyle Building, Level 4 University of Reading Reading RG6 6BX United Kingdom Tel: +44 (0) 118 378 7535 http://www.nexml.org http://rutgervos.blogspot.com From cjfields at illinois.edu Thu Aug 26 19:43:03 2010 From: cjfields at illinois.edu (Chris Fields) Date: Thu, 26 Aug 2010 14:43:03 -0500 Subject: [BioRuby] BioRuby paper is just published In-Reply-To: References: <20100826175344.GA10231@thebird.nl> Message-ID: On Aug 26, 2010, at 2:23 PM, Rutger Vos wrote: >> What shines on any Bio* project shines on the others. We look forward >> to many cross Bio* collaborations. > > Hear, hear. > > -- > Dr. Rutger A. 
Vos > School of Biological Sciences > Philip Lyle Building, Level 4 > University of Reading > Reading > RG6 6BX > United Kingdom > Tel: +44 (0) 118 378 7535 > http://www.nexml.org > http://rutgervos.blogspot.com :) (and, +1) chris From pjotr.public14 at thebird.nl Mon Aug 30 09:10:28 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Mon, 30 Aug 2010 11:10:28 +0200 Subject: [BioRuby] GFF3 In-Reply-To: <60EE922B-709C-485A-9268-0D943145E40C@kenroku.kanazawa-u.ac.jp> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> <20100817163837.GA15726@thebird.nl> <20100818061211.GA18479@thebird.nl> <60EE922B-709C-485A-9268-0D943145E40C@kenroku.kanazawa-u.ac.jp> Message-ID: <20100830091028.GA13445@thebird.nl> On Wed, Aug 18, 2010 at 05:21:24PM +0900, Tomoaki NISHIYAMA wrote: > The gene number within a genome doesn't grow so much. So, the > memory becomes problematic only if you are dealing with multiple > genomes or more fine features. > > Saving memory is another kind of optimization. It's good if we can > achieve to do with less memory. I just don't care much as far as > the problem fit in the memory I can use and run in a reasonable > time. Well, interesting news. The low memory version is actually 50% faster than the InMemory BioRuby edition. On a decent 15Gb server with fast drives (and ruby 1.8.7 (2010-08-16 patchlevel 302) [x86_64-linux]): When I parse a 500Mb GFF3 file, without FASTA information, with BioRuby it consumes 8.5 Gb RAM and takes 20 minutes. My NoCache version takes 1Gb RAM and 13 minutes. On my 2Gb laptop the native BioRuby version never completed (which, in my opinion, is unacceptable). Mine is the naive version - i.e. I only store file seek positions in memory, and reload and parse a record from disk every time. 
The record parser is BioRuby's, not mine. There are no optimizations. Even this is faster than BioRuby's default in memory model - which takes 19 minutes by itself to load and parse the data file; I only use the last 1 minute for digesting information and assembly of sequences. I am not 100% sure why this is, but I know that BioRuby consumes the whole file in memory first, splits it by line and, next, starts parsing GFF. Probably memory allocation and regex are expensive with really large buffers. I think BioRuby needs to provide iterators for on demand parsing of files, rather than big memory blobs. I also do it for FASTA in my BigBio project. It can be done transparently, as both InMemory and NoCache versions use the same algorithm. It will take me some time to complete a write-up on how to approach this for BioRuby, as I am keeping my head low next month. Note that, BioJava provides iteration too, as a default model, though I think their visitor pattern introduces too much complexity. In short: We can use simple Ruby iterators - it will work - and potentially even provides transparent LRU caching. I'll have numbers on that later, as that is my route to speed optimization. I know GFF3 components get reloaded and re-parsed many times. If you want to try, my code is at http://github.com/pjotrp/bioruby-gff3-plugin the current report is at http://thebird.nl/bioruby/BioRuby_GFF3.html Note: you may need my empty line patch for BioRuby to run the InMemory edition (my BioRuby GFF3 branch on github). Pj. From rob.syme at gmail.com Mon Aug 30 09:13:59 2010 From: rob.syme at gmail.com (Rob Syme) Date: Mon, 30 Aug 2010 17:13:59 +0800 Subject: [BioRuby] Chado Mappings and DataMapper vs ActiveRecord Message-ID: I've got some (very) early mappings up for the Chado DB schema (only the cv, general, sequence and pub modules so far). 
http://github.com/robsyme/RubyCHADO I'd be very happy to offer a more final version up for inclusion into BioRuby if others thought that it might be useful. The code is neither clever nor elegant, but it might save somebody else putting together all the relationships/associations in the future. At the moment, the models are based on DataMapper rather than ActiveRecord. DataMapper feels like a better fit to me, but if there are others with strong opinions about ORMs in BioRuby, I'd appreciate the input. -r From tomoakin at kenroku.kanazawa-u.ac.jp Tue Aug 31 02:12:37 2010 From: tomoakin at kenroku.kanazawa-u.ac.jp (Tomoaki NISHIYAMA) Date: Tue, 31 Aug 2010 11:12:37 +0900 Subject: [BioRuby] GFF3 In-Reply-To: <20100830091028.GA13445@thebird.nl> References: <20100812143012.GA31206@thebird.nl> <20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> <20100817163837.GA15726@thebird.nl> <20100818061211.GA18479@thebird.nl> <60EE922B-709C-485A-9268-0D943145E40C@kenroku.kanazawa-u.ac.jp> <20100830091028.GA13445@thebird.nl> Message-ID: <954164B5-931F-426C-8833-3CBCA8BA21D3@kenroku.kanazawa-u.ac.jp> Hi, > When I parse a 500Mb GFF3 file, without FASTA information, with > BioRuby it consumes 8.5 Gb RAM and takes 20 minutes. My NoCache > version takes 1Gb RAM and 13 minutes. This sounds nice! > I am not 100% sure why this is, but I know that BioRuby consumes the > whole file in memory first, splits it by line and, next, starts > parsing GFF. Probably memory allocation and regex are expensive with > really large buffers. During the conversation on "Benchmarking FASTA file parsing", I realized that GC takes quite a lot of time if a large memory is to be used. 
The mark and sweep algorithm in Matz ruby implementation scans over all the allocated objects every time the GC is run (which is not written in ruby code but implicitly runs if not suppressed). Since ruby-1.9.2 seems to have much better GC performance, I am interested how the performance compares in ruby-1.9.2. (I am also interested in GC.disable condition, but this may not work with 15 Gbytes though). Running your script with ruby 1.9 caused several errors, related to case when : removal of colon at the end of when line and changing colon to newline if the colon is not at the end of line was sufficient to run with ruby 1.9.2. (diff at the end) Either one of newline, semicolon, and "then" seems to work. > I only store file seek positions in > memory, and reload and parse a record from disk every time. The other good reason is that the data is perhaps not read from the disk many times but cached by the operating system and retained on memory. So this is not as bad as it sounds. Having 15 Gbytes, presumably 500 Mbytes file need not flushed. 
-- Tomoaki NISHIYAMA Advanced Science Research Center, Kanazawa University, 13-1 Takara-machi, Kanazawa, 920-0934, Japan diff --git a/bin/gff3-fetch b/bin/gff3-fetch index b8d4718..36e61f7 100755 --- a/bin/gff3-fetch +++ b/bin/gff3-fetch @@ -39,17 +39,17 @@ ARGV.each do | fn | gffdb = Bio::GFFbrowser::GFFdb.new(fn,options) gff = gffdb.assembler case gfftype - when 'mrna'||'mRNA' : + when 'mrna'||'mRNA' gff.each_mRNA_seq do | id, seq | puts ">"+id puts seq end - when 'exon': + when 'exon' gff.each_exon_seq do | id, seq | puts ">"+id puts seq end - when 'CDS': + when 'CDS' gff.each_CDS_seq do | id, seq | puts ">"+id puts seq diff --git a/lib/bio/db/gff/gffdb.rb b/lib/bio/db/gff/gffdb.rb index 5325fb9..9540154 100644 --- a/lib/bio/db/gff/gffdb.rb +++ b/lib/bio/db/gff/gffdb.rb @@ -26,7 +26,7 @@ module Bio cache_recs = options[:cache_records] @assembler = case cache_recs - when :cache_none : + when :cache_none NoCache.new(filename, options) else InMemory.new(filename, options) # default diff --git a/lib/bio/db/gff/gffparser.rb b/lib/bio/db/gff/gffparser.rb index 5522d81..e1ed9db 100644 --- a/lib/bio/db/gff/gffparser.rb +++ b/lib/bio/db/gff/gffparser.rb @@ -30,9 +30,12 @@ module Bio info "Added #{rec.feature_type} with component ID #{id}" else case rec.feature_type - when 'mRNA' || 'SO:0000234' : @mrnalist.add(id,rec) - when 'CDS' || 'SO:0000316' : @cdslist.add(id,rec) - when 'exon' || 'SO:0000147' : @exonlist.add(id,rec) + when 'mRNA' || 'SO:0000234' + @mrnalist.add(id,rec) + when 'CDS' || 'SO:0000316' + @cdslist.add(id,rec) + when 'exon' || 'SO:0000147' + @exonlist.add(id,rec) else if !IGNORE_FEATURES.include?(rec.feature_type) @unrecognized_features[rec.feature_type] = true From pjotr.public14 at thebird.nl Tue Aug 31 06:53:09 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Tue, 31 Aug 2010 08:53:09 +0200 Subject: [BioRuby] GFF3 In-Reply-To: <954164B5-931F-426C-8833-3CBCA8BA21D3@kenroku.kanazawa-u.ac.jp> References: 
<20100812151205.758711CBC50D@idnmail.gen-info.osaka-u.ac.jp> <20100816120530.GA4996@thebird.nl> <20100816124029.1A7AE1CBC3C2@idnmail.gen-info.osaka-u.ac.jp> <41003DE0-1C0B-4997-87C3-2FFA877C887C@kenroku.kanazawa-u.ac.jp> <20100817163837.GA15726@thebird.nl> <20100818061211.GA18479@thebird.nl> <60EE922B-709C-485A-9268-0D943145E40C@kenroku.kanazawa-u.ac.jp> <20100830091028.GA13445@thebird.nl> <954164B5-931F-426C-8833-3CBCA8BA21D3@kenroku.kanazawa-u.ac.jp> Message-ID: <20100831065309.GA20904@thebird.nl> On Tue, Aug 31, 2010 at 11:12:37AM +0900, Tomoaki NISHIYAMA wrote: > During the conversation on "Benchmarking FASTA file parsing", I > realized that GC takes quite a lot of time if a large memory is to > be used. The mark and sweep algorithm in Matz ruby implementation > scans over all the allocated objects every time the GC is run (which > is not written in ruby code but implicitly runs if not suppressed). Yup. No GC is perfect. They all have trade-offs. And, like you say, in particular when you run out of memory it starts to hurt. > Since ruby-1.9.2 seems to have much better GC performance, I am > interested how the performance compares in ruby-1.9.2. (I am also > interested in GC.disable condition, but this may not work with 15 > Gbytes though). The GC should really run on a separate thread (read core). Not sure Ruby 1.9 does that now. The JVM does, so JRuby probably does. When I implement an LRU cache it could also easily run on a separate thread, as returned data is immutable. I may do that, if I find something similar to Erlang actors, for Ruby. This may be it: http://on-ruby.blogspot.com/2008/01/ruby-concurrency-with-actors.html It is something to do later. Parallelized cache handling would really be nice for big data. And, if it looks like a standard Hash to the outside users, it will be easy to implement transparently throughout BioRuby. Anyway, let me add a cache first, and see what it means to performance. 
> Running your script with ruby 1.9 caused several errors, related to > case when : removal of colon at the end of when line and changing > colon to newline if the colon is not at the end of line was > sufficient to run with ruby 1.9.2. (diff at the end) Either one of > newline, semicolon, and "then" seems to work. I still have to migrate to 1.9. Thanks for trying! Next time please fix it on github so I can merge it in easier. I may migrate for using actors. > The other good reason is that the data is perhaps not read from the > disk many times but cached by the operating system and retained on > memory. So this is not as bad as it sounds. Having 15 Gbytes, > presumably 500 Mbytes file need not flushed. Yes. And that is why I started experimenting with NoCache. Seeks are cheap. Even without the OS buffers, disk reads are very very optimized these days (I have done some work on that last year, together with a student Konstantin Tretjakov). Most seeks in GFF3 are even within the standard hardware cache (8/16 Mb) boundary, and are therefore not a problem, even on small machines! With NoCache the file gets read twice, so the penalty should really be 2x max. Which is totally acceptable, if that means we can handle any size data on any machine. And then we can offer both InMemory and NoCache. We can handle any type of big data. Our users win. BioRuby wins. Next to do: I want an LRU cache to prevent *parsing* every record twice. Parsing is the single expensive thing in NoCache. One thing will be interesting: to see what LRU means in conjunction with GC. Pj. From email2ants at gmail.com Tue Aug 31 10:42:12 2010 From: email2ants at gmail.com (Anthony Underwood) Date: Tue, 31 Aug 2010 11:42:12 +0100 Subject: [BioRuby] We have 2 and 1/2 days of spare time. Can we help? In-Reply-To: References: <269DBF19-140B-43B4-A42B-9915350A2C9D@hgc.jp> Message-ID: Hi Please could I put in my plea for a +1 on (Next Generation sequence) NGS parsing. 
This field is becoming huge within bioinformatics and bioruby is lagging when it comes to tools to parse the data, specifically wrappers around the C functions found in samtools. I would have a go myself but have no experience in C so am sure others would do a better job. Thanks Anthony On 17 Aug 2010, at 12:39, George Githinji wrote: > Thank you very much Toshiaki. We really appreciated the call and the > much advice and helpful conversation that we held. > We are distilling on the various ideas and we will update you and the > list on what will be most appropriate and achievable for us. > > > On Mon, Aug 16, 2010 at 4:40 PM, Toshiaki Katayama wrote: >> Hi George, >> >> Oops, I just realized that I missed to read this thread. Sorry. ;) >> >> I'm very surprised and excited to know that you guys will organize >> a BioHackathon-like event in Kenya. >> >> Few hours ago, I finished a Skype meeting with the organizers >> and learnt about the plan described at >> >> http://rsg-ea-bio-sprint-2010.wikispaces.com/ >> (design of the poster is awesome, good job! :) >> >> Please use this mailing list to distill pre-hackathon preparations. >> >> We often asked "what can I contribute to the BioRuby project?" but >> it is usually difficult to assign a target and mentoring on it >> as the project itself has been self-organized. >> (The Google Summer of Code will be an exception. Mentors are working >> really hard and I really appreciate about that.) >> >> However, I take this opportunity to suggest several potential targets: >> (in addition to 1. finishing the newly introduced BioRuby plugin system >> and 2. supporting Semantic Web technologies on which we have been >> working on since the 3rd DBCLS BioHackathon http://hackathon3.dbcls.jp/ >> and Codefest 2010 http://www.open-bio.org/wiki/Codefest_2010) >> >> === interfaces to external resources: >> >> * API for Ensemble (suggested by Jan Aerts) >> * API for UCSC (also suggested by Jan) >> * API for BioMart, InterMine etc.
>> * API for Semantic Web resources (BioGateway, Bio2RDF etc.) -- this is what we tried during the last BioHackathon >> >> === modern bioinformatics: >> >> * handling NGS data - wrappers and parsers for tools and libraries >> * Proteomics >> * Immunoinformatics - immunology prediction servers (described by someone during the Skype meeting) >> >> === classical bioinformatics: >> >> * Do benchmark for existing BioRuby modules to find bottlenecks for improving performance (good example was posted by Martin Hansen http://lists.open-bio.org/pipermail/bioruby/2010-August/001426.html and I think this kind of improvements should be welcomed) >> >> * Setting up NCBI's BLAST WWW like interface (with SGE or Cloud backends) is still demanded. People who are working with not-yet-public genome often need to setup this kind of server. How about to create a general Rails plugin using BioRuby which can be easily setup and can perform various sequence similarity search (by BLAST, BLAT, EXONERATE, Bowtie, whatever...) with simple configuration (use DSL to setup target DBs and the computational farm). This project should also target on the downstream processes -- phylogenetic annotations, mapping RNA-Seq data, summarizing statistics, visualization, integration with genome browsers etc. -- and cool UI design, to be chosen as a yet another BLAST+alpha interface. >> >> === visualization modules: >> >> * BioGraphics (already started by Jan) - genome mapping / comparative genomics? >> >> * Interface for Cytoscape - so that we can easily generate beautiful graph visualization within BioRuby (e.g. from Semantic Web data) Note: some samples are already provided by Cytoscape group at http://cytoscape.wodaklab.org/wiki/ScriptingPlugins#Ruby >> >> === improving docs: >> >> * Writing some tutorials on how you used the BioRuby - this should be done by newbie who will have difficulty with finding solutions on the Internet. 
They can ask mentors how to solve their problems and summarize the result in HOWTO-like tutorials. Some blogs and Wiki pages are the only existing resources as far as I know (and also clearly pointed by Yannick Wurm http://lists.open-bio.org/pipermail/bioruby/2010-July/001373.html) >> >> * We should also need to have a document on "how to use GitHub for forking BioRuby", "how to create your own BioRuby plugins" etc. >> >> Regards >> Toshiaki Katayama >> >> >> On 2010/08/10, at 16:37, George Githinji wrote: >> >>> Hi all, >>> The Regional Students Group for Eastern Africa (RSG-EA) is one of the >>> grass-root level bodies of the International Society for Computational >>> Biology Student Council (ISCB-SC). The group has membership from ten >>> countries namely Burundi, Democratic Republic of Congo, Djibouti, >>> Eritrea, Ethiopia, Kenya, Rwanda, Somalia, Tanzania and Uganda. >>> Recently we proposed to organize a biohakathon three day event to: >>> >>> 1) Learn how to collaborate on bioinformatics programming projects >>> using open source tools. >>> 2) Forge an East African bioinformatics programming community. >>> 3) Contribute a module/code to Bioruby library. >>> >>> The event has been sponsored by a grant from ISCB and ILRI/Beca >>> bioinformatics platform in Nairobi, Kenya. >>> >>> We would like to seek for a suitable project work from one of the >>> developer(s) and the community. The project should ideally be of >>> beginner to intermediate level difficulty. A third of the participants >>> will be of intermediate level programming skills with experience from >>> Java,Python and Perl. while the rest will have beginner level skills. >>> >>> We were also wondering whether it would be possible to get one of the >>> lead contributors to bioruby project to give a short 15-20 minutes >>> introductory talk to the participants. We have excellent video >>> conferencing facilities at the ILRI/Beca hub. The event is slated to >>> take place in late September. 
>>> >>> Thank you >>> >>> -- >>> --------------- >>> Sincerely >>> George >>> KEMRI/Wellcome-Trust Research Program >>> Skype: george_g2 >>> Blog: http://biorelated.wordpress.com/ >>> _______________________________________________ >>> BioRuby Project - http://www.bioruby.org/ >>> BioRuby mailing list >>> BioRuby at lists.open-bio.org >>> http://lists.open-bio.org/mailman/listinfo/bioruby >> >> > > > > -- > --------------- > Sincerely > George > KEMRI/Wellcome-Trust Research Program > Skype: george_g2 > Blog: http://biorelated.wordpress.com/ > > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby From pjotr.public14 at thebird.nl Tue Aug 31 10:59:55 2010 From: pjotr.public14 at thebird.nl (Pjotr Prins) Date: Tue, 31 Aug 2010 12:59:55 +0200 Subject: [BioRuby] We have 2 and 1/2 days of spare time. Can we help? In-Reply-To: References: <269DBF19-140B-43B4-A42B-9915350A2C9D@hgc.jp> Message-ID: <20100831105955.GA24001@thebird.nl> I Anthony, I wrote a Ruby wrapper for SAMtools. See http://thebird.nl/biolib/Adding_BioLib_BAM_SAM_Support.html If you want to test and use it, we can move it forward. Pj. On Tue, Aug 31, 2010 at 11:42:12AM +0100, Anthony Underwood wrote: > Hi > > Please could I put in my plea for a +1 on (Next Generation sequence) NGS parsing. This field is becoming huge within bioinformatics and bioruby is lagging when it comes to tools to parse the date, specifically wrappers around the C functions found in samtools. I would have a go myself but have no experience in C so am sure others would do a better job. > > Thanks Anthony > On 17 Aug 2010, at 12:39, George Githinji wrote: > > > Thank you very much Toshiaki. We really appreciated the call and the > > much advice and helpful conversation that we held. 
> > We are distilling on the various ideas and we will update you and the > > list on what will be most appropriate and achievable for us. > > > > > > On Mon, Aug 16, 2010 at 4:40 PM, Toshiaki Katayama wrote: > >> Hi George, > >> > >> Oops, I just realized that I missed to read this thread. Sorry. ;) > >> > >> I'm very surprised and excited to know that you guys will organize > >> a BioHackathon-like event in Kenya. > >> > >> Few hours ago, I finished a Skype meeting with the organizers > >> and learnt about the plan described at > >> > >> http://rsg-ea-bio-sprint-2010.wikispaces.com/ > >> (design of the poster is awesome, good job! :) > >> > >> Please use this mailing list to distill pre-hackathon preparations. > >> > >> We often asked "what can I contribute to the BioRuby project?" but > >> it is usually difficult to assign a target and mentoring on it > >> as the project itself has been self-organized. > >> (The Google Summer of Code will be an exception. Mentors are working > >> really hard and I really appreciate about that.) > >> > >> However, I take this opportunity to suggest several potential targets: > >> (in addition to 1. finishing the newly introduced BioRuby plugin system > >> and 2. supporting Semantic Web technologies on which we have been > >> working on since the 3rd DBCLS BioHackathon http://hackathon3.dbcls.jp/ > >> and Codefest 2010 http://www.open-bio.org/wiki/Codefest_2010) > >> > >> === interfaces to external resources: > >> > >> * API for Ensemble (suggested by Jan Aerts) > >> * API for UCSC (also suggested by Jan) > >> * API for BioMart, InterMine etc. > >> * API for Semantic Web resources (BioGateway, Bio2RDF etc.) 
-- this is what we tried during the last BioHackathon > >> > >> === modern bioinformatics: > >> > >> * handling NGS data - wrappers and parsers for tools and libraries > >> * Proteomics > >> * Immunoinformatics - immunology prediction servers (described by someone during the Skype meeting) > >> > >> === classical bioinformatics: > >> > >> * Do benchmark for existing BioRuby modules to find bottlenecks for improving performance (good example was posted by Martin Hansen http://lists.open-bio.org/pipermail/bioruby/2010-August/001426.html and I think this kind of improvements should be welcomed) > >> > >> * Setting up NCBI's BLAST WWW like interface (with SGE or Cloud backends) is still demanded. People who are working with not-yet-public genome often need to setup this kind of server. How about to create a general Rails plugin using BioRuby which can be easily setup and can perform various sequence similarity search (by BLAST, BLAT, EXONERATE, Bowtie, whatever...) with simple configuration (use DSL to setup target DBs and the computational farm). This project should also target on the downstream processes -- phylogenetic annotations, mapping RNA-Seq data, summarizing statistics, visualization, integration with genome browsers etc. -- and cool UI design, to be chosen as a yet another BLAST+alpha interface. > >> > >> === visualization modules: > >> > >> * BioGraphics (already started by Jan) - genome mapping / comparative genomics? > >> > >> * Interface for Cytoscape - so that we can easily generate beautiful graph visualization within BioRuby (e.g. from Semantic Web data) Note: some samples are already provided by Cytoscape group at http://cytoscape.wodaklab.org/wiki/ScriptingPlugins#Ruby > >> > >> === improving docs: > >> > >> * Writing some tutorials on how you used the BioRuby - this should be done by newbie who will have difficulty with finding solutions on the Internet. 
They can ask mentors how to solve their problems and summarize the result in HOWTO-like tutorials. Some blogs and Wiki pages are the only existing resources as far as I know (and also clearly pointed by Yannick Wurm http://lists.open-bio.org/pipermail/bioruby/2010-July/001373.html) > >> > >> * We should also need to have a document on "how to use GitHub for forking BioRuby", "how to create your own BioRuby plugins" etc. > >> > >> Regards > >> Toshiaki Katayama > >> > >> > >> On 2010/08/10, at 16:37, George Githinji wrote: > >> > >>> Hi all, > >>> The Regional Students Group for Eastern Africa (RSG-EA) is one of the > >>> grass-root level bodies of the International Society for Computational > >>> Biology Student Council (ISCB-SC). The group has membership from ten > >>> countries namely Burundi, Democratic Republic of Congo, Djibouti, > >>> Eritrea, Ethiopia, Kenya, Rwanda, Somalia, Tanzania and Uganda. > >>> Recently we proposed to organize a biohakathon three day event to: > >>> > >>> 1) Learn how to collaborate on bioinformatics programming projects > >>> using open source tools. > >>> 2) Forge an East African bioinformatics programming community. > >>> 3) Contribute a module/code to Bioruby library. > >>> > >>> The event has been sponsored by a grant from ISCB and ILRI/Beca > >>> bioinformatics platform in Nairobi, Kenya. > >>> > >>> We would like to seek for a suitable project work from one of the > >>> developer(s) and the community. The project should ideally be of > >>> beginner to intermediate level difficulty. A third of the participants > >>> will be of intermediate level programming skills with experience from > >>> Java,Python and Perl. while the rest will have beginner level skills. > >>> > >>> We were also wondering whether it would be possible to get one of the > >>> lead contributors to bioruby project to give a short 15-20 minutes > >>> introductory talk to the participants. 
We have excellent video > >>> conferencing facilities at the ILRI/Beca hub. The event is slated to > >>> take place in late September. > >>> > >>> Thank you > >>> > >>> -- > >>> --------------- > >>> Sincerely > >>> George > >>> KEMRI/Wellcome-Trust Research Program > >>> Skype: george_g2 > >>> Blog: http://biorelated.wordpress.com/ > >>> _______________________________________________ > >>> BioRuby Project - http://www.bioruby.org/ > >>> BioRuby mailing list > >>> BioRuby at lists.open-bio.org > >>> http://lists.open-bio.org/mailman/listinfo/bioruby > >> > >> > > > > > > > > -- > > --------------- > > Sincerely > > George > > KEMRI/Wellcome-Trust Research Program > > Skype: george_g2 > > Blog: http://biorelated.wordpress.com/ > > > > _______________________________________________ > > BioRuby Project - http://www.bioruby.org/ > > BioRuby mailing list > > BioRuby at lists.open-bio.org > > http://lists.open-bio.org/mailman/listinfo/bioruby > > > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby From biopython at maubp.freeserve.co.uk Tue Aug 31 11:12:19 2010 From: biopython at maubp.freeserve.co.uk (Peter) Date: Tue, 31 Aug 2010 12:12:19 +0100 Subject: [BioRuby] We have 2 and 1/2 days of spare time. Can we help? In-Reply-To: References: <269DBF19-140B-43B4-A42B-9915350A2C9D@hgc.jp> Message-ID: On Tue, Aug 31, 2010 at 11:42 AM, Anthony Underwood wrote: > > Hi > > Please could I put in my plea for a +1 on (Next Generation sequence) NGS parsing. > This field is becoming huge within bioinformatics and bioruby is lagging when it comes > to tools to parse the date, specifically wrappers around the C functions found in > samtools. I would have a go myself but have no experience in C so am sure others > would do a better job.
> > Thanks Anthony Anthony - Have you looked at Pjotr's recent work in BioLib to wrap the samtools C API in Ruby (and other languages)? http://lists.open-bio.org/pipermail/biolib-dev/2010-August/000160.html Peter From ngoto at gen-info.osaka-u.ac.jp Tue Aug 31 12:00:47 2010 From: ngoto at gen-info.osaka-u.ac.jp (Naohisa GOTO) Date: Tue, 31 Aug 2010 21:00:47 +0900 Subject: [BioRuby] We have 2 and 1/2 days of spare time. Can we help? In-Reply-To: <20100831105955.GA24001@thebird.nl> References: <269DBF19-140B-43B4-A42B-9915350A2C9D@hgc.jp> <20100831105955.GA24001@thebird.nl> Message-ID: <20100831120048.1FF4C1CBC57E@idnmail.gen-info.osaka-u.ac.jp> Hi, I found samtools-ruby at the ISMB 2010 poster session. http://github.com/homonecloco/samtools-ruby The abstract of the presentation is available: http://www.iscb.org/cms_addon/conferences/ismb2010/posterlist.php?cat=J (Poster J52). They are developing GeeFu, a Rails-based web application for high-throughput genome sequencing, which is using BioRuby. http://github.com/danmaclean/gee_fu Naohisa Goto ngoto at gen-info.osaka-u.ac.jp / ng at bioruby.org On Tue, 31 Aug 2010 12:59:55 +0200 Pjotr Prins wrote: > I Anthony, > > I wrote a Ruby wrapper for SAMtools. See > > http://thebird.nl/biolib/Adding_BioLib_BAM_SAM_Support.html > > If you want to test and use it, we can move it forward. > > Pj. > > On Tue, Aug 31, 2010 at 11:42:12AM +0100, Anthony Underwood wrote: > > Hi > > > > Please could I put in my plea for a +1 on (Next Generation sequence) NGS parsing. This field is becoming huge within bioinformatics and bioruby is lagging when it comes to tools to parse the date, specifically wrappers around the C functions found in samtools. I would have a go myself but have no experience in C so am sure others would do a better job. > > > > Thanks Anthony > > On 17 Aug 2010, at 12:39, George Githinji wrote: > > > > > Thank you very much Toshiaki.
We really appreciated the call and the > > > much advice and helpful conversation that we held. > > > We are distilling on the various ideas and we will update you and the > > > list on what will be most appropriate and achievable for us. > > > > > > > > > On Mon, Aug 16, 2010 at 4:40 PM, Toshiaki Katayama wrote: > > >> Hi George, > > >> > > >> Oops, I just realized that I missed to read this thread. Sorry. ;) > > >> > > >> I'm very surprised and excited to know that you guys will organize > > >> a BioHackathon-like event in Kenya. > > >> > > >> Few hours ago, I finished a Skype meeting with the organizers > > >> and learnt about the plan described at > > >> > > >> http://rsg-ea-bio-sprint-2010.wikispaces.com/ > > >> (design of the poster is awesome, good job! :) > > >> > > >> Please use this mailing list to distill pre-hackathon preparations. > > >> > > >> We often asked "what can I contribute to the BioRuby project?" but > > >> it is usually difficult to assign a target and mentoring on it > > >> as the project itself has been self-organized. > > >> (The Google Summer of Code will be an exception. Mentors are working > > >> really hard and I really appreciate about that.) > > >> > > >> However, I take this opportunity to suggest several potential targets: > > >> (in addition to 1. finishing the newly introduced BioRuby plugin system > > >> and 2. supporting Semantic Web technologies on which we have been > > >> working on since the 3rd DBCLS BioHackathon http://hackathon3.dbcls.jp/ > > >> and Codefest 2010 http://www.open-bio.org/wiki/Codefest_2010) > > >> > > >> === interfaces to external resources: > > >> > > >> * API for Ensemble (suggested by Jan Aerts) > > >> * API for UCSC (also suggested by Jan) > > >> * API for BioMart, InterMine etc. > > >> * API for Semantic Web resources (BioGateway, Bio2RDF etc.) 
-- this is what we tried during the last BioHackathon > > >> > > >> === modern bioinformatics: > > >> > > >> * handling NGS data - wrappers and parsers for tools and libraries > > >> * Proteomics > > >> * Immunoinformatics - immunology prediction servers (described by someone during the Skype meeting) > > >> > > >> === classical bioinformatics: > > >> > > >> * Do benchmark for existing BioRuby modules to find bottlenecks for improving performance (good example was posted by Martin Hansen http://lists.open-bio.org/pipermail/bioruby/2010-August/001426.html and I think this kind of improvements should be welcomed) > > >> > > >> * Setting up NCBI's BLAST WWW like interface (with SGE or Cloud backends) is still demanded. People who are working with not-yet-public genome often need to setup this kind of server. How about to create a general Rails plugin using BioRuby which can be easily setup and can perform various sequence similarity search (by BLAST, BLAT, EXONERATE, Bowtie, whatever...) with simple configuration (use DSL to setup target DBs and the computational farm). This project should also target on the downstream processes -- phylogenetic annotations, mapping RNA-Seq data, summarizing statistics, visualization, integration with genome browsers etc. -- and cool UI design, to be chosen as a yet another BLAST+alpha interface. > > >> > > >> === visualization modules: > > >> > > >> * BioGraphics (already started by Jan) - genome mapping / comparative genomics? > > >> > > >> * Interface for Cytoscape - so that we can easily generate beautiful graph visualization within BioRuby (e.g. from Semantic Web data) Note: some samples are already provided by Cytoscape group at http://cytoscape.wodaklab.org/wiki/ScriptingPlugins#Ruby > > >> > > >> === improving docs: > > >> > > >> * Writing some tutorials on how you used the BioRuby - this should be done by newbie who will have difficulty with finding solutions on the Internet. 
They can ask mentors how to solve their problems and summarize the result in HOWTO-like tutorials. Some blogs and Wiki pages are the only existing resources as far as I know (and also clearly pointed by Yannick Wurm http://lists.open-bio.org/pipermail/bioruby/2010-July/001373.html) > > >> > > >> * We should also need to have a document on "how to use GitHub for forking BioRuby", "how to create your own BioRuby plugins" etc. > > >> > > >> Regards > > >> Toshiaki Katayama > > >> > > >> > > >> On 2010/08/10, at 16:37, George Githinji wrote: > > >> > > >>> Hi all, > > >>> The Regional Students Group for Eastern Africa (RSG-EA) is one of the > > >>> grass-root level bodies of the International Society for Computational > > >>> Biology Student Council (ISCB-SC). The group has membership from ten > > >>> countries namely Burundi, Democratic Republic of Congo, Djibouti, > > >>> Eritrea, Ethiopia, Kenya, Rwanda, Somalia, Tanzania and Uganda. > > >>> Recently we proposed to organize a biohakathon three day event to: > > >>> > > >>> 1) Learn how to collaborate on bioinformatics programming projects > > >>> using open source tools. > > >>> 2) Forge an East African bioinformatics programming community. > > >>> 3) Contribute a module/code to Bioruby library. > > >>> > > >>> The event has been sponsored by a grant from ISCB and ILRI/Beca > > >>> bioinformatics platform in Nairobi, Kenya. > > >>> > > >>> We would like to seek for a suitable project work from one of the > > >>> developer(s) and the community. The project should ideally be of > > >>> beginner to intermediate level difficulty. A third of the participants > > >>> will be of intermediate level programming skills with experience from > > >>> Java,Python and Perl. while the rest will have beginner level skills. > > >>> > > >>> We were also wondering whether it would be possible to get one of the > > >>> lead contributors to bioruby project to give a short 15-20 minutes > > >>> introductory talk to the participants. 
We have excellent video > > >>> conferencing facilities at the ILRI/Beca hub. The event is slated to > > >>> take place in late September. > > >>> > > >>> Thank you > > >>> > > >>> -- > > >>> --------------- > > >>> Sincerely > > >>> George > > >>> KEMRI/Wellcome-Trust Research Program > > >>> Skype: george_g2 > > >>> Blog: http://biorelated.wordpress.com/ > > >>> _______________________________________________ > > >>> BioRuby Project - http://www.bioruby.org/ > > >>> BioRuby mailing list > > >>> BioRuby at lists.open-bio.org > > >>> http://lists.open-bio.org/mailman/listinfo/bioruby > > >> > > >> > > > > > > > > > > > > -- > > > --------------- > > > Sincerely > > > George > > > KEMRI/Wellcome-Trust Research Program > > > Skype: george_g2 > > > Blog: http://biorelated.wordpress.com/ > > > > > > _______________________________________________ > > > BioRuby Project - http://www.bioruby.org/ > > > BioRuby mailing list > > > BioRuby at lists.open-bio.org > > > http://lists.open-bio.org/mailman/listinfo/bioruby > > > > > > _______________________________________________ > > BioRuby Project - http://www.bioruby.org/ > > BioRuby mailing list > > BioRuby at lists.open-bio.org > > http://lists.open-bio.org/mailman/listinfo/bioruby > _______________________________________________ > BioRuby Project - http://www.bioruby.org/ > BioRuby mailing list > BioRuby at lists.open-bio.org > http://lists.open-bio.org/mailman/listinfo/bioruby