[Biopython-dev] "Online" tests, was [Bug 1972]
Bill Barnard
bill at barnard-engineering.com
Mon May 15 16:22:30 UTC 2006
On Fri, 2006-03-31 at 21:41 +0100, Peter (BioPython Dev) wrote:
> Bill Barnard wrote:
> > I've made a first cut unit test, tentatively named
> > test_Parsers_for_newest_formats, ...
>
> Sounds good to me. But not a very snappy name - how about something
> shorter like test_OnlineFormats.py instead?
Works for me.
> I think the Blast test should actually submit a short protein/nucleotide
> sequence known to be in the online database. Maybe do some basic sanity
> testing like check it returns at least N results and the best hit is at
> least a certain score.
I have one such test working.
>
> >>In some cases (e.g. GenBank, Fasta) once the sample file is downloaded
> >>there are multiple parsers to be checked (e.g. record and feature parsers).
I have tests using EUtils that check the record and feature parsers for
GenBank and Fasta files.
> I'll volunteer to add cases for GenBank, Fasta and GEO files.
I've not yet looked at GEO files, so there's that yet to do. I expect to
have some free time soon so I may look at the GEO and other parsers at
that time. I'd certainly be interested in any feedback you have on these
tests, or to see additional test cases. Finding the parsers to test and
implementing the tests seems pretty straightforward.
I've attached the test code to this email, and added a .txt file
extension just in case the listserv won't allow attaching a code file.
Bill
p.s. I'm still looking for a more interesting project I can contribute
to Biopython. Doing some of the tests and such is useful to me in
learning my way around the codebase; I hope to find something requiring
a bit more creativity. If anyone has any suggestions about areas that
need some attention I'll happily consider them. I am considering writing
some code for HMMs that would build on some prototype code I wrote.
-------------- next part --------------
"""Test to make sure all parsers retrieve and read current
data formats.
"""
import requires_internet
import sys
import unittest
from Bio import ParserSupport
# ExpasyTest
from Bio import Prosite
from Bio.Prosite import Prodoc
from Bio.SwissProt import SProt
# PubMedTest
from Bio import PubMed, Medline
# EUtilsTest
from Bio import EUtils
from Bio.EUtils import DBIdsClient
from Bio import GenBank
from Bio import Fasta
# BlastTest
try:
import cStringIO as StringIO
except ImportError:
import StringIO
from Bio.Blast import NCBIWWW, NCBIXML
def run_tests(argv):
test_suite = testing_suite()
runner = unittest.TextTestRunner(sys.stdout, verbosity = 2)
runner.run(test_suite)
def testing_suite():
"""Generate the suite of tests.
"""
test_suite = unittest.TestSuite()
test_loader = unittest.TestLoader()
test_loader.testMethodPrefix = 't_'
tests = [ExpasyTest, PubMedTest, EUtilsTest, BlastTest]
for test in tests:
cur_suite = test_loader.loadTestsFromTestCase(test)
test_suite.addTest(cur_suite)
return test_suite
class ExpasyTest(unittest.TestCase):
"""Test that parsers can read the current Expasy database formats
"""
def setUp(self):
pass
def t_parse_prosite_record(self):
"""Retrieve a Prosite record and parse it
"""
prosite_dict = Prosite.ExPASyDictionary(parser=Prosite.RecordParser())
accession = 'PS00159'
entry = prosite_dict[accession]
self.assertEqual(entry.accession, accession)
def t_parse_prodoc_record(self):
"""Retrieve a Prodoc record and parse it
"""
prodoc_dict = Prodoc.ExPASyDictionary(parser=Prodoc.RecordParser())
accession = 'PDOC00933'
entry = prodoc_dict[accession]
self.assertEqual(entry.accession, accession)
def t_parse_sprot_record(self):
"""Retrieve a SwissProt record and parse it into Record format
"""
sprot_record_dict = SProt.ExPASyDictionary(parser=SProt.RecordParser())
accession = 'Q5TYW8'
entry = sprot_record_dict[accession]
self.failUnless(accession in entry.accessions)
def t_parse_sprot_seq(self):
"""Retrieve a SwissProt record and parse it into Sequence format
"""
sprot_seq_dict = SProt.ExPASyDictionary(parser=SProt.SequenceParser())
accession = 'Q5TYW8'
entry = sprot_seq_dict[accession]
self.assertEqual(entry.id, accession)
class PubMedTest(unittest.TestCase):
"""Test that Medline parsers can read the current PubMed format
"""
def setUp(self):
pass
def t_parse_pubmed_record(self):
"""Retrieve a PubMed record and parse it
"""
pubmed_dict = PubMed.Dictionary(parser=Medline.RecordParser())
pubmed_id = '3136164'
entry = pubmed_dict[pubmed_id]
self.assertEqual(entry.pubmed_id, pubmed_id)
class EUtilsTest(unittest.TestCase):
"""Test that GenBank, Fasta parsers can read EUtils retrieved db formats
"""
# Database primary IDs
# http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi?
# lists all E-Utility database names. Primary IDs, which are
# always an integer, are listed below for a few databases:
# Entrez Database Primary ID E-Utility Database Name
# 3D Domains 3D SDI domains
# Domains PSSM-ID cdd
# Genome Genome ID genome
# Nucleotide GI number nucleotide
# OMIM MIM number omim
# PopSet Popset ID popset
# Protein GI number protein
# ProbeSet GEO ID geo
# PubMed PMID pubmed
# Structure MMDB ID structure
# SNP SNP ID snp
# Taxonomy TAXID taxonomy
# UniGene UniGene ID unigene
# UniSTS UniSTS ID unists
# see Bio.EUtils.Config.py for supported EUtils databases
def setUp(self):
self.client = DBIdsClient.DBIdsClient()
def t_parse_nucleotide_gb(self):
"""Use EUtils to retrieve and parse an NCBI nucleotide GenBank record
"""
db = "nucleotide"
gi = "57165207"
result = self.client.search(gi, db, retmax=1)
sfp = result[0].efetch(retmode="text", rettype="gb", \
seq_start=5458145, seq_stop=5458942)
text = sfp.readlines()
sfp.close()
self.assertEqual(result.dbids.db, db) # sanity check, not a parser test
locus_line = text[0]
# now parse the text with GenBank parsers
parser_input = StringIO.StringIO(''.join(text))
# RecordParser
parser = GenBank.RecordParser()
record = parser.parse(parser_input)
parser_input.reset()
self.assertEqual(record.gi, gi)
# split locus line to eliminate whitespace differences
self.assertEqual(record._locus_line().split(), locus_line.split())
# FeatureParser
parser = GenBank.FeatureParser()
record = parser.parse(parser_input)
parser_input.reset()
self.assertEqual(record.annotations['gi'], gi)
def t_parse_protein_fasta(self):
"""Use EUtils to retrieve and parse an NCBI protein Fasta record
"""
db = "protein"
gi = "21220767"
result = self.client.search(gi, db, retmax=1)
sfp = result[0].efetch(retmode="text", rettype="fasta")
text = sfp.readlines()
sfp.close()
self.assertEqual(result.dbids.db, db) # sanity check, not a parser test
# now parse the text with Fasta parsers
parser_input = StringIO.StringIO(''.join(text))
# RecordParser
parser = Fasta.RecordParser()
record = parser.parse(parser_input)
parser_input.reset()
self.assert_(gi in record.title.split('|'))
self.assertEqual(record.title, text[0][1:-1]) # strip > and \n from text[0]
# SequenceParser
parser = Fasta.SequenceParser()
record = parser.parse(parser_input)
parser_input.reset()
self.assert_(gi in record.description.split('|'))
self.assertEqual(record.description, text[0][1:-1]) # strip > and \n from text[0]
class BlastTest(unittest.TestCase):
"""Test that the Blast XML parser can read the current NCBI formats
"""
def setUp(self):
pass
def t_parse_blast_xml(self):
"""Use NCBIWWW to retrieve Blast results and use NCBIXML to parse it
"""
query = '''>sp|P09405|NUCL_MOUSE Nucleolin (Protein C23) - Mus musculus (Mouse).
VKLAKAGKTHGEAKKMAPPPKEVEEDSEDEEMSEDEDDSSGEEEVVIPQKKGKKATTTPA
KKVVVSQTKKAAVPTPAKKAAVTPGKKAVATPAKKNITPAKVIPTPGKKGAAQAKALVPT'''
result_handle = NCBIWWW.qblast('blastp', 'swissprot', \
query, expect=0.0001, format_type="XML")
blast_results = result_handle.read()
result_handle.close()
blast_out = StringIO.StringIO(blast_results)
parser = NCBIXML.BlastParser()
b_record = parser.parse(blast_out)
### write output file for testing purposes ~35 kB
## import os
## save_fname = os.path.expanduser('~/tmp/blast_out.xml')
## save_file = open(save_fname, 'w')
## save_file.write(blast_results)
## save_file.close()
###
# When I ran this on 4 May 2006 I got max_score of 601 & 21 hits
expected_score_cutoff = 600
expected_min_hits = 20
max_score = 0.0
for alignment in b_record.alignments:
for hsp in alignment.hsps:
if hsp.score > max_score: max_score = hsp.score
msg = 'max score (%g) < expected_score_cutoff(%g)' % \
(max_score, expected_score_cutoff)
self.assert_(max_score >= expected_score_cutoff, msg)
msg = 'N(%d) < expected_min_hits(%d)' % \
(len(b_record.alignments), expected_min_hits)
self.assert_(len(b_record.alignments) >= expected_min_hits, msg)
if __name__ == "__main__":
sys.exit(run_tests(sys.argv))
More information about the Biopython-dev
mailing list