[Biopython-dev] GenBank parser -- first go
Andrew Dalke
dalke at acm.org
Fri Dec 8 04:04:36 EST 2000
Jeff:
>> I don't believe there's any general
>> data structure in existance that can handle the genbank location
>> field. It's describe by a BNF grammar and requires a tree!
Me:
>Speaking as a parsing problem, this cannot be done with regular
>expression. When something like that occurs, it should be fine
>to leave it as an opaque block of text, which is parsed elsewhere.
>
>John Aycock wrote a really nice context-free parser in pure
>Python called SPARK. http://www.csr.uvic.ca/~aycock/python/
>Easier to use. (Which means it is *much* easier to use than
>lax/yacc.)
And here's a first run at a SPARK-based parser for the location
part of the feature table. BTW, the documentation at
http://www.ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html
contains several errors that I could tell
***
If a location is between 102 and 110 inclusive, do you use
"(102.110)" as the example has, or "102.110" as given in the
BNF?
base_position ::= <integer> | <low_base_bound> | <high_base_bound> |
<two_base_bound>
two_base_bound ::= <base_position>.<base_position>
***
Example 5.4 Plasmid has
CDS join(complement(567..795)complement(21..349))
which ignores the comma
CDS join(complement(567..795),complement(21..349))
^^^
***
There is an example showing "J00194:(100..202)" which also
does not agree with the BNF description. From looking at
some real data, it seems the documentation should say
"J00194:100..202".
The BNF says
symbol ::= <letter> | <symbol><symbol_character> |
<symbol_character><symbol>
where
symbol_character ::= <up_case_letter> | <low_case_letter> |
<digit> | _ | - | ' | *
letter ::= <up_case_letter> | <low_case_letter>
This means 'AA' can be parsed as
<symbol><symbol_character>
| |
<letter> <up_case_letter>
| |
"A" "A"
or
<symbol_character><symbol>
| |
<up_case_letter> <letter>
| |
"A" "A"
so it's an ambiguous definition.
***
Additionally, symbol_character needs to allow '.' to agree
with real-life data (see the regression tests for the text).
Instead, I just redefined
symbol ::= Re("[A-Za-z0-9_'*-][A-Za-z0-9_'*.]*")
(note the "." in the second []).
Anyway, the grammer is attached for anyone wishing to take it
farther. Enjoy!
Andrew
-------------- next part --------------
# First pass at a parser for the location fields of a feature table.
# Everything likely to change.
# Based on the DDBJ/EMBL/GenBank Feature Table Definition Version 2.2
# Dec 15 1999 available from EBI, but the documentation is not
# completely internally consistent much less agree with real-life
# examples. Conflicts resolved to agree with real examples.
# Uses John Aycock's SPARK for parsing
from spark import GenericScanner, GenericParser
# a list of strings to test
test_data = (
"467",
"23..400",
"join(544..589,688..1032)",
"1..1000",
"<345..500",
"<1..888",
"(102.110)",
"(23.45)..600",
"(122.133)..(204.221)",
"123^124",
"145^177",
"join(12..78,134..202)",
"complement(join(2691..4571,4918..5163))",
"join(complement(4918..5163),complement(2691..4571))",
"complement(34..(122.126))",
# The doc example allows "J00194:(100..202)" but not the BNF
"J00194:100..202",
"1..1509",
"<1..9",
"join(10..567,789..1320)",
"join(54..567,789..1254)",
"10..567",
"join(complement(<1..799),complement(5080..5120))",
"complement(1697..2512)",
"complement(4170..4829)",
# added a comma from the documentation
"join(complement(567..795),complement(21..349))",
"join(2004..2195,3..20)",
"<1..>336",
"394..>402",
# a few examples from from hum1
"join(AB001090.1:1669..1713)",
"join(AB001090.1:1669..1713,AB001091.1:85..196)",
"join(AB001090.1:1669..1713,AB001091.1:85..196,AB001092.1:40..248,AB001093.1:96..212,AB001094.1:71..223,AB001095.1:87..231,AB001096.1:33..211,AB001097.1:35..175,AB001098.1:213..395,AB001099.1:56..309,AB001100.1:54..196,AB001101.1:171..404,AB001102.1:160..378,210..217)",
"join(9106..9239,9843..9993,11889..11960,16575..16650)",
"join(<1..109,620..>674)",
"join(AB003599.1:<61..315,AB003599.1:587..874,47..325,425..>556)",
"join(<85..194,296..458,547..>653)",
)
class Token:
def __init__(self, type):
self.type = type
def __cmp__(self, other):
return cmp(self.type, other)
def __repr__(self):
return "Tokens(%r)" % (self.type,)
# "38"
class Integer:
type = "integer"
def __init__(self, val):
self.val = val
def __cmp__(self, other):
return cmp(self.type, other)
def __str__(self):
return str(self.val)
def __repr__(self):
return "Integer(%s)" % self.val
# From the BNF definition, this isn't needed. Does tht mean
# that bases can be refered to with negative numbers?
class UnsignedInteger(Integer):
type = "unsigned_integer"
def __repr__(self):
return "UnsignedInteger(%s)" % self.val
class Symbol:
type = "symbol"
def __init__(self, name):
self.name = name
def __cmp__(self, other):
return cmp(self.type, other)
def __str__(self):
return str(self.name)
def __repr__(self):
return "Symbol(%s)" % repr(self.name)
# ">38" -- The BNF says ">" is for the lower bound.. seems wrong to me
class LowBound:
def __init__(self, base):
self.base = base
def __repr__(self):
return "LowBound(%r)" % self.base
# "<38"
class HighBound:
def __init__(self, base):
self.base = base
def __repr__(self):
return "HighBound(%r)" % self.base
# 12.34
class TwoBound:
def __init__(self, low, high):
self.low = low
self.high = high
def __repr__(self):
return "TwoBound(%r, %r)" % (self.low, self.high)
# 12^34
class Between:
def __init__(self, low, high):
self.low = low
self.high = high
def __repr__(self):
return "Between(%r, %r)" % (self.low, self.high)
# 12..34
class Range:
def __init__(self, low, high):
self.low = low
self.high = high
def __repr__(self):
return "Range(%r, %r)" % (self.low, self.high)
class Function:
def __init__(self, name, args):
self.name = name
self.args = args
def __repr__(self):
return "Function(%r, %r)" % (self.name, self.args)
class AbsoluteLocation:
def __init__(self, path, local_location):
self.path = path
self.local_location = local_location
def __repr__(self):
return "AbsoluteLocation(%r, %r)" % (self.path, self.local_location)
class Path:
def __init__(self, database, accession):
self.database = database
self.accession = accession
def __repr__(self):
return "Path(%r, %r)" % (self.database, self.accession)
class FeatureName:
def __init__(self, path, label):
self.path = path
self.label = label
def __repr__(self):
return "FeatureName(%r, %r)" % (self.path, self.label)
class LocationScanner(GenericScanner):
def __init__(self):
GenericScanner.__init__(self)
def tokenize(self, input):
self.rv = []
GenericScanner.tokenize(self, input)
return self.rv
def t_double_colon(self, input):
r" :: "
self.rv.append(Token("double_colon"))
def t_double_dot(self, input):
r" \.\. "
self.rv.append(Token("double_dot"))
def t_dot(self, input):
r" \.(?!\.) "
self.rv.append(Token("dot"))
def t_caret(self, input):
r" \^ "
self.rv.append(Token("caret"))
def t_comma(self, input):
r" \, "
self.rv.append(Token("comma"))
def t_integer(self, input):
r" -?[0-9]+ "
self.rv.append(Integer(int(input)))
def t_unsigned_integer(self, input):
r" [0-9]+ "
self.rv.append(UnsignedInteger(int(input)))
def t_colon(self, input):
r" :(?!:) "
self.rv.append(Token("colon"))
def t_open_paren(self, input):
r" \( "
self.rv.append(Token("open_paren"))
def t_close_paren(self, input):
r" \) "
self.rv.append(Token("close_paren"))
def t_symbol(self, input):
r" [A-Za-z0-9_'*-][A-Za-z0-9_'*.-]* "
# Needed an extra '.'
self.rv.append(Symbol(input))
def t_less_than(self, input):
r" < "
self.rv.append(Token("less_than"))
def t_greater_than(self, input):
r" > "
self.rv.append(Token("greater_than"))
# punctuation .. hmm, isn't needed for location
# r''' [ !#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~] '''
class LocationParser(GenericParser):
def __init__(self, start='location'):
GenericParser.__init__(self, start)
self.begin_pos = 0
def p_location(self, args):
"""
location ::= absolute_location
location ::= feature_name
location ::= function
"""
return args[0]
def p_function(self, args):
"""
function ::= functional_operator open_paren location_list close_paren
"""
return Function(args[0].name, args[2])
def p_absolute_location(self, args):
"""
absolute_location ::= local_location
absolute_location ::= path colon local_location
"""
if len(args) == 1:
return AbsoluteLocation(None, args[-1])
return AbsoluteLocation(args[0], args[-1])
def p_path(self, args):
"""
path ::= database double_colon primary_accession
path ::= primary_accession
"""
if len(args) == 3:
return Path(args[0], args[2])
return Path(None, args[0])
def p_feature_name(self, args):
"""
feature_name ::= path colon feature_label
feature_name ::= feature_label
"""
if len(args) == 3:
return FeatureName(args[0], args[2])
return FeatureName(None, args[0])
def p_feature_label(self, args):
"""
label ::= symbol
"""
return args[0].name
def p_local_location(self, args):
"""
local_location ::= base_position
local_location ::= between_position
local_location ::= base_range
"""
return args[0]
def p_location_list(self, args):
"""
location_list ::= location
location_list ::= location_list comma location
"""
if len(args) == 1:
return args
return args[0] + [args[2]]
def p_functional_operator(self, args):
"""
functional_operator ::= symbol
"""
return args[0]
def p_base_position(self, args):
"""
base_position ::= integer
base_position ::= low_base_bound
base_position ::= high_base_bound
base_position ::= two_base_bound
"""
return args[0]
def p_low_base_bound(self, args):
"""
low_base_bound ::= greater_than integer
"""
return LowBound(args[1])
def p_high_base_bound(self, args):
"""
high_base_bound ::= less_than integer
"""
return HighBound(args[1])
def p_two_base_bound(self, args):
"""
two_base_bound ::= open_paren base_position dot base_position close_paren
"""
# main example doesn't have parens but others do.. (?)
return TwoBound(args[1], args[3])
def p_between_position(self, args):
"""
between_position ::= base_position caret base_position
"""
return Between(args[0], args[2])
def p_base_range(self, args):
"""
base_range ::= base_position double_dot base_position
"""
return Range(args[0], args[2])
def p_database(self, args):
"""
database ::= symbol
"""
return args[0].name
def p_primary_accession(self, args):
"""
primary_accession ::= symbol
"""
return args[0].name
def scan(input):
scanner = LocationScanner()
return scanner.tokenize(input)
def parse(tokens):
#print "I have", tokens
parser = LocationParser()
return parser.parse(tokens)
if __name__ == "__main__":
for s in test_data:
print "--> Trying", s
print repr(parse(scan(s)))
More information about the Biopython-dev
mailing list