[Biopython-dev] GenBank parser -- first go

Fri Dec 8 04:04:36 EST 2000

Jeff:
>> I don't believe there's any general
>> data structure in existence that can handle the genbank location
>> field.  It's described by a BNF grammar and requires a tree!

Me:
>Speaking as a parsing problem, this cannot be done with a regular
>expression.  When something like that occurs, it should be fine
>to leave it as an opaque block of text, which is parsed elsewhere.
>
>John Aycock wrote a really nice context-free parser in pure
>Python called SPARK.  http://www.csr.uvic.ca/~aycock/python/
>It is easy to use.  (Which means it is *much* easier to use than
>lex/yacc.)

And here's a first run at a SPARK-based parser for the location
part of the feature table.  BTW, the documentation at
http://www.ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html
contains several errors, as far as I can tell:

  ***

If a location is between 102 and 110 inclusive, do you use
"(102.110)"  as the example has, or "102.110" as given in the
BNF?

base_position ::= <integer> | <low_base_bound> | <high_base_bound> |
 <two_base_bound> 

two_base_bound ::= <base_position>.<base_position>

  ***

Example 5.4 Plasmid has
CDS             join(complement(567..795)complement(21..349))

which ignores the comma

CDS             join(complement(567..795),complement(21..349))
                                        ^^^

  ***

There is an example showing "J00194:(100..202)" which also
does not agree with the BNF description.  From looking at
some real data, it seems the documentation should say
"J00194:100..202".

The BNF says
  symbol  ::= <letter> | <symbol><symbol_character> | 
              <symbol_character><symbol>

where
  symbol_character ::= <up_case_letter> | <low_case_letter> |
              <digit> | _ | - | ' | *

  letter ::= <up_case_letter> | <low_case_letter> 

This means 'AA' can be parsed as

     <symbol><symbol_character>
       |            |
    <letter>    <up_case_letter>
       |            |
      "A"          "A"
or
     <symbol_character><symbol>
       |                 |
 <up_case_letter>     <letter>
       |                 |
      "A"               "A"

so it's an ambiguous definition.

  ***

Additionally, symbol_character needs to allow '.' to agree
with real-life data (see the regression test strings in the attached
code).  Instead of extending the BNF, I just redefined
  symbol  ::= Re("[A-Za-z0-9_'*-][A-Za-z0-9_'*.-]*")
(note the "." in the second []).

Anyway, the grammar is attached for anyone wishing to take it
further.  Enjoy!

                    Andrew

-------------- next part --------------
# First pass at a parser for the location fields of a feature table.
# Everything likely to change.

# Based on the DDBJ/EMBL/GenBank Feature Table Definition Version 2.2
# Dec 15 1999 available from EBI, but the documentation is not
# completely internally consistent, much less in agreement with real-life
# examples.  Conflicts are resolved to agree with real examples.

# Uses John Aycock's SPARK for parsing
from spark import GenericScanner, GenericParser

# Location strings used as test input: the __main__ block at the bottom of
# this file scans and parses each one in turn and prints the result.
test_data = (
    "467",
    "23..400",
    "join(544..589,688..1032)",
    "1..1000",
    "<345..500",
    "<1..888",
    "(102.110)",
    "(23.45)..600",
    "(122.133)..(204.221)",
    "123^124",
    "145^177",
    "join(12..78,134..202)",
    "complement(join(2691..4571,4918..5163))",
    "join(complement(4918..5163),complement(2691..4571))",
    "complement(34..(122.126))",
    # The documentation's example is "J00194:(100..202)", but the BNF does
    # not allow the parentheses, so the BNF form is used here.
    "J00194:100..202",
    "1..1509",
    "<1..9",
    "join(10..567,789..1320)",
    "join(54..567,789..1254)",
    "10..567",
    "join(complement(<1..799),complement(5080..5120))",
    "complement(1697..2512)",
    "complement(4170..4829)",
    # The documentation's version of this example is missing the comma
    # between the two complement(...) terms; it has been added here.
    "join(complement(567..795),complement(21..349))",
    "join(2004..2195,3..20)",
    "<1..>336",
    "394..>402",

    # a few examples from hum1
    "join(AB001090.1:1669..1713)",
    "join(AB001090.1:1669..1713,AB001091.1:85..196)",
    "join(AB001090.1:1669..1713,AB001091.1:85..196,AB001092.1:40..248,AB001093.1:96..212,AB001094.1:71..223,AB001095.1:87..231,AB001096.1:33..211,AB001097.1:35..175,AB001098.1:213..395,AB001099.1:56..309,AB001100.1:54..196,AB001101.1:171..404,AB001102.1:160..378,210..217)",
    "join(9106..9239,9843..9993,11889..11960,16575..16650)",
    "join(<1..109,620..>674)",
    "join(AB003599.1:<61..315,AB003599.1:587..874,47..325,425..>556)",
    "join(<85..194,296..458,547..>653)",
    )

class Token:
    """A value-less scanner token identified only by its type name.

    type -- the token type name, e.g. "comma" or "double_dot".

    Comparison goes through ``__cmp__`` and looks only at ``type``, so a
    token compares equal to the plain string naming its type.
    """
    def __init__(self, type):
        self.type = type
    def __cmp__(self, other):
        # Compare the type name against whatever we are compared with
        # (presumably the grammar symbol names used by the parser).
        return cmp(self.type, other)
    def __repr__(self):
        # Report the real class name; this used to print "Tokens(...)",
        # unlike every other class here, whose repr uses its own name.
        return "Token(%r)" % (self.type,)

# "38"
class Integer:
    """Scanner token of type "integer" that carries a value in ``val``."""
    type = "integer"

    def __init__(self, val):
        self.val = val

    def __repr__(self):
        template = "Integer(%s)"
        return template % self.val

    def __str__(self):
        return str(self.val)

    def __cmp__(self, other):
        # Only the token type takes part in comparisons, never the value.
        return cmp(self.type, other)

# From the BNF definition, this isn't needed.  Does that mean
# that bases can be referred to with negative numbers?
class UnsignedInteger(Integer):
    """Integer token variant reported under the "unsigned_integer" type."""
    type = "unsigned_integer"

    def __repr__(self):
        value = self.val
        return "UnsignedInteger(%s)" % value

class Symbol:
    """Scanner token of type "symbol" that carries its text in ``name``."""
    type = "symbol"

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "Symbol(%s)" % (repr(self.name),)

    def __str__(self):
        return str(self.name)

    def __cmp__(self, other):
        # Only the token type takes part in comparisons, never the name.
        return cmp(self.type, other)

# ">38"  -- The BNF says ">" is for the lower bound.. seems wrong to me
class LowBound:
    """A position written with a leading ">" (e.g. ">38").

    base -- the position following the ">"; the parser passes the integer
            token it matched.
    """
    def __init__(self, base):
        self.base = base
    def __repr__(self):
        # Wrap in a 1-tuple so a tuple-valued base is shown as a single
        # value instead of being unpacked as several format arguments.
        return "LowBound(%r)" % (self.base,)

# "<38"
class HighBound:
    """A position written with a leading "<" (e.g. "<38").

    base -- the position following the "<"; the parser passes the integer
            token it matched.
    """
    def __init__(self, base):
        self.base = base
    def __repr__(self):
        # Wrap in a 1-tuple so a tuple-valued base is shown as a single
        # value instead of being unpacked as several format arguments.
        return "HighBound(%r)" % (self.base,)

# 12.34
class TwoBound:
    """Pair of positions joined by a single dot, e.g. "(12.34)"."""

    def __init__(self, low, high):
        self.low, self.high = low, high

    def __repr__(self):
        bounds = (self.low, self.high)
        return "TwoBound(%r, %r)" % bounds

# 12^34
class Between:
    """Pair of positions joined by a caret, e.g. "12^34"."""

    def __repr__(self):
        ends = (self.low, self.high)
        return "Between(%r, %r)" % ends

    def __init__(self, low, high):
        self.high = high
        self.low = low

# 12..34
class Range:
    """Pair of positions joined by a double dot, e.g. "12..34"."""

    def __init__(self, low, high):
        (self.low, self.high) = (low, high)

    def __repr__(self):
        low, high = self.low, self.high
        return "Range(%r, %r)" % (low, high)

class Function:
    """An operator name applied to a list of locations, e.g. join(...)."""

    def __init__(self, name, args):
        self.name, self.args = name, args

    def __repr__(self):
        parts = (self.name, self.args)
        return "Function(%r, %r)" % parts

class AbsoluteLocation:
    """A local location together with an optional path (None when absent)."""

    def __init__(self, path, local_location):
        self.local_location = local_location
        self.path = path

    def __repr__(self):
        fields = (self.path, self.local_location)
        return "AbsoluteLocation(%r, %r)" % fields

class Path:
    """An accession with an optional database name (None when absent)."""

    def __init__(self, database, accession):
        self.database, self.accession = database, accession

    def __repr__(self):
        database, accession = self.database, self.accession
        return "Path(%r, %r)" % (database, accession)

class FeatureName:
    """A feature label with an optional path (None when absent)."""

    def __repr__(self):
        fields = (self.path, self.label)
        return "FeatureName(%r, %r)" % fields

    def __init__(self, path, label):
        self.label = label
        self.path = path

class LocationScanner(GenericScanner):
    """Tokenizer for feature-table location strings.

    Each t_* method appends one token object to self.rv.  The string at the
    top of each t_* method is not documentation: SPARK's GenericScanner is
    expected to use it as the regular expression for that token, so it must
    not be edited as if it were a comment.
    NOTE(review): the patterns are padded with spaces, which presumably
    relies on GenericScanner compiling them in verbose mode -- confirm
    against the SPARK version in use.
    """
    def __init__(self):
        GenericScanner.__init__(self)

    def tokenize(self, input):
        # Start a fresh token list, let the base class run the t_* callbacks
        # (which fill self.rv), then hand the list back to the caller.
        self.rv = []
        GenericScanner.tokenize(self, input)
        return self.rv

    # "::"
    def t_double_colon(self, input):
        r" :: "
        self.rv.append(Token("double_colon"))
    # ".."
    def t_double_dot(self, input):
        r" \.\. "
        self.rv.append(Token("double_dot"))
    # A single "." -- the lookahead rejects the first dot of "..".
    def t_dot(self, input):
        r" \.(?!\.) "
        self.rv.append(Token("dot"))
    # "^"
    def t_caret(self, input):
        r" \^ "
        self.rv.append(Token("caret"))
    # ","
    def t_comma(self, input):
        r" \, "
        self.rv.append(Token("comma"))
    # Optionally signed run of digits; the text is converted with int().
    def t_integer(self, input):
        r" -?[0-9]+ "
        self.rv.append(Integer(int(input)))
    # Unsigned run of digits.
    # NOTE(review): everything this pattern matches is also matched by
    # t_integer and by t_symbol; which rule wins depends on how
    # GenericScanner orders the patterns -- confirm whether this rule can
    # ever fire.
    def t_unsigned_integer(self, input):
        r" [0-9]+ "
        self.rv.append(UnsignedInteger(int(input)))
    # A single ":" -- the lookahead rejects the first colon of "::".
    def t_colon(self, input):
        r" :(?!:) "
        self.rv.append(Token("colon"))
    # "("
    def t_open_paren(self, input):
        r" \( "
        self.rv.append(Token("open_paren"))
    # ")"
    def t_close_paren(self, input):
        r" \) "
        self.rv.append(Token("close_paren"))
    # Names such as "join" or "AB001090.1"; the matched text is kept as-is.
    def t_symbol(self, input):
        r" [A-Za-z0-9_'*-][A-Za-z0-9_'*.-]* "
        # Needed an extra '.' (in the second character class) so that
        # names like "AB001090.1" scan as one symbol.
        self.rv.append(Symbol(input))
    # "<"
    def t_less_than(self, input):
        r" < "
        self.rv.append(Token("less_than"))
    # ">"
    def t_greater_than(self, input):
        r" > "
        self.rv.append(Token("greater_than"))

# punctuation .. hmm, isn't needed for location
#        r''' [ !#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~] '''

class LocationParser(GenericParser):
    """Parser that turns LocationScanner tokens into location objects.

    The string at the top of each p_* method is not documentation: it holds
    the grammar rule(s) the method handles, which SPARK's GenericParser
    reads.  Each method receives ``args``, the values matched for the
    right-hand side of the rule, and returns the value for the left-hand
    side.
    """
    def __init__(self, start='location'):
        # start -- name of the grammar's start symbol.
        GenericParser.__init__(self, start)
        self.begin_pos = 0

    # A location is passed through unchanged from whichever form matched.
    def p_location(self, args):
        """
        location ::= absolute_location
        location ::= feature_name
        location ::= function
        """
        return args[0]

    # name(loc, loc, ...) -> Function(name string, list of locations)
    def p_function(self, args):
        """
        function ::= functional_operator open_paren location_list close_paren
        """
        # args[0] is the Symbol token returned by p_functional_operator.
        return Function(args[0].name, args[2])

    # [path ":"] local_location -> AbsoluteLocation(path or None, location)
    def p_absolute_location(self, args):
        """
        absolute_location ::= local_location
        absolute_location ::= path colon local_location
        """
        if len(args) == 1:
            return AbsoluteLocation(None, args[-1])
        return AbsoluteLocation(args[0], args[-1])

    # [database "::"] accession -> Path(database or None, accession)
    def p_path(self, args):
        """
        path ::= database double_colon primary_accession
        path ::= primary_accession
        """
        if len(args) == 3:
            return Path(args[0], args[2])
        return Path(None, args[0])

    # [path ":"] label -> FeatureName(path or None, label)
    def p_feature_name(self, args):
        """
        feature_name ::= path colon feature_label
        feature_name ::= feature_label
        """
        if len(args) == 3:
            return FeatureName(args[0], args[2])
        return FeatureName(None, args[0])

    # The left-hand side must be "feature_label", the name p_feature_name
    # refers to.  It used to be "label", which left feature_label without
    # any rule, so no feature_name could ever be parsed.
    def p_feature_label(self, args):
        """
        feature_label ::= symbol
        """
        return args[0].name

    # A local location is passed through unchanged.
    def p_local_location(self, args):
        """
        local_location ::= base_position
        local_location ::= between_position
        local_location ::= base_range
        """
        return args[0]

    # Comma-separated locations accumulate into a Python list.
    def p_location_list(self, args):
        """
        location_list ::= location
        location_list ::= location_list comma location
        """
        if len(args) == 1:
            # Build a fresh one-element list rather than handing back the
            # parser's own argument sequence.
            return [args[0]]
        return args[0] + [args[2]]

    # Returns the Symbol token itself; p_function extracts its .name.
    def p_functional_operator(self, args):
        """
        functional_operator ::= symbol
        """
        return args[0]

    # A base position is passed through unchanged.
    def p_base_position(self, args):
        """
        base_position ::= integer
        base_position ::= low_base_bound
        base_position ::= high_base_bound
        base_position ::= two_base_bound
        """
        return args[0]

    # ">" integer -> LowBound(integer token)
    def p_low_base_bound(self, args):
        """
        low_base_bound ::= greater_than integer
        """
        return LowBound(args[1])

    # "<" integer -> HighBound(integer token)
    def p_high_base_bound(self, args):
        """
        high_base_bound ::= less_than integer
        """
        return HighBound(args[1])

    # "(" a "." b ")" -> TwoBound(a, b)
    def p_two_base_bound(self, args):
        """
        two_base_bound ::= open_paren base_position dot base_position close_paren
        """
        # main example doesn't have parens but others do.. (?)
        return TwoBound(args[1], args[3])

    # a "^" b -> Between(a, b)
    def p_between_position(self, args):
        """
        between_position ::= base_position caret base_position
        """
        return Between(args[0], args[2])

    # a ".." b -> Range(a, b)
    def p_base_range(self, args):
        """
        base_range ::= base_position double_dot base_position
        """
        return Range(args[0], args[2])

    # Database name as a plain string.
    def p_database(self, args):
        """
        database ::= symbol
        """
        return args[0].name

    # Accession as a plain string.
    def p_primary_accession(self, args):
        """
        primary_accession ::= symbol
        """
        return args[0].name

def scan(input):
    """Tokenize ``input`` with a fresh LocationScanner; return its tokens."""
    return LocationScanner().tokenize(input)

def parse(tokens):
    """Run a fresh LocationParser over ``tokens`` and return its result."""
    location_parser = LocationParser()
    result = location_parser.parse(tokens)
    return result

if __name__ == "__main__":
    # Smoke test: scan and parse every sample location and show the result.
    # Each print is a single-argument call, so the output is identical
    # whether print is a statement (Python 2) or a function (Python 3).
    for s in test_data:
        print("--> Trying %s" % s)
        print(repr(parse(scan(s))))