import string
import shlex

class MMCIFlex:
    """
    Pure-Python, line-oriented tokenizer for mmCIF (.cif) files.

    Paul T. Bathen, Nov 2009
    Returns tokens to the calling object (usually MMCIF2Dict) with each call to get_token().
    Based entirely on python, no flex support needed.

    Each call to get_token() implements this pseudo code (implemented quite differently)
        while readline() is not EOF
            return any semicolon lines with the SEMICOLONS token.
            use shlex.split to split this current line into tokens
            while there are tokens left:
                process a token for each call to get_token

    Typical usage: (note: create the object, THEN pass in the filename. This is different than
    the C/flex version of MMCIFlex.py)
        p = MMCIFlex()
        p.open_file(fn)
        while 1:
            (token, value, token_num_in_line) = p.get_token()
            print("%s\t%s\t%s" % (token, token_num_in_line, value))
            peeked_token = p.peek_token() #has no side effect.
            if token == 0:
                break #end of file
        p.close_file()

    Also supports a call to peek_token() to get the 1-ahead token, if any.

    Notes:
        - Returns all semi-colon lines without the leading semi-colon as a complete line. So
            ;asdlkjasldjalsdjalsdj "asdasd" asdasdasd 'asdasd'a asdasd
          would be returned as one single value:
            asdlkjasldjalsdjalsdj "asdasd" asdasdasd 'asdasd'a asdasd
        - Returns the '#' lines as BLOCK tokens, which are discarded in MMCIFDict
        - The call to get_token() returns THREE values as opposed to the C version's two. The third
          value token_num_in_line represents the 0-based index into what token this was in a given
          readline.
        - Currently the program strips any whitespace in the returned value.
        - Blank and whitespace-only lines are skipped; END_OF_FILE is only returned once
          readline() reports the real end of the file.
        - Lines are split with shlex.split, which removes the surrounding quote characters and
          raises ValueError on a line containing an unbalanced quote. That exception is not
          caught here and propagates to the caller.

    Token values are available both on the class and on every instance:
        RUN_GET_TOKEN_FIRST = -2
        BLOCK = -1
        END_OF_FILE = 0
        NAME = 1
        LOOP = 2
        DATA = 3
        SEMICOLONS = 4
        DOUBLEQUOTED = 5
        QUOTED = 6
        SIMPLE = 7
    """

    RUN_GET_TOKEN_FIRST = -2
    BLOCK = -1
    END_OF_FILE = 0
    NAME = 1
    LOOP = 2
    DATA = 3
    SEMICOLONS = 4
    DOUBLEQUOTED = 5
    QUOTED = 6
    SIMPLE = 7

    def __init__(self):
        self.path_and_filename = ""
        self.file_handle = None
        self._reset_state()

    def _reset_state(self):
        """
        Puts the tokenizer back into its 'nothing read yet' state. Called from __init__ and after
        every successful open_file() so one object can tokenize several files in a row without
        handing out a token left over from the previous file.
        """
        self.should_read_a_line = True
        self.need_to_run_first_time = True
        self.num_of_tokens = 0
        self.tokens_read = 0
        self.line = ""
        self.line_as_list = []
        self.token = self.RUN_GET_TOKEN_FIRST
        self.value = ""
        self.token_num_in_line = 0
        self.peeked_token = self.RUN_GET_TOKEN_FIRST
        self.peeked_value = ""
        self.peeked_token_num_in_line = 0

    def open_file(self, path_and_filename):
        """
        Opens the given file for tokenizing.

        @param path_and_filename: path of the .cif file to open
        @type path_and_filename: string

        @return: path_and_filename on success. On failure (the file can't be opened, or this
                 object already has a file open) an error is printed and "" is returned.
        @rtype: string
        """
        if self.file_handle is not None:
            print("Error. This object already had this file opened: %s" % self.path_and_filename)
            return ""
        try:
            self.file_handle = open(path_and_filename, 'rb')
        except IOError:
            self.path_and_filename = ""
            self.file_handle = None
            print("Error. Can't open file %s" % path_and_filename)
            return ""
        self.path_and_filename = path_and_filename
        self._reset_state()
        return path_and_filename

    def close_file(self):
        """
        Closes the current file, if any. A failure to close (including 'no file was open') is
        reported with a printed message rather than raised. The object always ends up with no
        file attached, so open_file() can be called again.
        """
        try:
            self.file_handle.close()
        except (AttributeError, IOError, OSError):
            # AttributeError covers the 'never opened' case where file_handle is None.
            print("Unable to close file %s" % self.path_and_filename)
        finally:
            self.path_and_filename = ""
            self.file_handle = None

    def get_token(self):
        """
        Returns the next token triplet. Has the side effect of determining and retaining the 1-ahead token
        as the 'peeked' token. Calls the private method _get_token to do these two things.

        Returns a triplet:
        @return: self.token
        @rtype integer        NAME, DATA LOOP, etc

        @return self.value    ie, "data_XXXX" or "_citation.title"
        @rtype string

        @return self.token_num_in_line   if this is the first token in a readline returns 0, etc
        @rtype integer

        Once END_OF_FILE has been returned, further calls keep returning (END_OF_FILE, "", 0).
        """
        if self.need_to_run_first_time:
            (self.token, self.value, self.token_num_in_line) = self._get_token()
        else:
            # The token that was peeked last time becomes the current one.
            self.token = self.peeked_token
            self.value = self.peeked_value
            self.token_num_in_line = self.peeked_token_num_in_line

        if self.token != self.END_OF_FILE:
            (self.peeked_token, self.peeked_value, self.peeked_token_num_in_line) = self._get_token()
        else:
            # Nothing left to look ahead to; don't touch the file again.
            self.peeked_token = self.END_OF_FILE
            self.peeked_value = ""
            self.peeked_token_num_in_line = 0
        self.need_to_run_first_time = False
        return (self.token, self.value, self.token_num_in_line)

    def peek_token(self):
        """
        Returns a 'look ahead' to return the single integer value of the next token to be parsed.
        Note: Repeated calls to peek_token without a get_token() call in between will NOT peek further
        than one token ahead! :-)

        Returns a single integer. If you have not called get_token yet, will return RUN_GET_TOKEN_FIRST.
        Otherwise, it will return the integer for NAME, DATA, LOOP, etc

        @return: value of token
        @rtype: integer
        """
        if self.need_to_run_first_time:
            return self.RUN_GET_TOKEN_FIRST
        return self.peeked_token

    def _get_token(self):
        """
        Private method, but here the 'guts' are. Return the triplet of:
            1. The token number (1 for NAME, 2 for LOOP, etc)
            2. The value. For example, if the token is DATA, the value will be 'data_XXXX'.
            3. This version of MMCIFlex parses files line-by-line. This last member of the triplet returned
            is the token number in this line. So for example any lines that are simple key/value pairs would
            have a NAME token in parsed position 0, and a value in token 1.
        Semi-colon lines are stripped and returned in whole, less the ';'. This includes the final ';' line.
        Lines that yield no tokens (blank or whitespace only) are skipped. (END_OF_FILE, "", 0) is
        returned only when readline() returns an empty string, and keeps being returned on later calls.
        """
        if self.should_read_a_line:
            while True:
                line = self.file_handle.readline()
                # The file is opened 'rb': on Python 3 that yields bytes, on Python 2 a plain str.
                if isinstance(line, bytes) and not isinstance(line, str):
                    line = line.decode("utf-8", "replace")
                self.line = line
                self.tokens_read = 0
                if not line:
                    # Real end of file. should_read_a_line stays True so that another call
                    # simply hits EOF again instead of indexing an empty token list.
                    self.line_as_list = []
                    self.num_of_tokens = 0
                    return self.END_OF_FILE, "", 0
                if line[0] == ";": #shlex.split would turn this line into a list... yuck.
                    self.num_of_tokens = -1
                    #return each semi-colon line, even the last 'empty' one. Don't return the ';' though.
                    return self.SEMICOLONS, line[1:].strip(), 0
                self.line_as_list = shlex.split(line)
                self.num_of_tokens = len(self.line_as_list)
                if self.num_of_tokens:
                    break
                # No tokens on this line: keep reading rather than reporting a false EOF.
            self.should_read_a_line = False
        token_num, value = self._get_token_num_and_value(self.line_as_list[self.tokens_read], self.tokens_read)
        self.tokens_read += 1
        if self.tokens_read >= self.num_of_tokens:
            self.should_read_a_line = True
        return token_num, value, self.tokens_read-1

    def _get_token_num_and_value(self, list_element, tokens_read):
        """
        Returns the correct token depending on what literal is parsed.
        tokens_read refers to the number of tokens read so far in this current line!!

        Only the first token of a line (tokens_read == 0) is classified by its prefix; every later
        token on the line is SIMPLE. An empty first token (a line starting with "") is SIMPLE too.

        @return: (token number, list_element unchanged)
        """
        token_num = self.SIMPLE
        if tokens_read == 0:
            # The order of these tests matters and mirrors the original if/elif chain.
            # startswith is used so that an empty string cannot raise IndexError.
            if list_element.startswith("data_"):
                token_num = self.DATA
            elif list_element.startswith(";"):
                token_num = self.SEMICOLONS
            elif list_element.startswith("_"):
                token_num = self.NAME
            elif list_element.startswith("loop_"):
                token_num = self.LOOP
            elif list_element.startswith("\""):
                token_num = self.DOUBLEQUOTED
            elif list_element.startswith("'"):
                token_num = self.QUOTED
            elif list_element.startswith("#"):
                token_num = self.BLOCK
        return (token_num, list_element)

if __name__=="__main__":
    # Small command-line driver: dumps every token of a .cif file, one per line, together with
    # its position in the source line and the token type that will come next.
    import sys

    # raw_input() was renamed to input() in Python 3; use whichever one exists.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    if len(sys.argv) == 2:
        filename = sys.argv[1]
    else:
        # No (or too many) arguments: show the usage hint, then fall back to prompting.
        print("Usage: python MMCIFlex.py filename")
        filename = ""
        while not filename:
            filename = read_line("Enter a path and filename to .cif file with VALID, COMPLETE data: ").strip()

    p = MMCIFlex()
    # open_file() prints its own error message and returns "" on failure.
    if not p.open_file(filename):
        sys.exit(1)
    try:
        print("%s\t%s\t%s\t%s" % ("", "token", "peeked", ""))
        print("%s\t%s\t%s\t%s" % ("token", "number", "token", "value"))
        while 1:
            (token, value, token_num_in_line) = p.get_token()
            peeked_token = p.peek_token()
            print("%s\t%s\t%s\t%s" % (token, token_num_in_line, peeked_token, value))
            if token == 0:
                break
    finally:
        # Close the file even if tokenizing raised (e.g. shlex on an unbalanced quote).
        p.close_file()