import string
import shlex


class MMCIFlex:
    """Pure-Python, line-oriented tokenizer for mmCIF (.cif) files.

    Paul T. Bathen, Nov 2009

    Returns tokens to the calling object (usually MMCIF2Dict) with each call
    to get_token().  Based entirely on Python; no flex support is needed.

    Each call to get_token() implements this pseudo code (implemented quite
    differently):

        while readline() is not EOF:
            return any semicolon lines with the SEMICOLONS token.
            use shlex.split to split the current line into tokens
            while there are tokens left:
                process one token for each call to get_token

    Typical usage (note: create the object, THEN pass in the filename; this
    is different from the C/flex version of MMCIFlex.py):

        p = MMCIFlex()
        p.open_file(fn)
        while 1:
            (token, value, token_num_in_line) = p.get_token()
            print("%s\\t%s\\t%s" % (token, token_num_in_line, value))
            peeked_token = p.peek_token()   # has no side effect
            if token == 0:
                break                       # end of file
        p.close_file()

    Also supports a call to peek_token() to get the 1-ahead token, if any.

    Notes:
      - A line whose first character is ';' is returned as ONE value, without
        the leading semicolon.  So
            ;asdlkjasldjalsdjalsdj "asdasd" asdasdasd 'asdasd'a asdasd
        is returned as the single value
            asdlkjasldjalsdjalsdj "asdasd" asdasdasd 'asdasd'a asdasd
      - A line-leading token that starts with '#' is returned as a BLOCK
        token (discarded in MMCIFDict).
      - get_token() returns THREE values as opposed to the C version's two.
        The third value, token_num_in_line, is the 0-based index of the token
        within the line it was read from.
      - Returned values are whitespace-stripped.
      - Blank / whitespace-only lines are skipped; END_OF_FILE is reported
        only when the underlying file is really exhausted.
      - Lines are split with shlex.split, so a line with an unbalanced quote
        raises ValueError from shlex.

    Token values are:
        RUN_GET_TOKEN_FIRST = -2
        BLOCK = -1
        END_OF_FILE = 0
        NAME = 1
        LOOP = 2
        DATA = 3
        SEMICOLONS = 4
        DOUBLEQUOTED = 5
        QUOTED = 6
        SIMPLE = 7
    """

    # Token codes.  Kept as class attributes so they are reachable both as
    # self.NAME (as before) and as MMCIFlex.NAME.
    RUN_GET_TOKEN_FIRST = -2
    BLOCK = -1
    END_OF_FILE = 0
    NAME = 1
    LOOP = 2
    DATA = 3
    SEMICOLONS = 4
    DOUBLEQUOTED = 5
    QUOTED = 6
    SIMPLE = 7

    def __init__(self):
        """Create a tokenizer with no file attached; call open_file() next."""
        self.path_and_filename = ""
        self.file_handle = None
        self._reset_state()

    def _reset_state(self):
        """Put the tokenizer back into its 'nothing read yet' state."""
        self.should_read_a_line = True
        self.need_to_run_first_time = True
        self.num_of_tokens = 0
        self.tokens_read = 1

    def open_file(self, path_and_filename):
        """Open the file to be tokenized.

        Prints an error and returns "" when this object already has a file
        open or when the file cannot be opened.

        @param path_and_filename: path of the .cif file
        @return: path_and_filename on success, "" on failure
        @rtype: string
        """
        if self.file_handle is not None:
            print("Error. This object already had this file opened: %s" % self.path_and_filename)
            return ""
        try:
            # Text mode: the tokenizer compares against str literals and
            # feeds the line to shlex, neither of which works on bytes.
            self.file_handle = open(path_and_filename, 'r')
        except IOError:
            self.path_and_filename = ""
            self.file_handle = None
            print("Error. Can't open file %s" % path_and_filename)
            return ""
        # Forget any look-ahead left over from a previously parsed file so
        # the object can be reused after close_file().
        self._reset_state()
        self.path_and_filename = path_and_filename
        return path_and_filename

    def close_file(self):
        """Close the current file (if any) and detach it from this object.

        A failure to close is reported on stdout rather than raised; the
        object is always left with no file attached.
        """
        try:
            self.file_handle.close()
        except (AttributeError, EnvironmentError):
            # AttributeError: no file was open (file_handle is None).
            print("Unable to close file %s" % self.path_and_filename)
        finally:
            self.path_and_filename = ""
            self.file_handle = None

    def get_token(self):
        """Return the next token triplet.

        Has the side effect of determining and retaining the 1-ahead token
        as the 'peeked' token (see peek_token()).  Once END_OF_FILE has been
        returned, every further call returns END_OF_FILE again.

        @return: (token, value, token_num_in_line) where
                 token is the integer code (NAME, DATA, LOOP, etc),
                 value is the string, e.g. "data_XXXX" or "_citation.title",
                 token_num_in_line is 0 for the first token of a line, etc.
        @rtype: tuple
        """
        if self.need_to_run_first_time:
            # Prime the look-ahead; it is promoted to 'current' just below.
            (self.peeked_token,
             self.peeked_value,
             self.peeked_token_num_in_line) = self._get_token()
            self.need_to_run_first_time = False

        self.token = self.peeked_token
        self.value = self.peeked_value
        self.token_num_in_line = self.peeked_token_num_in_line

        # Only read ahead while there is something left to read.
        if self.token != self.END_OF_FILE:
            (self.peeked_token,
             self.peeked_value,
             self.peeked_token_num_in_line) = self._get_token()

        return (self.token, self.value, self.token_num_in_line)

    def peek_token(self):
        """Return the integer code of the next token without consuming it.

        Repeated calls without a get_token() call in between will NOT peek
        further than one token ahead.

        @return: RUN_GET_TOKEN_FIRST if get_token() has not been called yet,
                 otherwise the code (NAME, DATA, LOOP, etc) of the next token
        @rtype: integer
        """
        if self.need_to_run_first_time:
            return self.RUN_GET_TOKEN_FIRST
        return self.peeked_token

    def _get_token(self):
        """Read one raw token from the file (private; the 'guts').

        Returns the triplet:
          1. The token code (1 for NAME, 2 for LOOP, etc).
          2. The value.  For a DATA token this is e.g. 'data_XXXX'.
          3. The 0-based position of the token within its line.  A simple
             key/value line yields a NAME token at position 0 and its value
             at position 1.

        A line whose first character is ';' is returned whole (stripped,
        without the ';') as a SEMICOLONS token; this includes a closing
        line consisting only of ';', which yields an empty value.
        """
        if self.should_read_a_line:
            while True:
                self.line = self.file_handle.readline()
                self.tokens_read = 0
                if not self.line:
                    # readline() returns "" only at real end of file.  Leave
                    # should_read_a_line set so further calls report EOF
                    # again instead of indexing a stale token list.
                    self.num_of_tokens = 0
                    return self.END_OF_FILE, "", 0
                if self.line[0] == ";":
                    # shlex.split would chop this line into pieces; hand it
                    # back whole instead (even the final 'empty' one).
                    self.num_of_tokens = -1
                    return self.SEMICOLONS, self.line[1:].strip(), 0
                self.line_as_list = shlex.split(self.line)
                if self.line_as_list:
                    break
                # Blank / whitespace-only line: not EOF, just skip it.
            self.num_of_tokens = len(self.line_as_list)
            self.should_read_a_line = False

        index = self.tokens_read
        token_num, value = self._get_token_num_and_value(self.line_as_list[index], index)
        self.tokens_read = index + 1
        if self.tokens_read >= self.num_of_tokens:
            # Line exhausted; the next call must fetch a new one.
            self.should_read_a_line = True
        return token_num, value, index

    def _get_token_num_and_value(self, list_element, tokens_read):
        """Classify one literal and return (token code, literal).

        Only the first token of a line (tokens_read == 0) is classified by
        its prefix; every later token on the line is SIMPLE.

        @param list_element: the literal produced by shlex.split
        @param tokens_read: number of tokens already read from this line
        @return: (token code, list_element)
        @rtype: tuple
        """
        if tokens_read != 0:
            # Any other situations which can occur?
            return (self.SIMPLE, list_element)

        # Checked in order; startswith() is also safe for an empty literal
        # (shlex yields '' for an empty quoted string).
        prefixes = (
            ("data_", self.DATA),
            (";", self.SEMICOLONS),
            ("_", self.NAME),
            ("loop_", self.LOOP),
            ("\"", self.DOUBLEQUOTED),
            ("'", self.QUOTED),
            ("#", self.BLOCK),
        )
        for prefix, token_num in prefixes:
            if list_element.startswith(prefix):
                return (token_num, list_element)
        return (self.SIMPLE, list_element)


def _main():
    """Command-line driver: dump every token of a .cif file to stdout."""
    import sys

    if len(sys.argv) == 2:
        filename = sys.argv[1]
    else:
        print("Usage: python MMCIFlex.py filename")
        try:
            read_line = raw_input   # Python 2
        except NameError:
            read_line = input       # Python 3
        filename = read_line("Enter a path and filename to .cif file with VALID, COMPLETE data: ")

    p = MMCIFlex()
    if not p.open_file(filename):
        # open_file() has already printed the reason.
        sys.exit(1)

    print("%s\t%s\t%s\t%s" % ("", "token", "peeked", ""))
    print("%s\t%s\t%s\t%s" % ("token", "number", "token", "value"))
    while 1:
        (token, value, token_num_in_line) = p.get_token()
        peeked_token = p.peek_token()
        print("%s\t%s\t%s\t%s" % (token, token_num_in_line, peeked_token, value))
        if token == 0:
            break
    p.close_file()


if __name__ == "__main__":
    _main()