# Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk)
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

# Modifications Oct/Nov 2009 by Paul T. Bathen to match the MMCIFlex.py
# module, which is written completely in Python. Includes 'peek_token' to do a
# 1-ahead token lookup from MMCIFlex; modifications to the __main__ method; and
# a potential correction to how multiple semicolon lines were originally
# handled: in the previous version, if each line was passed in as its own
# token and value, it wouldn't have worked, I don't think.

import os.path
from UserDict import UserDict
import Bio.PDB.mmCIF.MMCIFlex
import warnings

# One-line module summary, published through the module's __doc__ attribute.
__doc__ = "Turn an mmCIF file into a dictionary."

class MMCIF2Dict(UserDict):
    """
    Dictionary-like view of the name/data pairs found in an mmCIF (.cif) file.

    Original version (c) Thomas Hamelryck (thamelry@binf.ku.dk)
    Modifications made Nov 2009 by Paul T. Bathen

    The file is tokenized by MMCIFlex, which is written totally in python with
    no lex/flex/C support needed. This class keeps an object handle to the
    lexer and relies on its 1-ahead 'peek_token' to decide whether the next
    token should be consumed.

    Entries produced while parsing:
      * DATA token (the data_XXXX line): the first five characters of the
        token value become the key, the remainder becomes the value;
      * simple name-data pair: name -> string;
      * names declared after a loop_ token: name -> list of strings, filled
        round-robin from the data tokens that follow the names;
      * tokens that cannot be attached to a name are appended to the list
        stored under the key None.

    Malformed input is reported with RuntimeWarning (ERROR1/ERROR2/ERROR3)
    rather than by raising.

    This class is used by MMCIFParser. However, there is a __main__ method
    available for testing. A typical user test to see all the keys generated
    might be:

    mmcif_dict=MMCIF2Dict(path_and_filename_to_cif_file)
    for key in mmcif_dict.keys():
        print key

    """
    # The token identifiers
    RUN_GET_TOKEN_FIRST = -2
    BLOCK = -1
    END_OF_FILE = 0
    NAME = 1
    LOOP = 2
    DATA = 3
    # Every identifier above 3 is a data token.
    SEMICOLONS = 4
    DOUBLEQUOTED = 5
    QUOTED = 6
    SIMPLE = 7

    def __init__(self, path_and_filename):
        """
        Parse the .cif file at path_and_filename into this dictionary.

        Raises IOError when path_and_filename is not an existing file.
        """
        # this dict will contain the name/data pairs
        self.data = {}
        # entry for garbage
        self.data[None] = []
        if not os.path.isfile(path_and_filename):
            raise IOError("File not found.")
        self.mmciflex = Bio.PDB.mmCIF.MMCIFlex.MMCIFlex()
        self.mmciflex.open_file(path_and_filename)
        try:
            self._make_mmcif_dict()
        finally:
            # Release the file even when parsing fails part-way through.
            self.mmciflex.close_file()

    def _make_mmcif_dict(self):
        """
        Fill self.data from the token stream of self.mmciflex.

        Each call to the lexer's get_token() returns a triplet:
           1. the token identifier (see the class constants; comment lines
              are reported as BLOCK),
           2. the value associated with the token,
           3. the position of the token in the line it was read from (not
              used here).
        Parsing stops at the first END_OF_FILE token.
        """
        get_token = self.mmciflex.get_token
        # Set by a LOOP token; consumed by the next NAME token.
        in_loop = False
        token, value, _ = get_token()
        # loop until EOF (token==0)
        while token:
            if token == self.DATA:
                # We just read the data_XXXX line
                self.data[value[0:5]] = value[5:]
                token = None
            elif token == self.LOOP:
                # A loop_ line: the list of names follows in the next tokens
                in_loop = True
                token = None
            elif token == self.BLOCK:
                # Comment line, ie, #
                token = None
            elif token == self.NAME:
                if in_loop:
                    in_loop = False
                    token, value = self._read_loop(value)
                else:
                    token, value = self._read_pair(value)
            else:
                # we found some complete garbage
                warnings.warn("ERROR3: broken name-data pair "
                              "(missing name)!%s %s" % (token, value),
                              RuntimeWarning)
                self.data[None].append(value)
                token = None

            # None means "everything read so far has been dealt with".
            if token is None:
                token, value, _ = get_token()

    def _read_loop(self, first_name):
        """
        Read the remaining names of a loop and all of its data tokens.

        first_name is the value of the NAME token that was already consumed.
        Returns a (token, value) pair for the caller to continue with: token
        is None when the next token still has to be fetched, otherwise it is
        a non-data token that was read while looking for loop data and must
        be processed by the caller instead of being dropped.
        """
        lexer = self.mmciflex
        # create a list for each name encountered in the loop
        columns = []
        name = first_name
        while True:
            column = self.data[name] = []
            columns.append(column)
            if lexer.peek_token() != self.NAME:
                break
            _, name, _ = lexer.get_token()

        nr_fields = len(columns)
        # nr of data items parsed
        data_counter = 0
        # index of the column that received the last data item
        pos = 0
        token, value, _ = lexer.get_token()
        # Explicit None test: comparing None with an int must not be relied on.
        while token is not None and token > 3:
            if token == self.SEMICOLONS:
                value = self._join_semicolon_lines(value, strip=False)
            pos = data_counter % nr_fields
            data_counter += 1
            columns[pos].append(value)
            if lexer.peek_token() > 3:
                token, value, _ = lexer.get_token()
            else:
                token = None
        if pos != nr_fields - 1:
            warnings.warn("ERROR1: broken name-data pair "
                          "(data missing)!%s %s" % (token, value), RuntimeWarning)
        # token is None once any data was consumed; otherwise it is the first
        # non-data token after the names, handed back so it is not lost.
        return token, value

    def _read_pair(self, name):
        """
        Read the data token belonging to a simple (non-loop) name.

        Returns (None, name) after storing the pair. When the next token is
        not a data token a warning is issued, nothing is stored for name, and
        that token is returned so the caller continues with it.
        """
        next_token, data, _ = self.mmciflex.get_token()
        if next_token < 4:
            warnings.warn("ERROR2: broken name-data pair "
                          "(name-non data pair)! "
                          "next_token:%s data:%s. There will be no data for token:%s value:%s" \
                          % (next_token, data, self.NAME, name), RuntimeWarning)
            # Use this token in the next round... continue where we left off.
            return next_token, data
        if next_token == self.SEMICOLONS:
            data = self._join_semicolon_lines(data, strip=True)
        self.data[name] = data
        return None, name

    def _join_semicolon_lines(self, first_line, strip):
        """
        Consume the SEMICOLONS tokens that follow and join them with spaces.

        first_line is the value of the SEMICOLONS token already read; it is
        used unchanged. When strip is true each following line is stripped
        before use. Empty lines are skipped.
        """
        # NOTE(review): every directly following SEMICOLONS token is folded
        # into this value, so two adjacent semicolon-delimited fields would be
        # merged -- confirm how MMCIFlex separates them.
        lexer = self.mmciflex
        text = first_line
        while lexer.peek_token() == self.SEMICOLONS:
            _, line, _ = lexer.get_token()
            if strip:
                line = line.strip()
            if line:
                text = " ".join([text, line])
        return text

    def __getitem__(self, key):
        """Return the value stored for key; raises KeyError when missing."""
        return self.data[key]


if __name__ == "__main__":
    # Interactive test driver: parse one .cif file, then let the user browse
    # the resulting dictionary from the console.

    import sys

    def _show_entry(key, value):
        # Values are either a list (printed item by item) or a single value.
        if isinstance(value, list):
            for item in value:
                print("\nkey=%s\nitem=%s" % (key, item))
        else:
            print("\nkey=%s\nvalue=%s" % (key, value))

    if len(sys.argv) == 2:
        filename = sys.argv[1]
    else:
        # Wrong number of arguments: show the usage, then ask for the file.
        print("Usage: python MMCIF2Dict.py filename")
        filename = ""
        # Keep asking until something was actually typed.
        while not filename:
            filename = raw_input("Enter the .cif filename to process: ")

    mmcif_dict = MMCIF2Dict(filename)
    print("File tokenized and processed. Continue.\n")
    while True:
        print("\n\nEnter 'q' to end,'k' to get list of keys,'a' to list all key/value pairs,")
        print("or enter a key to get it's value.")
        print("An example key entry might be ==>_citation.title")
        entry = raw_input("Your entry ==> ")
        if entry == "q":
            sys.exit()
        elif entry == "k":
            for key in mmcif_dict.keys():
                print(key)
        elif entry == "a":
            for key in mmcif_dict.keys():
                _show_entry(key, mmcif_dict[key])
        else:
            # Only the lookup can raise KeyError, so keep the try body minimal.
            try:
                value = mmcif_dict[entry]
            except KeyError:
                print("No such key found.")
            else:
                _show_entry(entry, value)