[BioPython] parse IPI data with Biopython's SwissProt parser

Wolfgang Schueler wolfgang@proceryon.at
Tue, 18 Dec 2001 10:21:31 +0100


Hi all,

the IPI database at EBI contains proteins from the human genome
from SWISS-PROT, TrEMBL, RefSeq and Ensembl and is available in
SWISS-PROT format.
Nevertheless, there are minor differences from real SWISS-PROT data which
prevent the use of the SWISS-PROT parser of Biopython 1.00.a3.

The following modifications of Sprot.py allowed the parsing of the
IPI data (for details on IPI see http://www.ebi.ac.uk/IPI/IPIhelp.html).

Maybe it is helpful for someone.
Wolfgang




# ws: changes in _RecordConsumer.date()            for IPI
#                _RecordConsumer.identification()  for IPI
#                _Scanner._scan_reference()        crashing SwissProt entry
#                _Scanner._scan_dt()               for IPI
#                _Scanner._scan_de()               for IPI

     def _scan_dt(self, uhandle, consumer):
         """Scan the DT (date) lines of an entry.

         Issues two ``_scan_line`` calls for the 'DT' line type, both with
         ``consumer.date`` as the callback: the first with ``exactly_one=1``,
         the second with ``one_or_more=1``.
         """
         self._scan_line('DT', uhandle, consumer.date, exactly_one=1)
         # ws:2001-12-05: IPI does not use 'last annotation update', so the
         # remaining DT lines are scanned with one_or_more=1.  This replaces
         # the former two separate exactly_one=1 scans.
         self._scan_line('DT', uhandle, consumer.date, one_or_more=1)


     def _scan_de(self, uhandle, consumer):
         """Scan the DE (description) lines of an entry.

         Delegates to ``_scan_line`` for the 'DE' line type with
         ``consumer.description`` as the callback and ``any_number=1``.
         """
         # ws:2001-12-05: IPI entry IPI00029727.2 has no DE line at all, so
         # the scan uses any_number=1 (it was one_or_more before).
         on_description = consumer.description
         self._scan_line('DE', uhandle, on_description, any_number=1)
     def _scan_reference(self, uhandle, consumer):
         """Scan consecutive reference blocks.

         As long as the next line peeked from ``uhandle`` starts with 'RN',
         the per-line-type scanners are run in a fixed order, each called
         with ``(uhandle, consumer)``.
         """
         # ws:2001-12-05: an entry exists with RL before RA, so _scan_rl is
         # run once ahead of _scan_ra and once more after _scan_rt.
         scanner_names = (
             '_scan_rn',
             '_scan_rp',
             '_scan_rc',
             '_scan_rx',
             '_scan_rl',
             '_scan_ra',
             '_scan_rt',
             '_scan_rl',
         )
         while safe_peekline(uhandle)[:2] == 'RN':
             for scanner_name in scanner_names:
                 getattr(self, scanner_name)(uhandle, consumer)


     def identification(self, line):
         """Parse an ID line and store its fields on ``self.data``.

         The line is split on whitespace; columns 1 to 4 populate
         ``entry_name``, ``data_class``, ``molecule_type`` and
         ``sequence_length`` (converted with ``int``).  Columns 2 and 3 are
         passed through ``self._chomp`` first.

         Raises:
             SyntaxError: if the data class is not one of 'STANDARD',
                 'PRELIMINARY' or 'IPI', or the molecule type is not 'PRT'.
         """
         cols = string.split(line)
         self.data.entry_name = cols[1]
         self.data.data_class = self._chomp(cols[2])    # don't want ';'
         self.data.molecule_type = self._chomp(cols[3]) # don't want ';'
         self.data.sequence_length = int(cols[4])

         # data class can be 'STANDARD' or 'PRELIMINARY'
         # ws:2001-12-05: 'IPI' added so that IPI entries are accepted too
         if self.data.data_class not in ['STANDARD', 'PRELIMINARY', 'IPI']:
             raise SyntaxError("Unrecognized data class %s is in line\n%s" %
                               (self.data.data_class, line))
         # molecule_type should be 'PRT' for PRoTein
         if self.data.molecule_type != 'PRT':
             raise SyntaxError("Unrecognized molecule type %s in line\n%s" %
                               (self.data.molecule_type, line))

     def date(self, line):
         """Parse a DT line and store (date, release number) on ``self.data``.

         The upper-cased line is searched for 'CREATED', 'LAST SEQUENCE
         UPDATE' or 'LAST ANNOTATION UPDATE' (in that order) to choose between
         the ``created``, ``sequence_update`` and ``annotation_update``
         attributes.  The stored value is the tuple
         ``(cols[1], release_number)`` taken from the whitespace-split line.

         Raises:
             SyntaxError: if none of the three keywords occurs in the line.
         """
         uprline = string.upper(line)
         if string.find(uprline, 'CREATED') >= 0:
             target = 'created'
         elif string.find(uprline, 'LAST SEQUENCE UPDATE') >= 0:
             target = 'sequence_update'
         elif string.find(uprline, 'LAST ANNOTATION UPDATE') >= 0:
             target = 'annotation_update'
         else:
             raise SyntaxError("I don't understand the date line %s" % line)

         cols = string.split(line)
         # ws:2001-12-05: IPI writes '(IPIrel. , created)' with no release
         # number; an empty column after _chomp is stored as 0 instead of
         # crashing in int().
         release = self._chomp(cols[3])
         if release == '':
             number = 0
         else:
             number = int(release)
         setattr(self.data, target, (cols[1], number))