[BioPython] parse IPI data with biopythons SwissProt parser
Wolfgang Schueler
wolfgang@proceryon.at
Tue, 18 Dec 2001 10:21:31 +0100
Hi all,
the IPI database at EBI contains proteins from the human genome
from SWISS-PROT, TrEMBL, RefSeq and Ensembl and is available in a
SWISS-PROT format.
Nevertheless, there are minor differences from real SWISS-PROT data which
prevent the use of the SWISS-PROT parser of Biopython 1.00.a3.
The following modifications of Sprot.py allowed the parsing of the
IPI data (IPI is described at http://www.ebi.ac.uk/IPI/IPIhelp.html).
Maybe it is helpful for someone.
Wolfgang
# ws: changes in _RecordConsumer.date() for IPI
#                _RecordConsumer.identification() for IPI
#                _Scanner._scan_reference() for a crashing SwissProt entry
#                _Scanner._scan_dt() for IPI
#                _Scanner._scan_de() for IPI
def _scan_dt(self, uhandle, consumer):
    """Scan the DT (date) lines of an entry and feed them to ``consumer.date``.

    uhandle  -- handle the entry is being read from (passed to _scan_line).
    consumer -- object whose ``date`` method receives each DT line.
    """
    # First DT line: still scanned with exactly_one=1, as before.
    self._scan_line('DT', uhandle, consumer.date, exactly_one=1)
    # ws:2001-12-05: IPI does not use 'last annotation update', so the two
    # remaining scans (each formerly exactly_one=1) are replaced by a single
    # one_or_more=1 scan.
    self._scan_line('DT', uhandle, consumer.date, one_or_more=1)
def _scan_de(self, uhandle, consumer):
    """Scan the DE (description) lines and feed them to ``consumer.description``.

    uhandle  -- handle the entry is being read from (passed to _scan_line).
    consumer -- object whose ``description`` method receives each DE line.
    """
    # ws:2001-12-05: IPI entry IPI00029727.2 has no DE line at all, so the
    # scan uses any_number=1 (was one_or_more).
    self._scan_line('DE', uhandle, consumer.description, any_number=1)
def _scan_reference(self, uhandle, consumer):
    """Scan consecutive reference blocks, each introduced by an RN line.

    Loops for as long as the next line (peeked, not consumed) starts with
    'RN', running the per-line-type scanners in a fixed order for each block.

    uhandle  -- handle the entry is being read from.
    consumer -- consumer object handed through to every sub-scanner.
    """
    while safe_peekline(uhandle)[:2] == 'RN':
        self._scan_rn(uhandle, consumer)
        self._scan_rp(uhandle, consumer)
        self._scan_rc(uhandle, consumer)
        self._scan_rx(uhandle, consumer)
        # ws:2001-12-05 added: an entry exists with RL before RA.
        # NOTE(review): this call and the trailing one are both
        # unconditional, so _scan_rl presumably has to accept zero RL
        # lines at either position -- confirm against _scan_rl.
        self._scan_rl(uhandle, consumer)
        self._scan_ra(uhandle, consumer)
        self._scan_rt(uhandle, consumer)
        self._scan_rl(uhandle, consumer)
def identification(self, line):
    """Parse an ID line and store its fields on ``self.data``.

    The line is split on whitespace. Column 1 is stored as ``entry_name``,
    columns 2 and 3 are passed through ``self._chomp`` (to drop the
    trailing ';') and stored as ``data_class`` and ``molecule_type``, and
    column 4 is converted to int and stored as ``sequence_length``.

    Raises SyntaxError if the data class is not one of 'STANDARD',
    'PRELIMINARY' or 'IPI', or if the molecule type is not 'PRT'.  The
    fields are assigned before validation, so they remain set on
    ``self.data`` even when an error is raised.
    """
    cols = line.split()
    self.data.entry_name = cols[1]
    self.data.data_class = self._chomp(cols[2])     # don't want ';'
    self.data.molecule_type = self._chomp(cols[3])  # don't want ';'
    self.data.sequence_length = int(cols[4])

    # data class can be 'STANDARD' or 'PRELIMINARY'
    # ws:2001-12-05: 'IPI' added so that IPI entries are accepted too.
    if self.data.data_class not in ('STANDARD', 'PRELIMINARY', 'IPI'):
        raise SyntaxError("Unrecognized data class %s is in line\n%s"
                          % (self.data.data_class, line))
    # molecule_type should be 'PRT' for PRoTein
    if self.data.molecule_type != 'PRT':
        raise SyntaxError("Unrecognized molecule type %s in line\n%s"
                          % (self.data.molecule_type, line))
def date(self, line):
    """Parse a DT line and store ``(date, release_number)`` on ``self.data``.

    The kind of date is detected case-insensitively from the line text and
    decides which attribute is set:

        'CREATED'                -> self.data.created
        'LAST SEQUENCE UPDATE'   -> self.data.sequence_update
        'LAST ANNOTATION UPDATE' -> self.data.annotation_update

    The stored value is a tuple of whitespace column 1 (the date string)
    and the integer in column 3 after ``self._chomp``.

    Raises SyntaxError if the line matches none of the three kinds.
    """
    uprline = line.upper()
    if 'CREATED' in uprline:
        attr = 'created'
    elif 'LAST SEQUENCE UPDATE' in uprline:
        attr = 'sequence_update'
    elif 'LAST ANNOTATION UPDATE' in uprline:
        attr = 'annotation_update'
    else:
        raise SyntaxError("I don't understand the date line %s" % line)

    cols = line.split()
    # ws:2001-12-05: IPI writes e.g. '(IPIrel. , Created)' with no release
    # number, so column 3 is just ',' and is empty after _chomp.  Store 0
    # in that case instead of crashing in int('').
    release = self._chomp(cols[3])
    if release == '':
        number = 0
    else:
        number = int(release)
    setattr(self.data, attr, (cols[1], number))