[BioPython] SwissProt.SProt parser improvement

Andreas Kuntzagk andreas.kuntzagk at mdc-berlin.de
Fri Jun 20 15:53:54 EDT 2003


Am Fre, 2003-06-20 um 16.26 schrieb Andreas Kuntzagk:
> Hi,
> 
> I have added code to the parser to get the FeatureId (FTId) where it
> exists. Also replaced all uses of the string module with the
> equivalent methods of the string type. Makes for more readable code.
> 
> Feel free to use or throw away

Forgot to attach the diff.
Andreas 
-------------- next part --------------
Index: Bio/SwissProt/SProt.py
===================================================================
RCS file: /home/repository/biopython/biopython/Bio/SwissProt/SProt.py,v
retrieving revision 1.24
diff -r1.24 SProt.py
36d35
< import string
166c165
<         data = string.join(lines, '')
---
>         data = ''.join(lines)
506c505
<         cols = string.split(line)
---
>         cols = line.split()
523c522
<         cols = string.split(self._chomp(string.rstrip(line[5:])), ';')
---
>         cols= self._chomp(line[5:].rstrip()).split(';')
525c524
<             self.data.accessions.append(string.lstrip(ac))
---
>             self.data.accessions.append(ac.lstrip())
529,530c528,529
<         if string.find(uprline, 'CREATED') >= 0:
<             cols = string.split(line)
---
>         cols = line.split()
>         if uprline.find(uprline) >= 0:
537,538c536
<         elif string.find(uprline, 'LAST SEQUENCE UPDATE') >= 0:
<             cols = string.split(line)
---
>         elif uprline.find('LAST SEQUENCE UPDATE') >= 0:
545,546c543
<         elif string.find(uprline, 'LAST ANNOTATION UPDATE') >= 0:
<             cols = string.split(line)
---
>         elif uprline.find( 'LAST ANNOTATION UPDATE') >= 0:
570,571c567,568
<         line = self._chomp(string.rstrip(line[5:]))
<         cols = string.split(line, ';')
---
>         line = self._chomp(line[5:].rstrip())
>         cols = line.split(';')
573c570
<             self.data.organism_classification.append(string.lstrip(col))
---
>             self.data.organism_classification.append(col.lstrip())
587,588c584,585
<         line = self._chomp(string.rstrip(line[5:]))
<         index = string.find(line, '=')
---
>         line = self._chomp(line[5:].rstrip())
>         index = line.find('=')
592c589
<             ids = string.split(line[index+1:], ',')
---
>             ids = line[index+1:].split(',')
594,595c591,592
<             ids = string.split(line, ',')
<         self.data.taxonomy_id.extend(map(string.strip, ids))
---
>             ids = line.split(',')
>         self.data.taxonomy_id.extend([id.strip for id in ids])
598c595
<         rn = string.rstrip(line[5:])
---
>         rn = line[5:].rstrip()
606c603
<         self.data.references[-1].positions.append(string.rstrip(line[5:]))
---
>         self.data.references[-1].positions.append(line[5:].rstrip())
610c607
<         cols = string.split(string.rstrip(line[5:]), ';')
---
>         cols = line[5:].rstrip().split( ';')
616c613
<             index = string.find(col, '=')
---
>             index = col.find('=')
634c631
<             ref.comments.append((string.lstrip(token), text))
---
>             ref.comments.append((token.lstrip(), text))
646c643
<         ind = string.find(line, '[NCBI, ExPASy, Israel, Japan]')
---
>         ind = line.find('[NCBI, ExPASy, Israel, Japan]')
656,657c653,654
<         if string.find(line, "=") != -1:
<             cols = string.split(line)
---
>         if line.find( "=") != -1:
>             cols = line.split()
661c658
<                 id_cols = string.split(info_col, "=")
---
>                 id_cols = info_col.split("=")
670c667
<             cols = string.split(line)
---
>             cols = line.split()
719c716
<         i = string.find(line, '[')
---
>         i = line.find('[')
722,724c719,720
<         cols = string.split(self._chomp(string.rstrip(line)), ';')
<         for i in range(len(cols)):
<             cols[i] = string.lstrip(cols[i])
---
>         cols = self._chomp(line.rstrip()).split(';')
>         cols = [col.lstrip() for col in cols]
728,731c724,726
<         cols = string.split(self._chomp(string.rstrip(line[5:])), ';')
<         for col in cols:
<             self.data.keywords.append(string.lstrip(col))
<     
---
>         cols = self._chomp(line[5:].rstrip()).split(';')
>         self.data.keywords.extend([c.lstrip for c in cols])
> 
734c729
<         name = string.rstrip(line[0:8])
---
>         name = line[0:8].rstrip()
738c733
<             from_res = string.lstrip(line[9:15])
---
>             from_res = line[9:15].lstrip()
742,744c737,743
<             to_res = string.lstrip(line[16:22])
<         description = string.rstrip(line[29:70])
< 
---
>             to_res = line[16:22].lstrip()
>         description = line[29:70].rstrip()
>         #if there is a feature_id (FTId), store it away
>         if line[29:35]==r"/FTId=":
>             ft_id = line[35:70].rstrip()[:-1]
>         else:
>             ft_id =""
747c746
<             name, from_res, to_res, old_description = self.data.features[-1]
---
>             name, from_res, to_res, old_description,old_ft_id = self.data.features[-1]
754c753
<         self.data.features.append((name, from_res, to_res, description))
---
>         self.data.features.append((name, from_res, to_res, description,ft_id))
764c763
<         descr_cols = string.split(description, " -> ")
---
>         descr_cols = description.split(" -> ")
771c770
<             extra_info_pos = string.find(second_seq, " (")
---
>             extra_info_pos = second_seq.find(" (")
777,778c776,777
<             first_seq = string.replace(first_seq, " ", "")
<             second_seq = string.replace(second_seq, " ", "")
---
>             first_seq = first_seq.replace(" ", "")
>             second_seq = second_seq.replace(" ", "")
786c785
<         cols = string.split(line)
---
>         cols = line.split()
792c791
<         seq = string.rstrip(string.replace(line, " ", ""))
---
>         seq = line.replace(" ", "").rstrip()
797a797,798
>     # from Python 2.2.2 could be replaced with word.rstrip(".,;")
>     # if there is always only one punctuation character
814c815
<             setattr(rec, m, string.rstrip(attr))
---
>             setattr(rec, m, attr.rstrip())
823c824
<             setattr(ref, m, string.rstrip(attr))
---
>             setattr(ref, m, attr.rstrip())
849c850
<         self.data.description = string.rstrip(self.data.description)
---
>         self.data.description = self.data.description.rstrip()
852c853
<         cols = string.split(line)
---
>         cols = line.split()
856c857
<         ids = string.split(string.rstrip(line[5:]), ';')
---
>         ids = line[5:].rstrip().split(';')
861c862
<                                 string.strip(line[5:]) + "\n"
---
>                                 line[5:].strip() + "\n"
864c865
<         seq = Seq.Seq(string.rstrip(string.replace(line, " ", "")),
---
>         seq = Seq.Seq(line.replace(" ", "").rstrip(),


More information about the BioPython mailing list