[BioPython] MARTEL parser & FASTA

Tue May 18 06:04:27 EDT 2004

Hi Cristian;

> I would like to parse FASTA files with NCBI description header. I see
> some reference about it in the Bio.expression.fasta module, but when I
> use it by the SeqRecord.io module it assign None and "<no name>" values
> to the 'id' and 'name' attribs. What's wrong?
> 
> 	SeqFile = open('example.fasta')
> 	r = SeqRecord.io.readFile(SeqFile, debug_level=0)
> 	for i in r:
> 		print i.id
> 		print i.name
> 		print i.description

Thanks for this report. The problem is basically that the io system
is still under development and not yet finished. We really do
appreciate people testing it and reporting problems.

I updated Bio/builders/SeqRecord/sequence.py so that it now tries to
handle setting the name and id attributes in a smarter fashion.
I've committed the changes into CVS and am also attaching a patch
with the changes made.

Let me know if this works better for you. Thanks again for the
report.
Brad
-------------- next part --------------
Index: sequence.py
===================================================================
RCS file: /home/repository/biopython/biopython/Bio/builders/SeqRecord/sequence.py,v
retrieving revision 1.6
retrieving revision 1.8
diff -c -r1.6 -r1.8
*** sequence.py	18 Mar 2004 00:53:24 -0000	1.6
--- sequence.py	18 May 2004 13:58:35 -0000	1.8
***************
*** 32,37 ****
--- 32,39 ----
          Dispatch.Dispatcher.__init__(self)
          self.acquire(StdHandler.Handle_dbid(self.add_dbid),
                       prefix = Std.NS)
+         self.acquire(StdHandler.Handle_dbxref(self.add_dbxref_dbids),
+                      prefix = Std.NS)
          self.acquire(StdHandler.Handle_description(self.add_description),
                       prefix = Std.NS)
          self.acquire(StdHandler.Handle_sequence(self.add_sequence),
***************
*** 44,60 ****
      def start_record(self, tag, attrs):
          self.dbname = None
          self.id_text = None
          self.description = None
          self.alphabet = None
          self.seq = None
          self.features = None
          self.dbxrefs = []
-         

      def add_dbid(self, text, attrs):
          if attrs.get("type") == "primary":
              self.dbname = attrs.get("dbname", "unknown")
              self.id_text = text

      def add_description(self, text):
          self.description = text
--- 46,83 ----
      def start_record(self, tag, attrs):
          self.dbname = None
          self.id_text = None
+         self.name_text = '<unknown name>'
          self.description = None
          self.alphabet = None
          self.seq = None
          self.features = None
          self.dbxrefs = []

      def add_dbid(self, text, attrs):
          if attrs.get("type") == "primary":
              self.dbname = attrs.get("dbname", "unknown")
              self.id_text = text
+         # use the first accession/secondary id as the name
+         # this should be equivalent to what Biopython does
+         elif attrs.get("type") in ["accession", "secondary"]:
+             self.name_text = text
+ 
+     def add_dbxref_dbids(self, dbname_style, dbname, idtype, dbid, negate):
+         """Handle setting name and id attributes from the dbxref ids.
+ 
+         Likely we'll either have a dbid or dbxref dbids to use. We default
+         to using the dbid if it exists.
+         """
+         # first deal with the primary id: SeqFeature.id
+         # set the id if we haven't yet set an id (take the first id we get)
+         # and if we have a primary id
+         if (self.id_text is None and idtype == "primary"):
+             self.id_text = dbid
+ 
+         # now deal with secondary ids: SeqFeature.name
+         if idtype == "secondary":
+             self.name_text = dbid
+ 

      def add_description(self, text):
          self.description = text
***************
*** 65,70 ****
--- 88,95 ----
          self.seq = Seq.Seq(seq, alphabet)

      def add_dbxref(self, dbname_style, dbname, idtype, dbid, negate):
+         """Store all id cross references.
+         """
          self.dbxrefs.append(DBXRef.from_parser(dbname_style, dbname, idtype,
                                                 dbid, negate))

***************
*** 80,85 ****
--- 105,111 ----
          self.document = SeqRecord.SeqRecord(
              seq = self.seq,
              id = self.id_text,
+             name = self.name_text,
              description = self.description,
              dbxrefs = self.dbxrefs,
              features = self.features,