[Biojava-l] fileToBiojava question
Bernd Jagla
bernd.jagla at pasteur.fr
Tue Sep 21 12:47:21 UTC 2010
Sorry for the wrong reply...
Here is the FULL code; I marked the important passages in red:
Thanks for looking at it!!!!
Bernd
package org.pasteur.pf2.biojava;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Iterator;
import java.util.NoSuchElementException;
import org.biojava.bio.BioException;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.SequenceIterator;
import org.biojava.bio.seq.io.SeqIOTools;
import org.biojava.bio.seq.io.SymbolTokenization;
import org.biojava.bio.symbol.Alphabet;
import org.biojava.bio.symbol.AlphabetManager;
import org.biojava.bio.symbol.SymbolList;
import org.biojavax.RichObjectFactory;
import org.biojavax.bio.seq.io.RichSequenceFormat;
import org.knime.core.data.DataCell;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataColumnSpecCreator;
import org.knime.core.data.DataTableSpec;
import org.knime.core.data.RowKey;
import org.knime.core.data.container.BlobDataCell;
import org.knime.core.data.def.DefaultRow;
import org.knime.core.data.def.StringCell;
import org.knime.core.node.BufferedDataContainer;
import org.knime.core.node.BufferedDataTable;
import org.knime.core.node.CanceledExecutionException;
import org.knime.core.node.ExecutionContext;
import org.knime.core.node.ExecutionMonitor;
import org.knime.core.node.InvalidSettingsException;
import org.knime.core.node.NodeLogger;
import org.knime.core.node.NodeModel;
import org.knime.core.node.NodeSettingsRO;
import org.knime.core.node.NodeSettingsWO;
import org.knime.core.node.defaultnodesettings.SettingsModelString;
import org.biojavax.bio.seq.io.EMBLFormat;
import org.biojavax.bio.seq.io.FastaFormat;
import org.biojavax.bio.seq.io.GenbankFormat;
import org.biojavax.bio.seq.io.INSDseqFormat;
import org.biojavax.bio.seq.io.RichSequenceBuilderFactory;
import org.biojavax.bio.seq.io.RichSequenceFormat;
import org.biojavax.bio.seq.io.RichStreamReader;
import org.biojavax.bio.seq.io.UniProtFormat;
import org.pasteur.pf2.datatypes.*;
/**
 * This is the model implementation of FastAReader. Reads a sequence file
 * into a table with a single column named "sequence"; each row holds one
 * sequence read from the file.
 *
 * @author Bernd Jagla
 */
@SuppressWarnings("deprecation")
public class FastAReaderNodeModel extends NodeModel {
// The logger instance. It must be created for THIS class so that log output
// is attributed to the FastA reader rather than to the FastQ reader.
private static final NodeLogger logger = NodeLogger
        .getLogger(FastAReaderNodeModel.class);
// NOTE(review): these two fields are never read in this class; execute()
// works on a local iterator. Kept so the class layout stays unchanged.
private Alphabet alpha;
private SequenceIterator iter;
/**
 * The settings keys used to store and retrieve the settings (from the
 * dialog or from a settings file). The keys themselves are private; the
 * public static <code>create*</code> factory methods of this class build
 * settings models bound to them (presumably so the dialog can share the
 * same keys -- confirm against the dialog class).
 */
private static final String FAR_name = "far_name";
private static final String FAR_fileFormat = "far_ff";
private static final String FAR_alphabet = "far_alph";
// One settings model per key: file path, file format and alphabet.
private final SettingsModelString m_fpname = createFAR_fpname();
private final SettingsModelString m_fformat = createFileFormat();
private final SettingsModelString m_alphabet = createAlphabet();
/**
 * Creates the node model with no input ports and a single output port.
 */
protected FastAReaderNodeModel() {
    // arguments: number of input ports, number of output ports
    super(0, 1);
}
/**
* {@inheritDoc}
*/
@Override
protected BufferedDataTable[] execute(final BufferedDataTable[] inData,
final ExecutionContext exec) throws Exception {
// TODO do something here
logger.info("Node Model Stub... this is not yet implemented !");
// the data table spec of the single output table,
// the table will have three columns:
DataColumnSpec[] allColSpecs = new DataColumnSpec[1];
allColSpecs[0] = new DataColumnSpecCreator("sequence",
SequenceDataCell.TYPE)
.createSpec();
DataTableSpec outputSpec = new DataTableSpec(allColSpecs);
// the execution context will provide us with storage capacity,
in this
// case a data container to which we will add rows sequentially
// Note, this container can also handle arbitrary big data
tables, it
// will buffer to disc if necessary.
BufferedDataContainer container =
exec.createDataContainer(outputSpec);
// let's add m_count rows to it
// once we are done, we close the container and return its table
FileReader fp = new FileReader(m_fpname.getStringValue());
exec.checkCanceled();
//String form = m_fformat.getStringValue();
//String alphabet = m_alphabet.getStringValue();
String form = "genbank";
String alphabet = "DNA";
BufferedReader br = new BufferedReader(fp);
// String line = br.readLine();
int count = 0;
SequenceIterator iter = (SequenceIterator)
SeqIOTools.fileToBiojava(
form, alphabet, br);
while (iter.hasNext()) {
exec.checkCanceled();
RowKey key = new RowKey("Row " + count);
exec.setProgress("Row " + count);
// System.out.println(fastq.getSequence());
Sequence seq = iter.nextSequence();
String seqName = seq.getName();
// String seqName = "asdf";
//String sequence = seq.seqString();
System.err.println("reading: " + seqName + " " + seq.length());
SequenceDataCell seqCell = new SequenceDataCell(seqName, seq);
container.addRowToTable(new DefaultRow(key, seqCell));
count++;
}
System.err.println("finished reading file");
br.close();
fp.close();
container.close();
return new BufferedDataTable[] { container.getTable() };
}
/**
 * Adapts a <code>SequenceIterator</code> to the standard
 * <code>Iterator</code> interface over <code>Sequence</code> objects.
 *
 * @param iter
 *            the <code>SequenceIterator</code> to wrap
 * @return an <code>Iterator</code> that delegates to <code>iter</code>.
 *         A <code>BioException</code> raised while fetching the next
 *         sequence is rethrown as a <code>NoSuchElementException</code>
 *         carrying the original exception as its cause.
 *         <b><code>remove()</code> is not supported and always throws
 *         <code>UnsupportedOperationException</code>.</b>
 */
public Iterator<Sequence> asIterator(SequenceIterator iter) {
    final SequenceIterator source = iter;
    return new Iterator<Sequence>() {
        public boolean hasNext() {
            return source.hasNext();
        }

        public Sequence next() {
            try {
                return source.nextSequence();
            } catch (BioException cause) {
                // NoSuchElementException offers no cause-taking
                // constructor, so the cause is attached via initCause.
                throw (NoSuchElementException) new NoSuchElementException()
                        .initCause(cause);
            }
        }

        public void remove() {
            throw new UnsupportedOperationException();
        }
    };
}
/**
 * Returns a new <code>RichSequenceFormat</code> for the given name.
 * The names "fasta", "genbank", "uniprot", "embl" and "INSDseq" are
 * matched case-insensitively; any other value is treated as the name of
 * a class to load and instantiate through its no-argument constructor.
 *
 * @param name
 *            a known format name or a format class name
 * @return a new format instance
 * @throws ClassNotFoundException
 *             if <code>name</code> is not a known format and no class
 *             with that name can be loaded
 * @throws InstantiationException
 *             if the loaded class cannot be instantiated
 * @throws IllegalAccessException
 *             if the loaded class or its constructor is not accessible
 */
public static RichSequenceFormat formatForName(String name)
        throws ClassNotFoundException, InstantiationException,
        IllegalAccessException {
    if (name.equalsIgnoreCase("fasta")) {
        return new FastaFormat();
    }
    if (name.equalsIgnoreCase("genbank")) {
        return new GenbankFormat();
    }
    if (name.equalsIgnoreCase("uniprot")) {
        return new UniProtFormat();
    }
    if (name.equalsIgnoreCase("embl")) {
        return new EMBLFormat();
    }
    if (name.equalsIgnoreCase("INSDseq")) {
        return new INSDseqFormat();
    }
    // Not one of the built-in names: interpret it as a class name.
    Class<?> customFormat = Class.forName(name);
    return (RichSequenceFormat) customFormat.newInstance();
}
/**
 * {@inheritDoc}
 * <p>
 * This implementation does nothing.
 */
@Override
protected void reset() {
    // intentionally empty
}
/**
 * {@inheritDoc}
 * <p>
 * The output spec is independent of the inputs: a single table with one
 * column named "sequence" of type <code>SequenceDataCell.TYPE</code>.
 */
@Override
protected DataTableSpec[] configure(final DataTableSpec[] inSpecs)
        throws InvalidSettingsException {
    DataColumnSpec sequenceColumn = new DataColumnSpecCreator("sequence",
            SequenceDataCell.TYPE).createSpec();
    DataTableSpec tableSpec = new DataTableSpec(
            new DataColumnSpec[] { sequenceColumn });
    return new DataTableSpec[] { tableSpec };
}
/**
 * {@inheritDoc}
 * <p>
 * Writes the alphabet, file-format and file-path settings models, in
 * that order.
 */
@Override
protected void saveSettingsTo(final NodeSettingsWO settings) {
    SettingsModelString[] models = { m_alphabet, m_fformat, m_fpname };
    for (SettingsModelString model : models) {
        model.saveSettingsTo(settings);
    }
}
/**
 * {@inheritDoc}
 * <p>
 * Loads the alphabet, file-format and file-path settings models, in
 * that order.
 */
@Override
protected void loadValidatedSettingsFrom(final NodeSettingsRO settings)
        throws InvalidSettingsException {
    SettingsModelString[] models = { m_alphabet, m_fformat, m_fpname };
    for (SettingsModelString model : models) {
        model.loadSettingsFrom(settings);
    }
}
/**
 * {@inheritDoc}
 * <p>
 * Validates the alphabet, file-format and file-path settings models, in
 * that order.
 */
@Override
protected void validateSettings(final NodeSettingsRO settings)
        throws InvalidSettingsException {
    SettingsModelString[] models = { m_alphabet, m_fformat, m_fpname };
    for (SettingsModelString model : models) {
        model.validateSettings(settings);
    }
}
/**
 * {@inheritDoc}
 * <p>
 * This implementation does nothing.
 */
@Override
protected void loadInternals(final File internDir,
        final ExecutionMonitor exec) throws IOException,
        CanceledExecutionException {
    // intentionally empty
}
/**
 * {@inheritDoc}
 * <p>
 * This implementation does nothing.
 */
@Override
protected void saveInternals(final File internDir,
        final ExecutionMonitor exec) throws IOException,
        CanceledExecutionException {
    // intentionally empty
}
/**
 * Creates the settings model for the input file path.
 *
 * @return a new string settings model stored under the file-name key,
 *         with the empty string as its default value
 */
public static SettingsModelString createFAR_fpname() {
    final String defaultPath = "";
    return new SettingsModelString(FAR_name, defaultPath);
}
/**
 * Creates the settings model for the file format.
 *
 * @return a new string settings model stored under the file-format key,
 *         with "FASTA" as its default value
 */
public static SettingsModelString createFileFormat() {
    final String defaultFormat = "FASTA";
    return new SettingsModelString(FAR_fileFormat, defaultFormat);
}
/**
 * Creates the settings model for the sequence alphabet.
 *
 * @return a new string settings model stored under the alphabet key,
 *         with "RNA" as its default value
 */
public static SettingsModelString createAlphabet() {
    final String defaultAlphabet = "RNA";
    return new SettingsModelString(FAR_alphabet, defaultAlphabet);
}
}
On 9/21/2010 2:40 PM, simon rayner wrote:
> hi,
>
> can you repost to the biojava group along with the full code (just in
> case there is a missing import or something)? you only replied to me,
> and not to the biojava mailing list
>
> thanks
>
> simon
>
> On Tue, Sep 21, 2010 at 8:18 PM, Bernd Jagla <bernd.jagla at pasteur.fr
> <mailto:bernd.jagla at pasteur.fr>> wrote:
>
> Thanks for the quick reply!
>
> Here is some code that should have all the important parts:
>
> String form = "genbank";
> String alphabet = "dna";
> BufferedReader br = new BufferedReader(fp);
> SequenceIterator iter = (SequenceIterator) SeqIOTools.fileToBiojava(
> form, alphabet, br);
> while (iter.hasNext()) {
> Sequence seq = iter.nextSequence();
> => Exception thrown
> String seqName = seq.getName();
> }
>
>
> When trying to simplify the code a bit I now get the following error:
> Execute failed: Could not initialize class
> org.biojava.bio.seq.FeatureFilter
>
> I assume that in the previous times I had a spelling error??
> Then the exception got thrown during the initialization of "iter"
>
> Thanks,
>
> Bernd
>
>
> On 9/21/2010 2:07 PM, simon rayner wrote:
>> hi,
>>
>> can you post the code you are trying to run along with the full
>> error? it will help to figure out what is happening. There are
>> now loaders for biojavax as well, which work well and are
>> described in the biojavax docs here
>> http://biojava.org/wiki/BioJava:BioJavaXDocs#Example
>>
>> but yeah, it's confusing unless you happen to be a real java
>> guru. I keep having to refer back to the docs because I keep
>> forgetting which class does what
>>
>> On Tue, Sep 21, 2010 at 7:46 PM, Bernd Jagla
>> <bernd.jagla at pasteur.fr <mailto:bernd.jagla at pasteur.fr>> wrote:
>>
>> Hello,
>>
>> I am getting a little frustrated with the wiki page (I guess
>> I don't spend enough time reading and testing). I have the
>> impression that some of the documentation relates to version
>> 3 whereas others relate to 1.5 or 1.7.
>> So sorry if this all sounds a bit confused... ;(
>>
>> I believe I am using 1.7.1. (I wasn't able to find a readme
>> file that contains that information) even though I would
>> probably like to use version 3. But as I am stuck with an
>> older Eclipse version I think it will be even worse when I
>> try that.
>>
>> Anyways, I am trying to read in sequence files using
>> SeqIOTools.fileToBiojava, which seems to be deprecated, with
>> the following parameters: "genbank", "dna", bufferedReader.
>>
>> somehow this works with "fasta" but with genbank I get the
>> following exception:
>> Execute failed: Unknown file type '524300'
>> in some cases I get:
>> Unknown file type '262156'
>>
>> Does this mean anything to you?
>>
>> Or how do you read in a sequence file? I am looking for a
>> generic way that covers many file types (genbank, fasta,
>> swissprot...)
>>
>> Once I have this I will probably be able to get to the
>> feature information using the information from the tutorial.
>>
>> Thanks for your time.
>>
>> Bernd
>>
>>
>>
>> _______________________________________________
>> Biojava-l mailing list - Biojava-l at lists.open-bio.org
>> <mailto:Biojava-l at lists.open-bio.org>
>> http://lists.open-bio.org/mailman/listinfo/biojava-l
>>
>>
>>
>>
>> --
>> Simon Rayner
>>
>> State Key Laboratory of Virology
>> Wuhan Institute of Virology
>> Chinese Academy of Sciences
>> Wuhan, Hubei 430071
>> P.R.China
>>
>> +86 (27) 87199895 (office)
>> +86 18627113001 (cell)
>>
>
>
>
> --
> Simon Rayner
>
> State Key Laboratory of Virology
> Wuhan Institute of Virology
> Chinese Academy of Sciences
> Wuhan, Hubei 430071
> P.R.China
>
> +86 (27) 87199895 (office)
> +86 18627113001 (cell)
>
More information about the Biojava-l
mailing list