package ca.gc.phac.aspc.nml;

import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Properties;

import org.biojava.bio.chromatogram.Chromatogram;
import org.biojava.bio.program.scf.SCF;
import org.biojava.bio.seq.DNATools;
import org.biojava.bio.symbol.Alignment;
import org.biojava.bio.symbol.AtomicSymbol;
import org.biojava.bio.symbol.IllegalSymbolException;
import org.biojava.bio.symbol.Symbol;
import org.biojava.bio.symbol.SymbolList;
import org.biojava.bio.symbol.IntegerAlphabet.IntegerSymbol;

/**
 * An SCF format file writer, written as described in the Staden document that
 * can be found at <a
 * href="http://staden.sourceforge.net/manual/formats_unix_2.html"
 * >http://staden.sourceforge.net/manual/formats_unix_2.html</a>. The file
 * format looks like this:
 * 
 * <pre>
 * Version 3
 * 
 * Length in bytes                        Data
 * ---------------------------------------------------------------------------
 * 128                                    header
 * Number of samples * sample size        Samples for A trace
 * Number of samples * sample size        Samples for C trace
 * Number of samples * sample size        Samples for G trace
 * Number of samples * sample size        Samples for T trace
 * Number of bases * 4                    Offset into peak index for each base
 * Number of bases                        Accuracy estimate bases being 'A'
 * Number of bases                        Accuracy estimate bases being 'C'
 * Number of bases                        Accuracy estimate bases being 'G'
 * Number of bases                        Accuracy estimate bases being 'T'
 * Number of bases                        The called bases
 * Number of bases * 3                    Reserved for future use (biojava does substitution probability, overcall probability and undercall probability)
 * Comments size                          Comments
 * Private data size                      Private data
 * ---------------------------------------------------------------------------
 * </pre>
 * 
 * All multi-byte integers are written most significant byte first. Every
 * header value (sizes and offsets) is computed once, when the writer is
 * constructed; the comment block is rendered at the same time so that the
 * size recorded in the header always matches the bytes that are written.
 * 
 * @author Franklin Bristow (franklin_bristow@phac-aspc.gc.ca)
 * 
 */
public class SCFWriter {
	/** magic! this is sometimes stored as its integer value 779314022 */
	public static final Integer SCF_MAGIC = ('.' << 24) + ('s' << 16)
			+ ('c' << 8) + ('f' << 0);
	/** the version of scf format we're currently using */
	public static final byte[] VERSION = "3.00".getBytes();
	/** 72 bytes of (zero) padding; a freshly allocated array is already zeroed */
	public static final byte[] SPARE = new byte[72];

	/** the byte offset from the start of the file for the sample matrix */
	private static final int SAMPLES_OFFSET = 128;
	/** deprecated, here for illustrative purposes */
	private static final int BASES_LEFT_CLIP = 0;
	/** deprecated, here for illustrative purposes */
	private static final int BASES_RIGHT_CLIP = 0;
	/** the code set used (The SCF doc says that this is ignored...) */
	private static final int CODE_SET = 0;
	/** the sample size used when the caller does not supply one */
	private static final int DEFAULT_SAMPLE_SIZE = 2;
	/** the number of traces in the sample matrix (A, C, G and T) */
	private static final int TRACE_COUNT = 4;
	/**
	 * the number of bytes written per base before the comments: 4 (peak
	 * offset) + 4 (A/C/G/T accuracy) + 1 (called base) + 3 (reserved)
	 */
	private static final int BYTES_PER_BASE = 12;

	/** the number of elements in the samples matrix */
	private final int samples;
	/** the number of bases in the bases matrix */
	private final int bases;
	/** the byte offset from the start of the file for the bases matrix */
	private final int basesOffset;
	/** the number of bytes in the comment section */
	private final int commentsSize;
	/** the byte offset from the start of the file for the comment section */
	private final int commentsOffset;
	/** the size of the samples in bytes; 1 = 8 bits, 2 = 16 bits */
	private final int sampleSize;
	/** the number of bytes of private data, 0 if none */
	private final int privateSize;
	/** the byte offset from the start of the file for the private section */
	private final int privateOffset;
	/** the chromatogram we're writing */
	private final Chromatogram chromatogram;
	/** the comment section exactly as it will be written (never null) */
	private final byte[] commentBytes;
	/** the private data to be written to the private data section, or null */
	private final byte[] privateData;

	/**
	 * Set up an SCFWriter to write the supplied chromatogram out to SCF format
	 * using the default sample size of 2 bytes and no private data.
	 * 
	 * @param c
	 *            the chromatogram to write
	 * @param comments
	 *            the comments to put into the comments section; may be null,
	 *            in which case the comments section is empty
	 */
	public SCFWriter(Chromatogram c, Properties comments) {
		this(c, comments, null, null);
	}

	/**
	 * Set up an SCFWriter to write the supplied chromatogram out to SCF format
	 * and populate the comments section with the supplied comments.
	 * 
	 * @param c
	 *            the chromatogram to write
	 * @param comments
	 *            the comments to put into the comments section; may be null
	 * @param sampleSize
	 *            the number of bytes each sample should be when we're writing
	 *            it to the file (1 or 2); null or 0 selects the default of 2
	 * @throws IllegalArgumentException
	 *             if sampleSize is anything other than null, 0, 1 or 2
	 */
	public SCFWriter(Chromatogram c, Properties comments, Integer sampleSize) {
		this(c, comments, sampleSize, null);
	}

	/**
	 * Set up an SCFWriter to write the supplied chromatogram out to the SCF
	 * format. Populate the comments section using comments, use the specified
	 * sample size for writing each individual sample record (in bytes, 1 or 2)
	 * and use the supplied byte array as the private data section.
	 * 
	 * @param c
	 *            the chromatogram to write
	 * @param comments
	 *            the comments to put into the comments section; may be null
	 * @param sampleSize
	 *            the number of bytes each sample should be when we're writing
	 *            it to the file (1 or 2); null or 0 selects the default of 2
	 * @param privateData
	 *            the private data to store in the private section; may be
	 *            null if there is none
	 * @throws NullPointerException
	 *             if the chromatogram is null
	 * @throws IllegalArgumentException
	 *             if sampleSize is anything other than null, 0, 1 or 2
	 */
	public SCFWriter(Chromatogram c, Properties comments, Integer sampleSize,
			byte[] privateData) {
		if (c == null) {
			throw new NullPointerException("chromatogram must not be null");
		}

		this.chromatogram = c;
		this.sampleSize = resolveSampleSize(sampleSize);
		this.privateData = privateData;
		// Render the comments once so that commentsSize and the bytes that
		// are eventually written can never disagree.
		this.commentBytes = getCommentsAsString(comments).getBytes();

		this.samples = c.getTraceLength();
		this.bases = c.getSequenceLength();
		this.basesOffset = SAMPLES_OFFSET + this.samples * TRACE_COUNT
				* this.sampleSize;
		this.commentsSize = this.commentBytes.length;
		this.commentsOffset = this.basesOffset + this.bases * BYTES_PER_BASE;
		this.privateSize = (privateData == null) ? 0 : privateData.length;
		this.privateOffset = this.commentsOffset + this.commentsSize;
	}

	/**
	 * Writes the full SCF file to the stream. The stream is flushed when the
	 * last section has been written but it is <em>not</em> closed; closing it
	 * remains the responsibility of the caller.
	 * 
	 * @param out
	 *            the stream to write the file to
	 * @throws IOException
	 *             if an I/O error occurs
	 */
	public void writeSCFToStream(OutputStream out) throws IOException {
		writeHeaderToStream(out);
		writeSamplesToStream(out);
		writeBaseOffsetsToStream(out);
		writeBaseProbabilitiesToStream(out);
		writeCalledBasesToStream(out);
		writeReservedBlockToStream(out);
		writeCommentsToStream(out);
		writePrivateDataToStream(out);
		out.flush();
	}

	/**
	 * Turns the sample size requested by the caller into the value that is
	 * actually used.
	 * 
	 * @param requested
	 *            the requested size in bytes, possibly null
	 * @return 2 when nothing (null or 0) was requested, otherwise the
	 *         requested size
	 * @throws IllegalArgumentException
	 *             if the requested size is neither 1 nor 2, because no sample
	 *             encoding exists for any other size
	 */
	private static int resolveSampleSize(Integer requested) {
		if (requested == null || requested == 0) {
			return DEFAULT_SAMPLE_SIZE;
		}
		if (requested != 1 && requested != 2) {
			throw new IllegalArgumentException(
					"sampleSize must be 1 or 2 bytes but was " + requested);
		}
		return requested;
	}

	/**
	 * The private data section described by the staden spec essentially says
	 * this: you put in stuff, we do not care what it is. Thus, we accept data
	 * to be written, but we will only be dumping the bytes of that data into
	 * the stream without trying to figure out what it is. The bytes are
	 * written raw, so exactly the number of bytes announced in the header is
	 * emitted.
	 * 
	 * @param out
	 *            the output stream to write to
	 * @throws IOException
	 *             if an I/O error occurs
	 */
	private void writePrivateDataToStream(OutputStream out) throws IOException {
		if (privateData != null) {
			out.write(privateData);
		}
	}

	/**
	 * Writes the set of comments to the stream. The staden document describes
	 * the comment section as a newline delimited list of key value pairs in
	 * the format &lt;key (Field-ID)&gt;=&lt;value&gt;. Nothing is written when
	 * there are no comments.
	 * 
	 * @param out
	 *            the output stream to write to
	 * @throws IOException
	 *             if an I/O error occurs
	 */
	private void writeCommentsToStream(OutputStream out) throws IOException {
		out.write(commentBytes);
	}

	/**
	 * Writes the reserved block to the stream. The staden document describes
	 * this to be the number of bases * 3 bytes of "Reserved for future use".
	 * Version 3.10 of the SCF spec (found at <a
	 * href="http://iubio.bio.indiana.edu/soft/molbio/molbio.old/staden/www_pages/scf-rfc.html"
	 * >http://iubio.bio.indiana.edu/soft/molbio/molbio.old/staden/www_pages/scf
	 * -rfc.html</a>) states that 'we may also store the substitution, insertion
	 * and deletion probability values. These are stored using the same scale as
	 * the prob_A, prob_C, prob_G and prob_T values. It is expected that the
	 * four prob_A, prob_C, prob_G and prob_T values will encode the absolute
	 * probability of that base call being correct, taking into account the
	 * chance of it being an overcalled base. For alignment algorithms it may be
	 * useful to obtain individual confidence values for the chance of
	 * insertion, deletion and substitution. These are stored in prob_ins,
	 * prob_del and prob_sub. In version 3.00 these fields existed in the SCF
	 * files, but were labelled as " uint_1 spare[3]'. Thus, we store the
	 * substitution, overcall and undercall probabilities of the chromatogram
	 * (in that order, one full column after the other) in the reserved
	 * section.
	 * 
	 * @param out
	 *            the stream to write to
	 * @throws IOException
	 *             if an I/O error occurs
	 */
	private void writeReservedBlockToStream(OutputStream out)
			throws IOException {
		writeByteColumnsToStream(out, new Object[] { SCF.PROB_SUBSTITUTION,
				SCF.PROB_OVERCALL, SCF.PROB_UNDERCALL });
	}

	/**
	 * Writes the called bases to the stream. Each of these values is treated as
	 * a one byte integer value. Nothing is written when the chromatogram has
	 * no base calls.
	 * 
	 * @param out
	 *            the stream to write to
	 * @throws IOException
	 *             if an I/O error occurs
	 */
	private void writeCalledBasesToStream(OutputStream out) throws IOException {
		Alignment baseCalls = chromatogram.getBaseCalls();
		if (baseCalls == null) {
			return;
		}

		SymbolList calledBases = baseCalls
				.symbolListForLabel(Chromatogram.DNA);
		@SuppressWarnings("unchecked")
		Iterator<Symbol> calledBaseIterator = calledBases.iterator();

		while (calledBaseIterator.hasNext()) {
			Symbol s = calledBaseIterator.next();
			// NOTE(review): presumably SYMBOL_MAPPING covers every symbol
			// that can occur in the DNA row - verify against SCFUtils.
			out.write(SCFUtils.SYMBOL_MAPPING.get(s));
		}
	}

	/**
	 * Writes out the accuracy estimates for the bases in the order of 'A', 'C',
	 * 'G', 'T'. Each of these values is treated as a one byte integer value.
	 * 
	 * @param out
	 *            the stream to write to
	 * @throws IOException
	 *             if an I/O error occurs
	 */
	private void writeBaseProbabilitiesToStream(OutputStream out)
			throws IOException {
		writeByteColumnsToStream(out, new Object[] { SCF.PROB_NUC_A,
				SCF.PROB_NUC_C, SCF.PROB_NUC_G, SCF.PROB_NUC_T });
	}

	/**
	 * Writes, for each of the supplied alignment labels in turn, every value
	 * of the row with that label as a single byte (the low 8 bits of the
	 * integer value). Nothing is written when the chromatogram has no base
	 * calls.
	 * 
	 * @param out
	 *            the stream to write to
	 * @param labels
	 *            the labels of the base call alignment rows to write, in the
	 *            order in which they must appear in the file
	 * @throws IOException
	 *             if an I/O error occurs
	 */
	private void writeByteColumnsToStream(OutputStream out, Object[] labels)
			throws IOException {
		Alignment baseCalls = chromatogram.getBaseCalls();
		if (baseCalls == null) {
			return;
		}

		for (Object label : labels) {
			SymbolList column = baseCalls.symbolListForLabel(label);
			@SuppressWarnings("unchecked")
			Iterator<IntegerSymbol> columnIterator = column.iterator();

			while (columnIterator.hasNext()) {
				IntegerSymbol s = columnIterator.next();
				out.write((byte) s.intValue());
			}
		}
	}

	/**
	 * Writes out the offset into peak index for each base to the supplied
	 * {@link OutputStream}. Each of these entries is to be treated as a 4 byte
	 * integer value and is written to the stream as such. Nothing is written
	 * when the chromatogram has no base calls.
	 * 
	 * @param out
	 *            the stream to write to
	 * @throws IOException
	 *             if an I/O error occurs
	 */
	private void writeBaseOffsetsToStream(OutputStream out) throws IOException {
		Alignment baseCalls = chromatogram.getBaseCalls();
		if (baseCalls == null) {
			return;
		}

		SymbolList offsetList = baseCalls
				.symbolListForLabel(Chromatogram.OFFSETS);
		@SuppressWarnings("unchecked")
		Iterator<IntegerSymbol> offsetListIterator = offsetList.iterator();

		while (offsetListIterator.hasNext()) {
			IntegerSymbol s = offsetListIterator.next();
			out.write(getBytesFromInt(s.intValue()));
		}
	}

	/**
	 * Samples are written to SCF files as differences of successive values.
	 * This means calculating a "delta delta" value for each term. This is
	 * described in the Staden SCF file format spec: <a
	 * href="http://staden.sourceforge.net/manual/formats_unix_4.html"
	 * >http://staden.sourceforge.net/manual/formats_unix_4.html</a>
	 * <p>
	 * The traces are written in the order A, C, G, T. The encoding works on a
	 * copy of each trace, so the chromatogram itself is left untouched and
	 * can be written again.
	 * 
	 * @param out
	 *            the stream to write the file to
	 * @throws IOException
	 *             if an I/O error occurs, or if the chromatogram refuses to
	 *             hand out the trace for one of the four nucleotides
	 */
	private void writeSamplesToStream(OutputStream out) throws IOException {
		AtomicSymbol[] nucleotides = { DNATools.a(), DNATools.c(),
				DNATools.g(), DNATools.t() };

		for (AtomicSymbol nucleotide : nucleotides) {
			int[] trace;
			try {
				trace = chromatogram.getTrace(nucleotide);
			} catch (IllegalSymbolException e) {
				// Carrying on would leave a file whose header promises
				// sample data that was never written, so fail loudly.
				IOException failure = new IOException(
						"Chromatogram has no trace for nucleotide "
								+ nucleotide);
				failure.initCause(e);
				throw failure;
			}
			out.write(encodeTrace(trace, nucleotide));
		}
	}

	/**
	 * Converts one trace into the bytes that represent it in the file: the
	 * first {@code samples} values are "delta delta" encoded and then stored
	 * using {@code sampleSize} bytes each, most significant byte first.
	 * 
	 * @param trace
	 *            the raw trace as handed out by the chromatogram; it is not
	 *            modified
	 * @param nucleotide
	 *            the nucleotide the trace belongs to, used for error messages
	 * @return the encoded trace, {@code samples * sampleSize} bytes long
	 * @throws IllegalStateException
	 *             if the trace is missing or shorter than the trace length
	 *             reported by the chromatogram
	 */
	private byte[] encodeTrace(int[] trace, AtomicSymbol nucleotide) {
		if (trace == null || trace.length < samples) {
			throw new IllegalStateException("Trace for nucleotide "
					+ nucleotide + " holds fewer than the " + samples
					+ " samples reported by the chromatogram");
		}

		// Work on a private copy: the array handed out by the chromatogram
		// may be its internal storage and must not be overwritten.
		int[] deltas = new int[samples];
		System.arraycopy(trace, 0, deltas, 0, samples);
		replaceWithDifferences(deltas); // the first delta
		replaceWithDifferences(deltas); // the "delta delta"

		byte[] encoded = new byte[samples * sampleSize];
		int position = 0;
		for (int value : deltas) {
			if (sampleSize == 2) {
				encoded[position++] = (byte) (value >>> 8);
			}
			encoded[position++] = (byte) value;
		}
		return encoded;
	}

	/**
	 * Replaces every element of the array with its difference to the element
	 * that originally preceded it; the first element is compared against 0.
	 * 
	 * @param values
	 *            the values to convert in place
	 */
	private static void replaceWithDifferences(int[] values) {
		int previous = 0;
		for (int i = 0; i < values.length; i++) {
			int current = values[i];
			values[i] = current - previous;
			previous = current;
		}
	}

	/**
	 * Writes all of the header variables out to the {@link OutputStream}. The
	 * description of this structure can be found at <a
	 * href="http://staden.sourceforge.net/manual/formats_unix_3.html"
	 * >http://staden.sourceforge.net/manual/formats_unix_3.html</a>. In total,
	 * the header comprises 128 bytes where all fields are 4 bytes in size
	 * with the exception of the SPARE field, which is 18 * 4 bytes in size to
	 * fill in the remaining bytes of the 128.
	 * 
	 * @param out
	 *            the output stream to write to
	 * @throws IOException
	 *             if an I/O error occurs
	 */
	private void writeHeaderToStream(OutputStream out) throws IOException {
		out.write(getBytesFromInt(SCF_MAGIC));
		out.write(getBytesFromInt(samples));
		out.write(getBytesFromInt(SAMPLES_OFFSET));
		out.write(getBytesFromInt(bases));
		out.write(getBytesFromInt(BASES_LEFT_CLIP));
		out.write(getBytesFromInt(BASES_RIGHT_CLIP));
		out.write(getBytesFromInt(basesOffset));
		out.write(getBytesFromInt(commentsSize));
		out.write(getBytesFromInt(commentsOffset));
		out.write(VERSION);
		out.write(getBytesFromInt(sampleSize));
		out.write(getBytesFromInt(CODE_SET));
		out.write(getBytesFromInt(privateSize));
		out.write(getBytesFromInt(privateOffset));
		out.write(SPARE);
	}

	/**
	 * Renders the comments as the text of the comment section: one
	 * {@code key=value} pair per line, each line terminated by a newline.
	 * 
	 * @param comments
	 *            the comments to render, possibly null
	 * @return the rendered comments; the empty string if there are none
	 */
	private static String getCommentsAsString(Properties comments) {
		StringBuilder buffer = new StringBuilder();

		if (comments != null) {
			for (Object key : comments.keySet()) {
				buffer.append(key.toString()).append('=').append(
						comments.getProperty(key.toString())).append('\n');
			}
		}

		return buffer.toString();
	}

	/**
	 * Splits an integer into its four bytes, most significant byte first.
	 * 
	 * @param value
	 *            the value to split
	 * @return a new array of four bytes
	 */
	private static byte[] getBytesFromInt(int value) {
		return new byte[] { (byte) (value >>> 24), (byte) (value >>> 16),
				(byte) (value >>> 8), (byte) value };
	}

}
