[Biojava-l]
Ren, Zhen
zren at amylin.com
Mon May 5 13:31:06 EDT 2003
Hi,
This message refers to the WeightMatrixDemo example at "BioJava In Anger" page again (http://bioconf.otago.ac.nz/biojava/weightMatrix.htm). I am trying to understand the null model behind this. Experts out there, please help!
I slightly modified the code as below (DNA version):
import java.util.*;
import org.biojava.bio.dist.*;
import org.biojava.bio.dp.*;
import org.biojava.bio.seq.*;
import org.biojava.bio.symbol.*;
public class WeightMatrixDemo {
public static void main(String[] args) throws Exception{
//make an Alignment of a motif.
Map map = new HashMap();
map.put("seq0", DNATools.createDNA("aggag"));
map.put("seq1", DNATools.createDNA("aggaa"));
map.put("seq2", DNATools.createDNA("aggag"));
map.put("seq3", DNATools.createDNA("aagag"));
Alignment align = new SimpleAlignment(map);
//make a Distribution[] of the motif
Distribution[] dists =
DistributionTools.distOverAlignment(align, false, 0.01);
// Zhen added this block of code.
for(int i = 0; i < dists.length; i++) {
Distribution nullmodel = dists[i].getNullModel();
FiniteAlphabet dna = DNATools.getDNA();
Iterator dnaSymbols = dna.iterator();
while(dnaSymbols.hasNext()) {
Symbol s = (Symbol)dnaSymbols.next();
System.out.println(s.getName() + "\t" + nullmodel.getWeight(s));
}
System.out.println();
}
//make a Weight Matrix
WeightMatrix matrix = new SimpleWeightMatrix(dists);
//the sequence to score against
Sequence seq = DNATools.createDNASequence("aaagcctaggaagaggagctgat","seq");
//annotate the sequence with the weight matrix using a low threshold (0.1)
WeightMatrixAnnotator wma = new WeightMatrixAnnotator(matrix, 0.1);
seq = wma.annotate(seq);
//output match information
for (Iterator it = seq.features(); it.hasNext(); ) {
Feature f = (Feature)it.next();
Location loc = f.getLocation();
System.out.println("Match at " + loc.getMin()+"-"+loc.getMax());
System.out.println("\tscore : "+f.getAnnotation().getProperty("score"));
}
}
}
The output is:
adenine 0.25
guanine 0.25
thymine 0.25
cytosine 0.25
adenine 0.25
guanine 0.25
thymine 0.25
cytosine 0.25
adenine 0.25
guanine 0.25
thymine 0.25
cytosine 0.25
adenine 0.25
guanine 0.25
thymine 0.25
cytosine 0.25
adenine 0.25
guanine 0.25
thymine 0.25
cytosine 0.25
Match at 8-12
score : 0.18613993419374553
Match at 11-15
score : 0.18613993419374553
Match at 14-18
score : 0.5574914238570782
I understand 0.25 is 1/4 for 4 nucleotides. However, let's look at the protein version:
import java.util.*;
import org.biojava.bio.dist.*;
import org.biojava.bio.dp.*;
import org.biojava.bio.seq.*;
import org.biojava.bio.symbol.*;
public class WeightMatrixDemo {
public static void main(String[] args) throws Exception{
//make an Alignment of a motif.
Map map = new HashMap();
map.put("seq0", ProteinTools.createProtein("aggag"));
map.put("seq1", ProteinTools.createProtein("aggaa"));
map.put("seq2", ProteinTools.createProtein("aggag"));
map.put("seq3", ProteinTools.createProtein("aagag"));
Alignment align = new SimpleAlignment(map);
//make a Distribution[] of the motif
Distribution[] dists =
DistributionTools.distOverAlignment(align, false, 0.01);
// Zhen added this block of code.
for(int i = 0; i < dists.length; i++) {
Distribution nullmodel = dists[i].getNullModel();
FiniteAlphabet pro = ProteinTools.getAlphabet();
Iterator proSymbols = pro.iterator();
while(proSymbols.hasNext()) {
Symbol s = (Symbol)proSymbols.next();
System.out.println(s.getName() + "\t" + nullmodel.getWeight(s));
}
System.out.println();
}
//make a Weight Matrix
WeightMatrix matrix = new SimpleWeightMatrix(dists);
//the sequence to score against
Sequence seq = ProteinTools.createProteinSequence("aaagcctaggaagaggagctgat","seq");
//annotate the sequence with the weight matrix using a low threshold (0.1)
WeightMatrixAnnotator wma = new WeightMatrixAnnotator(matrix, 0.1);
seq = wma.annotate(seq);
//output match information
for (Iterator it = seq.features(); it.hasNext(); ) {
Feature f = (Feature)it.next();
Location loc = f.getLocation();
System.out.println("Match at " + loc.getMin()+"-"+loc.getMax());
System.out.println("\tscore : "+f.getAnnotation().getProperty("score"));
}
}
}
The output is:
SER 0.045454545454545456
ILE 0.045454545454545456
TYR 0.045454545454545456
ARG 0.045454545454545456
PHE 0.045454545454545456
ASN 0.045454545454545456
GLY 0.045454545454545456
VAL 0.045454545454545456
CYS 0.045454545454545456
LYS 0.045454545454545456
THR 0.045454545454545456
ALA 0.045454545454545456
LEU 0.045454545454545456
MET 0.045454545454545456
PRO 0.045454545454545456
TRP 0.045454545454545456
GLU 0.045454545454545456
GLN 0.045454545454545456
SEC 0.045454545454545456
HIS 0.045454545454545456
ASP 0.045454545454545456
SER 0.045454545454545456
ILE 0.045454545454545456
TYR 0.045454545454545456
ARG 0.045454545454545456
PHE 0.045454545454545456
ASN 0.045454545454545456
GLY 0.045454545454545456
VAL 0.045454545454545456
CYS 0.045454545454545456
LYS 0.045454545454545456
THR 0.045454545454545456
ALA 0.045454545454545456
LEU 0.045454545454545456
MET 0.045454545454545456
PRO 0.045454545454545456
TRP 0.045454545454545456
GLU 0.045454545454545456
GLN 0.045454545454545456
SEC 0.045454545454545456
HIS 0.045454545454545456
ASP 0.045454545454545456
SER 0.045454545454545456
ILE 0.045454545454545456
TYR 0.045454545454545456
ARG 0.045454545454545456
PHE 0.045454545454545456
ASN 0.045454545454545456
GLY 0.045454545454545456
VAL 0.045454545454545456
CYS 0.045454545454545456
LYS 0.045454545454545456
THR 0.045454545454545456
ALA 0.045454545454545456
LEU 0.045454545454545456
MET 0.045454545454545456
PRO 0.045454545454545456
TRP 0.045454545454545456
GLU 0.045454545454545456
GLN 0.045454545454545456
SEC 0.045454545454545456
HIS 0.045454545454545456
ASP 0.045454545454545456
SER 0.045454545454545456
ILE 0.045454545454545456
TYR 0.045454545454545456
ARG 0.045454545454545456
PHE 0.045454545454545456
ASN 0.045454545454545456
GLY 0.045454545454545456
VAL 0.045454545454545456
CYS 0.045454545454545456
LYS 0.045454545454545456
THR 0.045454545454545456
ALA 0.045454545454545456
LEU 0.045454545454545456
MET 0.045454545454545456
PRO 0.045454545454545456
TRP 0.045454545454545456
GLU 0.045454545454545456
GLN 0.045454545454545456
SEC 0.045454545454545456
HIS 0.045454545454545456
ASP 0.045454545454545456
SER 0.045454545454545456
ILE 0.045454545454545456
TYR 0.045454545454545456
ARG 0.045454545454545456
PHE 0.045454545454545456
ASN 0.045454545454545456
GLY 0.045454545454545456
VAL 0.045454545454545456
CYS 0.045454545454545456
LYS 0.045454545454545456
THR 0.045454545454545456
ALA 0.045454545454545456
LEU 0.045454545454545456
MET 0.045454545454545456
PRO 0.045454545454545456
TRP 0.045454545454545456
GLU 0.045454545454545456
GLN 0.045454545454545456
SEC 0.045454545454545456
HIS 0.045454545454545456
ASP 0.045454545454545456
Match at 8-12
score : 0.1853491381982068
Match at 11-15
score : 0.18534913819820675
Match at 14-18
score : 0.5558789919338315
Now, 0.045454545454545456 is 1/22. Can anyone tell me why 22 here?
I appreciate your help.
Zhen
More information about the Biojava-l
mailing list