2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import jalview.analysis.scoremodels.ScoreModels;
24 import jalview.api.analysis.DistanceScoreModelI;
25 import jalview.api.analysis.ScoreModelI;
26 import jalview.api.analysis.SimilarityScoreModelI;
27 import jalview.datamodel.AlignmentView;
28 import jalview.datamodel.BinaryNode;
29 import jalview.datamodel.CigarArray;
30 import jalview.datamodel.NodeTransformI;
31 import jalview.datamodel.SeqCigar;
32 import jalview.datamodel.Sequence;
33 import jalview.datamodel.SequenceI;
34 import jalview.datamodel.SequenceNode;
35 import jalview.io.NewickFile;
36 import jalview.math.MatrixI;
38 import java.util.Enumeration;
39 import java.util.List;
40 import java.util.Vector;
53 public static final String AVERAGE_DISTANCE = "AV";
55 public static final String NEIGHBOUR_JOINING = "NJ";
57 public static final String FROM_FILE = "FromFile";
59 Vector<Cluster> cluster;
63 // SequenceData is a string representation of what the user
64 // sees. The display may contain hidden columns.
65 public AlignmentView seqData = null;
83 Vector<SequenceNode> groups = new Vector<SequenceNode>();
95 Vector<SequenceNode> node;
103 boolean hasDistances = true; // normal case for jalview trees
105 boolean hasBootstrap = false; // normal case for jalview trees
107 private boolean hasRootDistance = true;
110 * Create a new NJTree object with leaves associated with sequences in seqs,
111 * and original alignment data represented by Cigar strings.
120 public NJTree(SequenceI[] seqs, AlignmentView odata, NewickFile treefile)
122 this(seqs, treefile);
128 * sequenceString = new String[odata.length]; char gapChar =
129 * jalview.util.Comparison.GapChars.charAt(0); for (int i = 0; i <
130 * odata.length; i++) { SequenceI oseq_aligned = odata[i].getSeq(gapChar);
131 * sequenceString[i] = oseq_aligned.getSequence(); }
136 * Creates a new NJTree object from a tree from an external source
139 * SequenceI which should be associated with leafs of treefile
143 public NJTree(SequenceI[] seqs, NewickFile treefile)
145 this.sequence = seqs;
146 top = treefile.getTree();
149 * There is no dependent alignment to be recovered from an imported tree.
151 * if (sequenceString == null) { sequenceString = new String[seqs.length];
152 * for (int i = 0; i < seqs.length; i++) { sequenceString[i] =
153 * seqs[i].getSequence(); } }
156 hasDistances = treefile.HasDistances();
157 hasBootstrap = treefile.HasBootstrap();
158 hasRootDistance = treefile.HasRootDistance();
160 maxheight = findHeight(top);
162 SequenceIdMatcher algnIds = new SequenceIdMatcher(seqs);
164 Vector<SequenceNode> leaves = findLeaves(top);
167 int namesleft = seqs.length;
172 Vector<SequenceI> one2many = new Vector<SequenceI>();
173 int countOne2Many = 0;
174 while (i < leaves.size())
176 j = leaves.elementAt(i++);
177 realnam = j.getName();
182 nam = algnIds.findIdMatch(realnam);
188 if (one2many.contains(nam))
191 // if (jalview.bin.Cache.log.isDebugEnabled())
192 // jalview.bin.Cache.log.debug("One 2 many relationship for
197 one2many.addElement(nam);
203 j.setElement(new Sequence(realnam, "THISISAPLACEHLDER"));
204 j.setPlaceholder(true);
207 // if (jalview.bin.Cache.log.isDebugEnabled() && countOne2Many>0) {
208 // jalview.bin.Cache.log.debug("There were "+countOne2Many+" alignment
209 // sequence ids (out of "+one2many.size()+" unique ids) linked to two or
216 * Creates a new NJTree object.
229 public NJTree(SequenceI[] sqs, AlignmentView seqView, String treeType,
230 String modelType, ScoreModelI sm, int start, int end)
233 this.node = new Vector<SequenceNode>();
234 this.type = treeType;
235 this.pwtype = modelType;
238 this.seqData = seqView;
242 SeqCigar[] seqs = new SeqCigar[sequence.length];
243 for (int i = 0; i < sequence.length; i++)
245 seqs[i] = new SeqCigar(sequence[i], start, end);
247 CigarArray sdata = new CigarArray(seqs);
248 sdata.addOperation(CigarArray.M, end - start + 1);
249 this.seqData = new AlignmentView(sdata, start);
251 // System.err.println("Made seqData");// dbg
252 if (!(treeType.equals(NEIGHBOUR_JOINING)))
254 treeType = AVERAGE_DISTANCE;
257 if (sm == null && !(modelType.equals("PID")))
259 if (ScoreModels.getInstance().forName(modelType) == null)
261 modelType = "BLOSUM62";
267 done = new int[sequence.length];
269 while ((i < sequence.length) && (sequence[i] != null))
277 if (sm instanceof DistanceScoreModelI)
279 distance = ((DistanceScoreModelI) sm).findDistances(seqData);
281 else if (sm instanceof SimilarityScoreModelI)
284 * compute similarity and invert it to give a distance measure
286 MatrixI result = ((SimilarityScoreModelI) sm)
287 .findSimilarities(seqData);
288 double maxScore = result.getMaxValue();
289 result.subtractAllFrom(maxScore);
293 // System.err.println("Made distances");// dbg
295 // System.err.println("Made leaves");// dbg
297 noClus = cluster.size();
300 // System.err.println("Made clusters");// dbg
305 * Generate a string representation of the Tree
307 * @return Newick File with all tree data available
310 public String toString()
312 jalview.io.NewickFile fout = new jalview.io.NewickFile(getTopNode());
314 return fout.print(isHasBootstrap(), isHasDistances(),
315 isHasRootDistance()); // output all data available for tree
320 * used when the alignment associated to a tree has changed.
323 * Sequence set to be associated with tree nodes
325 public void UpdatePlaceHolders(List<SequenceI> list)
327 Vector<SequenceNode> leaves = findLeaves(top);
329 int sz = leaves.size();
330 SequenceIdMatcher seqmatcher = null;
335 SequenceNode leaf = leaves.elementAt(i++);
337 if (list.contains(leaf.element()))
339 leaf.setPlaceholder(false);
343 if (seqmatcher == null)
345 // Only create this the first time we need it
346 SequenceI[] seqs = new SequenceI[list.size()];
348 for (int j = 0; j < seqs.length; j++)
350 seqs[j] = list.get(j);
353 seqmatcher = new SequenceIdMatcher(seqs);
356 SequenceI nam = seqmatcher.findIdMatch(leaf.getName());
360 if (!leaf.isPlaceholder())
362 // remapping the node to a new sequenceI - should remove any refs to
364 // TODO - make many sequenceI to one leaf mappings possible!
367 leaf.setPlaceholder(false);
368 leaf.setElement(nam);
372 if (!leaf.isPlaceholder())
374 // Construct a new placeholder sequence object for this leaf
375 leaf.setElement(new Sequence(leaf.getName(),
376 "THISISAPLACEHLDER"));
378 leaf.setPlaceholder(true);
386 * rename any nodes according to their associated sequence. This will modify
387 * the tree's metadata! (ie the original NewickFile or newly generated
388 * BinaryTree's label data)
390 public void renameAssociatedNodes()
392 applyToNodes(new NodeTransformI()
396 public void transform(BinaryNode nd)
398 Object el = nd.element();
399 if (el != null && el instanceof SequenceI)
401 nd.setName(((SequenceI) el).getName());
410 public void cluster()
414 if (type.equals(NEIGHBOUR_JOINING))
423 Cluster c = joinClusters(mini, minj);
427 cluster.setElementAt(null, minj);
428 cluster.setElementAt(c, mini);
433 boolean onefound = false;
438 for (int i = 0; i < noseqs; i++)
442 if (onefound == false)
454 joinClusters(one, two);
455 top = (node.elementAt(one));
470 * @return DOCUMENT ME!
472 public Cluster joinClusters(int i, int j)
474 double dist = distance.getValue(i, j);
476 int noi = cluster.elementAt(i).value.length;
477 int noj = cluster.elementAt(j).value.length;
479 int[] value = new int[noi + noj];
481 for (int ii = 0; ii < noi; ii++)
483 value[ii] = cluster.elementAt(i).value[ii];
486 for (int ii = noi; ii < (noi + noj); ii++)
488 value[ii] = cluster.elementAt(j).value[ii - noi];
491 Cluster c = new Cluster(value);
496 if (type.equals(NEIGHBOUR_JOINING))
498 findClusterNJDistance(i, j);
502 findClusterDistance(i, j);
505 SequenceNode sn = new SequenceNode();
507 sn.setLeft((node.elementAt(i)));
508 sn.setRight((node.elementAt(j)));
510 SequenceNode tmpi = (node.elementAt(i));
511 SequenceNode tmpj = (node.elementAt(j));
513 if (type.equals(NEIGHBOUR_JOINING))
515 findNewNJDistances(tmpi, tmpj, dist);
519 findNewDistances(tmpi, tmpj, dist);
525 node.setElementAt(sn, i);
540 public void findNewNJDistances(SequenceNode tmpi, SequenceNode tmpj,
544 tmpi.dist = ((dist + ri) - rj) / 2;
545 tmpj.dist = (dist - tmpi.dist);
568 public void findNewDistances(SequenceNode tmpi, SequenceNode tmpj,
574 SequenceNode sni = tmpi;
575 SequenceNode snj = tmpj;
580 sni = (SequenceNode) sni.left();
586 snj = (SequenceNode) snj.left();
589 tmpi.dist = ((dist / 2) - ih);
590 tmpj.dist = ((dist / 2) - jh);
601 public void findClusterDistance(int i, int j)
603 int noi = cluster.elementAt(i).value.length;
604 int noj = cluster.elementAt(j).value.length;
606 // New distances from cluster to others
607 double[] newdist = new double[noseqs];
609 for (int l = 0; l < noseqs; l++)
611 if ((l != i) && (l != j))
613 // newdist[l] = ((distance[i][l] * noi) + (distance[j][l] * noj))
615 newdist[l] = ((distance.getValue(i, l) * noi) + (distance.getValue(
625 for (int ii = 0; ii < noseqs; ii++)
627 // distance[i][ii] = newdist[ii];
628 // distance[ii][i] = newdist[ii];
629 distance.setValue(i, ii, newdist[ii]);
630 distance.setValue(ii, i, newdist[ii]);
642 public void findClusterNJDistance(int i, int j)
645 // New distances from cluster to others
646 double[] newdist = new double[noseqs];
648 for (int l = 0; l < noseqs; l++)
650 if ((l != i) && (l != j))
652 // newdist[l] = ((distance[i][l] + distance[j][l]) - distance[i][j]) /
654 newdist[l] = (distance.getValue(i, l) + distance.getValue(j, l) - distance
655 .getValue(i, j)) / 2;
663 for (int ii = 0; ii < noseqs; ii++)
665 // distance[i][ii] = newdist[ii];
666 // distance[ii][i] = newdist[ii];
667 distance.setValue(i, ii, newdist[ii]);
668 distance.setValue(ii, i, newdist[ii]);
680 * @return DOCUMENT ME!
682 public double findr(int i, int j)
686 for (int k = 0; k < noseqs; k++)
688 if ((k != i) && (k != j) && (done[k] != 1))
690 // tmp = tmp + distance[i][k];
691 tmp = tmp + distance.getValue(i, k);
697 tmp = tmp / (noClus - 2);
706 * @return DOCUMENT ME!
708 public double findMinNJDistance()
710 double min = Double.MAX_VALUE;
712 for (int i = 0; i < (noseqs - 1); i++)
714 for (int j = i + 1; j < noseqs; j++)
716 if ((done[i] != 1) && (done[j] != 1))
718 // float tmp = distance[i][j] - (findr(i, j) + findr(j, i));
719 double tmp = distance.getValue(i, j)
720 - (findr(i, j) + findr(j, i));
739 * @return DOCUMENT ME!
741 public double findMinDistance()
743 double min = Double.MAX_VALUE;
745 for (int i = 0; i < (noseqs - 1); i++)
747 for (int j = i + 1; j < noseqs; j++)
749 if ((done[i] != 1) && (done[j] != 1))
751 // if (distance[i][j] < min)
752 if (distance.getValue(i, j) < min)
757 // min = distance[i][j];
758 min = distance.getValue(i, j);
768 * Calculate a distance matrix given the sequence input data and score model
772 public MatrixI findDistances(ScoreModelI scoreModel)
774 MatrixI result = null;
776 if (scoreModel == null)
778 // Resolve substitution model
779 scoreModel = ScoreModels.getInstance().forName(pwtype);
780 if (scoreModel == null)
782 scoreModel = ScoreModels.getInstance().forName("BLOSUM62");
785 if (scoreModel instanceof DistanceScoreModelI)
787 result = ((DistanceScoreModelI) scoreModel).findDistances(seqData);
789 else if (scoreModel instanceof SimilarityScoreModelI)
792 * compute similarity and invert it to give a distance measure
794 result = ((SimilarityScoreModelI) scoreModel)
795 .findSimilarities(seqData);
796 double maxScore = result.getMaxValue();
797 result.subtractAllFrom(maxScore);
802 .println("Unexpected type of score model, can't compute distances");
811 public void makeLeaves()
813 cluster = new Vector<Cluster>();
815 for (int i = 0; i < noseqs; i++)
817 SequenceNode sn = new SequenceNode();
819 sn.setElement(sequence[i]);
820 sn.setName(sequence[i].getName());
823 int[] value = new int[1];
826 Cluster c = new Cluster(value);
827 cluster.addElement(c);
832 * Search for leaf nodes below (or at) the given node
835 * root node to search from
839 public Vector<SequenceNode> findLeaves(SequenceNode nd)
841 Vector<SequenceNode> leaves = new Vector<SequenceNode>();
842 findLeaves(nd, leaves);
847 * Search for leaf nodes.
850 * root node to search from
852 * Vector of leaves to add leaf node objects too.
854 * @return Vector of leaf nodes on binary tree
856 Vector<SequenceNode> findLeaves(SequenceNode nd,
857 Vector<SequenceNode> leaves)
864 if ((nd.left() == null) && (nd.right() == null)) // Interior node
867 leaves.addElement(nd);
874 * TODO: Identify internal nodes... if (node.isSequenceLabel()) {
875 * leaves.addElement(node); }
877 findLeaves((SequenceNode) nd.left(), leaves);
878 findLeaves((SequenceNode) nd.right(), leaves);
885 * Find the leaf node with a particular ycount
888 * initial point on tree to search from
890 * value to search for
892 * @return null or the node with ycound=count
894 public Object findLeaf(SequenceNode nd, int count)
896 found = _findLeaf(nd, count);
902 * #see findLeaf(SequenceNode node, count)
904 public Object _findLeaf(SequenceNode nd, int count)
911 if (nd.ycount == count)
913 found = nd.element();
919 _findLeaf((SequenceNode) nd.left(), count);
920 _findLeaf((SequenceNode) nd.right(), count);
927 * printNode is mainly for debugging purposes.
932 public void printNode(SequenceNode nd)
939 if ((nd.left() == null) && (nd.right() == null))
941 System.out.println("Leaf = " + ((SequenceI) nd.element()).getName());
942 System.out.println("Dist " + nd.dist);
943 System.out.println("Boot " + nd.getBootstrap());
947 System.out.println("Dist " + nd.dist);
948 printNode((SequenceNode) nd.left());
949 printNode((SequenceNode) nd.right());
959 public void findMaxDist(SequenceNode nd)
966 if ((nd.left() == null) && (nd.right() == null))
968 double dist = nd.dist;
970 if (dist > maxDistValue)
978 findMaxDist((SequenceNode) nd.left());
979 findMaxDist((SequenceNode) nd.right());
986 * @return DOCUMENT ME!
988 public Vector<SequenceNode> getGroups()
996 * @return DOCUMENT ME!
998 public double getMaxHeight()
1011 public void groupNodes(SequenceNode nd, float threshold)
1018 if ((nd.height / maxheight) > threshold)
1020 groups.addElement(nd);
1024 groupNodes((SequenceNode) nd.left(), threshold);
1025 groupNodes((SequenceNode) nd.right(), threshold);
1035 * @return DOCUMENT ME!
1037 public double findHeight(SequenceNode nd)
1044 if ((nd.left() == null) && (nd.right() == null))
1046 nd.height = ((SequenceNode) nd.parent()).height + nd.dist;
1048 if (nd.height > maxheight)
1059 if (nd.parent() != null)
1061 nd.height = ((SequenceNode) nd.parent()).height + nd.dist;
1066 nd.height = (float) 0.0;
1069 maxheight = findHeight((SequenceNode) (nd.left()));
1070 maxheight = findHeight((SequenceNode) (nd.right()));
1079 * @return DOCUMENT ME!
1081 public SequenceNode reRoot()
1083 if (maxdist != null)
1087 double tmpdist = maxdist.dist;
1090 SequenceNode sn = new SequenceNode();
1093 // New right hand of top
1094 SequenceNode snr = (SequenceNode) maxdist.parent();
1095 changeDirection(snr, maxdist);
1096 System.out.println("Printing reversed tree");
1098 snr.dist = tmpdist / 2;
1099 maxdist.dist = tmpdist / 2;
1102 maxdist.setParent(sn);
1105 sn.setLeft(maxdist);
1119 * @return true if original sequence data can be recovered
1121 public boolean hasOriginalSequenceData()
1123 return seqData != null;
1127 * Returns original alignment data used for calculation - or null where not
1130 * @return null or cut'n'pasteable alignment
1132 public String printOriginalSequenceData(char gapChar)
1134 if (seqData == null)
1139 StringBuffer sb = new StringBuffer();
1140 String[] seqdatas = seqData.getSequenceStrings(gapChar);
1141 for (int i = 0; i < seqdatas.length; i++)
1143 sb.append(new jalview.util.Format("%-" + 15 + "s").form(sequence[i]
1145 sb.append(" " + seqdatas[i] + "\n");
1147 return sb.toString();
1156 public void printN(SequenceNode nd)
1163 if ((nd.left() != null) && (nd.right() != null))
1165 printN((SequenceNode) nd.left());
1166 printN((SequenceNode) nd.right());
1170 System.out.println(" name = " + ((SequenceI) nd.element()).getName());
1173 System.out.println(" dist = " + nd.dist + " " + nd.count + " "
1183 public void reCount(SequenceNode nd)
1187 // _lylimit = this.node.size();
1191 private long _lycount = 0, _lylimit = 0;
1199 public void _reCount(SequenceNode nd)
1201 // if (_lycount<_lylimit)
1203 // System.err.println("Warning: depth of _recount greater than number of nodes.");
1211 if ((nd.left() != null) && (nd.right() != null))
1214 _reCount((SequenceNode) nd.left());
1215 _reCount((SequenceNode) nd.right());
1217 SequenceNode l = (SequenceNode) nd.left();
1218 SequenceNode r = (SequenceNode) nd.right();
1220 nd.count = l.count + r.count;
1221 nd.ycount = (l.ycount + r.ycount) / 2;
1226 nd.ycount = ycount++;
1237 public void swapNodes(SequenceNode nd)
1244 SequenceNode tmp = (SequenceNode) nd.left();
1246 nd.setLeft(nd.right());
1258 public void changeDirection(SequenceNode nd, SequenceNode dir)
1265 if (nd.parent() != top)
1267 changeDirection((SequenceNode) nd.parent(), nd);
1269 SequenceNode tmp = (SequenceNode) nd.parent();
1271 if (dir == nd.left())
1276 else if (dir == nd.right())
1284 if (dir == nd.left())
1286 nd.setParent(nd.left());
1288 if (top.left() == nd)
1290 nd.setRight(top.right());
1294 nd.setRight(top.left());
1299 nd.setParent(nd.right());
1301 if (top.left() == nd)
1303 nd.setLeft(top.right());
1307 nd.setLeft(top.left());
1316 * @return DOCUMENT ME!
1318 public SequenceNode getMaxDist()
1326 * @return DOCUMENT ME!
1328 public SequenceNode getTopNode()
1335 * @return true if tree has real distances
1337 public boolean isHasDistances()
1339 return hasDistances;
1344 * @return true if tree has real bootstrap values
1346 public boolean isHasBootstrap()
1348 return hasBootstrap;
1351 public boolean isHasRootDistance()
1353 return hasRootDistance;
1357 * apply the given transform to all the nodes in the tree.
1359 * @param nodeTransformI
1361 public void applyToNodes(NodeTransformI nodeTransformI)
1363 for (Enumeration<SequenceNode> nodes = node.elements(); nodes
1364 .hasMoreElements(); nodeTransformI.transform(nodes
1376 * @version $Revision$
1378 // TODO what does this class have that int[] doesn't have already?
1384 * Creates a new Cluster object.
1389 public Cluster(int[] value)