2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import jalview.api.analysis.DistanceScoreModelI;
24 import jalview.api.analysis.ScoreModelI;
25 import jalview.api.analysis.SimilarityParamsI;
26 import jalview.api.analysis.SimilarityScoreModelI;
27 import jalview.datamodel.AlignmentView;
28 import jalview.datamodel.BinaryNode;
29 import jalview.datamodel.CigarArray;
30 import jalview.datamodel.NodeTransformI;
31 import jalview.datamodel.SeqCigar;
32 import jalview.datamodel.Sequence;
33 import jalview.datamodel.SequenceI;
34 import jalview.datamodel.SequenceNode;
35 import jalview.io.NewickFile;
36 import jalview.math.MatrixI;
37 import jalview.viewmodel.AlignmentViewport;
39 import java.util.BitSet;
40 import java.util.Enumeration;
41 import java.util.List;
42 import java.util.Vector;
52 public static final String AVERAGE_DISTANCE = "AV";
54 public static final String NEIGHBOUR_JOINING = "NJ";
56 public static final String FROM_FILE = "FromFile";
59 * Bit j in each BitSet is set if the cluster includes the j'th sequence.
60 * Clusters are grouped as the tree is built, from an initial state
61 * where each cluster is a single sequence, until only two clusters are left.
62 * These are the children of the root of the tree.
64 Vector<BitSet> clusters;
66 SequenceI[] sequences;
69 * SequenceData is a string representation of what the user
70 * sees. The display may contain hidden columns.
72 public AlignmentView seqData = null;
75 * Bit j is set when cluster j has been combined to another cluster.
76 * The last two bits left unset are the indices of the clusters which
77 * are the children of the root node.
86 * Value [i, j] is the distance between cluster[i] and cluster[j].
87 * Initially these are the pairwise distances of all sequences.
88 * As the tree is built, these are updated to be the distances
89 * between the clusters as they are assembled.
101 Vector<SequenceNode> groups = new Vector<SequenceNode>();
103 SequenceNode maxdist;
113 Vector<SequenceNode> node;
119 boolean hasDistances = true; // normal case for jalview trees
121 boolean hasBootstrap = false; // normal case for jalview trees
123 private boolean hasRootDistance = true;
126 * Create a new NJTree object with leaves associated with sequences in seqs,
127 * and original alignment data represented by Cigar strings.
136 public NJTree(SequenceI[] seqs, AlignmentView odata, NewickFile treefile)
138 this(seqs, treefile);
144 * sequenceString = new String[odata.length]; char gapChar =
145 * jalview.util.Comparison.GapChars.charAt(0); for (int i = 0; i <
146 * odata.length; i++) { SequenceI oseq_aligned = odata[i].getSeq(gapChar);
147 * sequenceString[i] = oseq_aligned.getSequence(); }
152 * Creates a new NJTree object from a tree from an external source
155 * SequenceI which should be associated with leafs of treefile
159 public NJTree(SequenceI[] seqs, NewickFile treefile)
161 this.sequences = seqs;
162 top = treefile.getTree();
165 * There is no dependent alignment to be recovered from an imported tree.
167 * if (sequenceString == null) { sequenceString = new String[seqs.length];
168 * for (int i = 0; i < seqs.length; i++) { sequenceString[i] =
169 * seqs[i].getSequence(); } }
172 hasDistances = treefile.HasDistances();
173 hasBootstrap = treefile.HasBootstrap();
174 hasRootDistance = treefile.HasRootDistance();
176 maxheight = findHeight(top);
178 SequenceIdMatcher algnIds = new SequenceIdMatcher(seqs);
180 Vector<SequenceNode> leaves = findLeaves(top);
183 int namesleft = seqs.length;
188 Vector<SequenceI> one2many = new Vector<SequenceI>();
189 int countOne2Many = 0;
190 while (i < leaves.size())
192 j = leaves.elementAt(i++);
193 realnam = j.getName();
198 nam = algnIds.findIdMatch(realnam);
204 if (one2many.contains(nam))
207 // if (jalview.bin.Cache.log.isDebugEnabled())
208 // jalview.bin.Cache.log.debug("One 2 many relationship for
213 one2many.addElement(nam);
219 j.setElement(new Sequence(realnam, "THISISAPLACEHLDER"));
220 j.setPlaceholder(true);
223 // if (jalview.bin.Cache.log.isDebugEnabled() && countOne2Many>0) {
224 // jalview.bin.Cache.log.debug("There were "+countOne2Many+" alignment
225 // sequence ids (out of "+one2many.size()+" unique ids) linked to two or
232 * Constructor given a viewport, tree type and score model
235 * the current alignment viewport
239 * a distance or similarity score model to use to compute the tree
240 * @param scoreParameters TODO
242 public NJTree(AlignmentViewport av, String treeType, ScoreModelI sm, SimilarityParamsI scoreParameters)
244 // TODO handle type "FromFile" more elegantly
245 if (!(treeType.equals(NEIGHBOUR_JOINING)))
247 treeType = AVERAGE_DISTANCE;
249 this.type = treeType;
251 boolean selview = av.getSelectionGroup() != null
252 && av.getSelectionGroup().getSize() > 1;
253 AlignmentView seqStrings = av.getAlignmentView(selview);
257 end = av.getAlignment().getWidth();
258 this.sequences = av.getAlignment().getSequencesArray();
262 start = av.getSelectionGroup().getStartRes();
263 end = av.getSelectionGroup().getEndRes() + 1;
264 this.sequences = av.getSelectionGroup().getSequencesInOrder(
268 init(seqStrings, start, end);
270 computeTree(sm, scoreParameters);
273 void init(AlignmentView seqView, int start, int end)
275 this.node = new Vector<SequenceNode>();
278 this.seqData = seqView;
282 SeqCigar[] seqs = new SeqCigar[sequences.length];
283 for (int i = 0; i < sequences.length; i++)
285 seqs[i] = new SeqCigar(sequences[i], start, end);
287 CigarArray sdata = new CigarArray(seqs);
288 sdata.addOperation(CigarArray.M, end - start + 1);
289 this.seqData = new AlignmentView(sdata, start);
293 * count the non-null sequences
299 for (SequenceI seq : sequences)
309 * Calculates the tree using the given score model and parameters, and the
310 * configured tree type
312 * If the score model computes pairwise distance scores, then these are used
313 * directly to derive the tree
315 * If the score model computes similarity scores, then the range of the scores
316 * is reversed to give a distance measure, and this is used to derive the tree
319 * @param scoreOptions
321 protected void computeTree(ScoreModelI sm, SimilarityParamsI scoreOptions)
323 if (sm instanceof DistanceScoreModelI)
325 distances = ((DistanceScoreModelI) sm).findDistances(seqData,
328 else if (sm instanceof SimilarityScoreModelI)
331 * compute similarity and invert it to give a distance measure
333 MatrixI result = ((SimilarityScoreModelI) sm).findSimilarities(
334 seqData, scoreOptions);
335 result.reverseRange(true);
341 noClus = clusters.size();
347 * Generate a string representation of the Tree
349 * @return Newick File with all tree data available
352 public String toString()
354 jalview.io.NewickFile fout = new jalview.io.NewickFile(getTopNode());
356 return fout.print(hasBootstrap(), hasDistances(),
357 hasRootDistance()); // output all data available for tree
362 * used when the alignment associated to a tree has changed.
365 * Sequence set to be associated with tree nodes
367 public void updatePlaceHolders(List<SequenceI> list)
369 Vector<SequenceNode> leaves = findLeaves(top);
371 int sz = leaves.size();
372 SequenceIdMatcher seqmatcher = null;
377 SequenceNode leaf = leaves.elementAt(i++);
379 if (list.contains(leaf.element()))
381 leaf.setPlaceholder(false);
385 if (seqmatcher == null)
387 // Only create this the first time we need it
388 SequenceI[] seqs = new SequenceI[list.size()];
390 for (int j = 0; j < seqs.length; j++)
392 seqs[j] = list.get(j);
395 seqmatcher = new SequenceIdMatcher(seqs);
398 SequenceI nam = seqmatcher.findIdMatch(leaf.getName());
402 if (!leaf.isPlaceholder())
404 // remapping the node to a new sequenceI - should remove any refs to
406 // TODO - make many sequenceI to one leaf mappings possible!
409 leaf.setPlaceholder(false);
410 leaf.setElement(nam);
414 if (!leaf.isPlaceholder())
416 // Construct a new placeholder sequence object for this leaf
417 leaf.setElement(new Sequence(leaf.getName(),
418 "THISISAPLACEHLDER"));
420 leaf.setPlaceholder(true);
428 * rename any nodes according to their associated sequence. This will modify
429 * the tree's metadata! (ie the original NewickFile or newly generated
430 * BinaryTree's label data)
432 public void renameAssociatedNodes()
434 applyToNodes(new NodeTransformI()
438 public void transform(BinaryNode nd)
440 Object el = nd.element();
441 if (el != null && el instanceof SequenceI)
443 nd.setName(((SequenceI) el).getName());
450 * Form clusters by grouping sub-clusters, starting from one sequence per
451 * cluster, and finishing when only two clusters remain
457 if (type.equals(NEIGHBOUR_JOINING))
466 BitSet combined = joinClusters(mini, minj);
470 clusters.setElementAt(null, minj);
471 clusters.setElementAt(combined, mini);
476 int rightChild = done.nextClearBit(0);
477 int leftChild = done.nextClearBit(rightChild + 1);
479 joinClusters(leftChild, rightChild);
480 top = (node.elementAt(leftChild));
495 * @return DOCUMENT ME!
497 BitSet joinClusters(int i, int j)
499 double dist = distances.getValue(i, j);
501 BitSet combined = new BitSet();
502 combined.or(clusters.get(i));
503 combined.or(clusters.get(j));
508 if (type.equals(NEIGHBOUR_JOINING))
510 findClusterNJDistance(i, j);
514 findClusterDistance(i, j);
517 SequenceNode sn = new SequenceNode();
519 sn.setLeft((node.elementAt(i)));
520 sn.setRight((node.elementAt(j)));
522 SequenceNode tmpi = (node.elementAt(i));
523 SequenceNode tmpj = (node.elementAt(j));
525 if (type.equals(NEIGHBOUR_JOINING))
527 findNewNJDistances(tmpi, tmpj, dist);
531 findNewDistances(tmpi, tmpj, dist);
537 node.setElementAt(sn, i);
552 void findNewNJDistances(SequenceNode tmpi, SequenceNode tmpj,
556 tmpi.dist = ((dist + ri) - rj) / 2;
557 tmpj.dist = (dist - tmpi.dist);
580 void findNewDistances(SequenceNode tmpi, SequenceNode tmpj,
586 SequenceNode sni = tmpi;
587 SequenceNode snj = tmpj;
592 sni = (SequenceNode) sni.left();
598 snj = (SequenceNode) snj.left();
601 tmpi.dist = ((dist / 2) - ih);
602 tmpj.dist = ((dist / 2) - jh);
613 void findClusterDistance(int i, int j)
615 int noi = clusters.elementAt(i).cardinality();
616 int noj = clusters.elementAt(j).cardinality();
618 // New distances from cluster to others
619 double[] newdist = new double[noseqs];
621 for (int l = 0; l < noseqs; l++)
623 if ((l != i) && (l != j))
625 newdist[l] = ((distances.getValue(i, l) * noi) + (distances.getValue(
635 for (int ii = 0; ii < noseqs; ii++)
637 distances.setValue(i, ii, newdist[ii]);
638 distances.setValue(ii, i, newdist[ii]);
650 void findClusterNJDistance(int i, int j)
653 // New distances from cluster to others
654 double[] newdist = new double[noseqs];
656 for (int l = 0; l < noseqs; l++)
658 if ((l != i) && (l != j))
660 newdist[l] = (distances.getValue(i, l) + distances.getValue(j, l) - distances
661 .getValue(i, j)) / 2;
669 for (int ii = 0; ii < noseqs; ii++)
671 distances.setValue(i, ii, newdist[ii]);
672 distances.setValue(ii, i, newdist[ii]);
684 * @return DOCUMENT ME!
686 double findr(int i, int j)
690 for (int k = 0; k < noseqs; k++)
692 if ((k != i) && (k != j) && (!done.get(k)))
694 tmp = tmp + distances.getValue(i, k);
700 tmp = tmp / (noClus - 2);
709 * @return DOCUMENT ME!
711 double findMinNJDistance()
713 double min = Double.MAX_VALUE;
715 for (int i = 0; i < (noseqs - 1); i++)
717 for (int j = i + 1; j < noseqs; j++)
719 if (!done.get(i) && !done.get(j))
721 double tmp = distances.getValue(i, j)
722 - (findr(i, j) + findr(j, i));
741 * @return DOCUMENT ME!
743 double findMinDistance()
745 double min = Double.MAX_VALUE;
747 for (int i = 0; i < (noseqs - 1); i++)
749 for (int j = i + 1; j < noseqs; j++)
751 if (!done.get(i) && !done.get(j))
753 if (distances.getValue(i, j) < min)
758 min = distances.getValue(i, j);
768 * Start by making a cluster for each individual sequence
772 clusters = new Vector<BitSet>();
774 for (int i = 0; i < noseqs; i++)
776 SequenceNode sn = new SequenceNode();
778 sn.setElement(sequences[i]);
779 sn.setName(sequences[i].getName());
781 BitSet bs = new BitSet();
783 clusters.addElement(bs);
788 * Search for leaf nodes below (or at) the given node
791 * root node to search from
795 public Vector<SequenceNode> findLeaves(SequenceNode nd)
797 Vector<SequenceNode> leaves = new Vector<SequenceNode>();
798 findLeaves(nd, leaves);
803 * Search for leaf nodes.
806 * root node to search from
808 * Vector of leaves to add leaf node objects too.
810 * @return Vector of leaf nodes on binary tree
812 Vector<SequenceNode> findLeaves(SequenceNode nd,
813 Vector<SequenceNode> leaves)
820 if ((nd.left() == null) && (nd.right() == null)) // Interior node
823 leaves.addElement(nd);
830 * TODO: Identify internal nodes... if (node.isSequenceLabel()) {
831 * leaves.addElement(node); }
833 findLeaves((SequenceNode) nd.left(), leaves);
834 findLeaves((SequenceNode) nd.right(), leaves);
841 * printNode is mainly for debugging purposes.
846 void printNode(SequenceNode nd)
853 if ((nd.left() == null) && (nd.right() == null))
855 System.out.println("Leaf = " + ((SequenceI) nd.element()).getName());
856 System.out.println("Dist " + nd.dist);
857 System.out.println("Boot " + nd.getBootstrap());
861 System.out.println("Dist " + nd.dist);
862 printNode((SequenceNode) nd.left());
863 printNode((SequenceNode) nd.right());
873 void findMaxDist(SequenceNode nd)
880 if ((nd.left() == null) && (nd.right() == null))
882 double dist = nd.dist;
884 if (dist > maxDistValue)
892 findMaxDist((SequenceNode) nd.left());
893 findMaxDist((SequenceNode) nd.right());
900 * @return DOCUMENT ME!
902 public Vector<SequenceNode> getGroups()
910 * @return DOCUMENT ME!
912 public double getMaxHeight()
925 public void groupNodes(SequenceNode nd, float threshold)
932 if ((nd.height / maxheight) > threshold)
934 groups.addElement(nd);
938 groupNodes((SequenceNode) nd.left(), threshold);
939 groupNodes((SequenceNode) nd.right(), threshold);
949 * @return DOCUMENT ME!
951 public double findHeight(SequenceNode nd)
958 if ((nd.left() == null) && (nd.right() == null))
960 nd.height = ((SequenceNode) nd.parent()).height + nd.dist;
962 if (nd.height > maxheight)
973 if (nd.parent() != null)
975 nd.height = ((SequenceNode) nd.parent()).height + nd.dist;
980 nd.height = (float) 0.0;
983 maxheight = findHeight((SequenceNode) (nd.left()));
984 maxheight = findHeight((SequenceNode) (nd.right()));
993 * @return DOCUMENT ME!
995 SequenceNode reRoot()
997 // TODO not used - remove?
1002 double tmpdist = maxdist.dist;
1005 SequenceNode sn = new SequenceNode();
1008 // New right hand of top
1009 SequenceNode snr = (SequenceNode) maxdist.parent();
1010 changeDirection(snr, maxdist);
1011 System.out.println("Printing reversed tree");
1013 snr.dist = tmpdist / 2;
1014 maxdist.dist = tmpdist / 2;
1017 maxdist.setParent(sn);
1020 sn.setLeft(maxdist);
1034 * @return true if original sequence data can be recovered
1036 public boolean hasOriginalSequenceData()
1038 return seqData != null;
1042 * Returns original alignment data used for calculation - or null where not
1045 * @return null or cut'n'pasteable alignment
1047 public String printOriginalSequenceData(char gapChar)
1049 if (seqData == null)
1054 StringBuffer sb = new StringBuffer();
1055 String[] seqdatas = seqData.getSequenceStrings(gapChar);
1056 for (int i = 0; i < seqdatas.length; i++)
1058 sb.append(new jalview.util.Format("%-" + 15 + "s").form(sequences[i]
1060 sb.append(" " + seqdatas[i] + "\n");
1062 return sb.toString();
1071 void printN(SequenceNode nd)
1078 if ((nd.left() != null) && (nd.right() != null))
1080 printN((SequenceNode) nd.left());
1081 printN((SequenceNode) nd.right());
1085 System.out.println(" name = " + ((SequenceI) nd.element()).getName());
1088 System.out.println(" dist = " + nd.dist + " " + nd.count + " "
1098 public void reCount(SequenceNode nd)
1102 // _lylimit = this.node.size();
1106 // private long _lycount = 0, _lylimit = 0;
1114 void _reCount(SequenceNode nd)
1116 // if (_lycount<_lylimit)
1118 // System.err.println("Warning: depth of _recount greater than number of nodes.");
1126 if ((nd.left() != null) && (nd.right() != null))
1129 _reCount((SequenceNode) nd.left());
1130 _reCount((SequenceNode) nd.right());
1132 SequenceNode l = (SequenceNode) nd.left();
1133 SequenceNode r = (SequenceNode) nd.right();
1135 nd.count = l.count + r.count;
1136 nd.ycount = (l.ycount + r.ycount) / 2;
1141 nd.ycount = ycount++;
1152 public void swapNodes(SequenceNode nd)
1159 SequenceNode tmp = (SequenceNode) nd.left();
1161 nd.setLeft(nd.right());
1173 void changeDirection(SequenceNode nd, SequenceNode dir)
1180 if (nd.parent() != top)
1182 changeDirection((SequenceNode) nd.parent(), nd);
1184 SequenceNode tmp = (SequenceNode) nd.parent();
1186 if (dir == nd.left())
1191 else if (dir == nd.right())
1199 if (dir == nd.left())
1201 nd.setParent(nd.left());
1203 if (top.left() == nd)
1205 nd.setRight(top.right());
1209 nd.setRight(top.left());
1214 nd.setParent(nd.right());
1216 if (top.left() == nd)
1218 nd.setLeft(top.right());
1222 nd.setLeft(top.left());
1231 * @return DOCUMENT ME!
1233 public SequenceNode getMaxDist()
1241 * @return DOCUMENT ME!
1243 public SequenceNode getTopNode()
1250 * @return true if tree has real distances
1252 public boolean hasDistances()
1254 return hasDistances;
1259 * @return true if tree has real bootstrap values
1261 public boolean hasBootstrap()
1263 return hasBootstrap;
1266 public boolean hasRootDistance()
1268 return hasRootDistance;
1272 * apply the given transform to all the nodes in the tree.
1274 * @param nodeTransformI
1276 public void applyToNodes(NodeTransformI nodeTransformI)
1278 for (Enumeration<SequenceNode> nodes = node.elements(); nodes
1279 .hasMoreElements(); nodeTransformI.transform(nodes