2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import jalview.api.analysis.ScoreModelI;
24 import jalview.datamodel.AlignmentView;
25 import jalview.datamodel.BinaryNode;
26 import jalview.datamodel.CigarArray;
27 import jalview.datamodel.NodeTransformI;
28 import jalview.datamodel.SeqCigar;
29 import jalview.datamodel.Sequence;
30 import jalview.datamodel.SequenceI;
31 import jalview.datamodel.SequenceNode;
32 import jalview.io.NewickFile;
33 import jalview.schemes.ResidueProperties;
35 import java.util.Enumeration;
36 import java.util.List;
37 import java.util.Vector;
// Working set of clusters: created by makeLeaves() and updated in place by cluster() as pairs are joined.
47 Vector<Cluster> cluster;
51 // SequenceData is a string representation of what the user
52 // sees. The display may contain hidden columns.
53 public AlignmentView seqData = null;
// Nodes collected by groupNodes() when their height relative to maxheight exceeds the threshold.
71 Vector<SequenceNode> groups = new Vector<SequenceNode>();
// Tree nodes addressed by cluster index; joinClusters() overwrites slot i with the merged parent node.
83 Vector<SequenceNode> node;
// The three flags below are overwritten from the NewickFile when a tree is imported.
91 boolean hasDistances = true; // normal case for jalview trees
93 boolean hasBootstrap = false; // normal case for jalview trees
95 private boolean hasRootDistance = true;
98 * Create a new NJTree object with leaves associated with sequences in seqs,
99 * and original alignment data represented by Cigar strings.
// Builds a tree from an imported Newick file by delegating to the (seqs, treefile) constructor.
// NOTE(review): no visible line in this view stores odata - confirm how it is retained.
108 public NJTree(SequenceI[] seqs, AlignmentView odata, NewickFile treefile)
110 this(seqs, treefile);
// The lines below appear to be a retained, inactive snippet that rebuilt per-sequence strings from odata.
116 * sequenceString = new String[odata.length]; char gapChar =
117 * jalview.util.Comparison.GapChars.charAt(0); for (int i = 0; i <
118 * odata.length; i++) { SequenceI oseq_aligned = odata[i].getSeq(gapChar);
119 * sequenceString[i] = oseq_aligned.getSequence(); }
124 * Creates a new NJTree object from a tree from an external source
127 * SequenceI which should be associated with leaves of treefile
// Builds the tree from an externally supplied Newick tree and binds each leaf to one of seqs by id lookup.
131 public NJTree(SequenceI[] seqs, NewickFile treefile)
133 this.sequence = seqs;
134 top = treefile.getTree();
137 * There is no dependent alignment to be recovered from an imported tree.
139 * if (sequenceString == null) { sequenceString = new String[seqs.length];
140 * for (int i = 0; i < seqs.length; i++) { sequenceString[i] =
141 * seqs[i].getSequence(); } }
// Adopt what the Newick file reports about distances, bootstrap values and root distance.
144 hasDistances = treefile.HasDistances();
145 hasBootstrap = treefile.HasBootstrap();
146 hasRootDistance = treefile.HasRootDistance();
148 maxheight = findHeight(top);
// Matcher used to look up alignment sequences by leaf name.
150 SequenceIdMatcher algnIds = new SequenceIdMatcher(seqs);
152 Vector<SequenceNode> leaves = findLeaves(top);
155 int namesleft = seqs.length;
// Sequences already bound to a leaf; a repeat hit means several leaves map to the same sequence.
160 Vector<SequenceI> one2many = new Vector<SequenceI>();
161 int countOne2Many = 0;
162 while (i < leaves.size())
164 j = leaves.elementAt(i++);
165 realnam = j.getName();
170 nam = algnIds.findIdMatch(realnam);
176 if (one2many.contains(nam))
179 // if (jalview.bin.Cache.log.isDebugEnabled())
180 // jalview.bin.Cache.log.debug("One 2 many relationship for
185 one2many.addElement(nam);
// Presumably the no-match branch (its condition is not visible here): bind a named placeholder sequence.
191 j.setElement(new Sequence(realnam, "THISISAPLACEHLDER"));
192 j.setPlaceholder(true);
195 // if (jalview.bin.Cache.log.isDebugEnabled() && countOne2Many>0) {
196 // jalview.bin.Cache.log.debug("There were "+countOne2Many+" alignment
197 // sequence ids (out of "+one2many.size()+" unique ids) linked to two or
204 * Creates a new NJTree object.
// Builds a tree by calculation: records the inputs, prepares seqData, computes the distance matrix,
// and then creates leaves and clusters (those calls are not visible in this view).
217 public NJTree(SequenceI[] sequence, AlignmentView seqData, String type,
218 String pwtype, ScoreModelI sm, int start, int end)
220 this.sequence = sequence;
221 this.node = new Vector<SequenceNode>();
223 this.pwtype = pwtype;
226 this.seqData = seqData;
// Wrap columns start..end of every sequence as a SeqCigar and build an AlignmentView over them.
// NOTE(review): this assigns seqData again; the guarding condition, if any, is not visible here.
230 SeqCigar[] seqs = new SeqCigar[sequence.length];
231 for (int i = 0; i < sequence.length; i++)
233 seqs[i] = new SeqCigar(sequence[i], start, end);
235 CigarArray sdata = new CigarArray(seqs);
236 sdata.addOperation(CigarArray.M, end - start + 1);
237 this.seqData = new AlignmentView(sdata, start);
239 // System.err.println("Made seqData");// dbg
// Checks on type and pwtype; the actions taken when they fail are not visible in this view.
240 if (!(type.equals("NJ")))
245 if (sm == null && !(pwtype.equals("PID")))
247 if (ResidueProperties.getScoreMatrix(pwtype) == null)
// One slot per sequence; done[k] == 1 is treated as "finished" by the min-distance searches.
255 done = new int[sequence.length];
// Walks the leading run of non-null sequences (loop body not visible here).
257 while ((i < sequence.length) && (sequence[i] != null))
// Pairwise distances; findDistances resolves a model from pwtype (then BLOSUM62) when sm is null.
265 distance = findDistances(sm);
266 // System.err.println("Made distances");// dbg
268 // System.err.println("Made leaves");// dbg
270 noClus = cluster.size();
273 // System.err.println("Made clusters");// dbg
278 * Generate a string representation of the Tree
280 * @return Newick File with all tree data available
283 public String toString()
285 jalview.io.NewickFile fout = new jalview.io.NewickFile(getTopNode());
287 return fout.print(isHasBootstrap(), isHasDistances(),
288 isHasRootDistance()); // output all data available for tree
293 * used when the alignment associated to a tree has changed.
296 * Sequence set to be associated with tree nodes
// Re-associates every leaf with the given sequences after the alignment has changed;
// leaves that cannot be matched end up holding a placeholder sequence.
298 public void UpdatePlaceHolders(List<SequenceI> list)
300 Vector<SequenceNode> leaves = findLeaves(top);
302 int sz = leaves.size();
303 SequenceIdMatcher seqmatcher = null;
308 SequenceNode leaf = leaves.elementAt(i++);
// Leaf already holds one of the listed sequences: just clear its placeholder flag.
310 if (list.contains(leaf.element()))
312 leaf.setPlaceholder(false);
316 if (seqmatcher == null)
318 // Only create this the first time we need it
319 SequenceI[] seqs = new SequenceI[list.size()];
321 for (int j = 0; j < seqs.length; j++)
323 seqs[j] = list.get(j);
326 seqmatcher = new SequenceIdMatcher(seqs);
// Otherwise look the leaf up by name among the listed sequences.
329 SequenceI nam = seqmatcher.findIdMatch(leaf.getName());
333 if (!leaf.isPlaceholder())
335 // remapping the node to a new sequenceI - should remove any refs to
337 // TODO - make many sequenceI to one leaf mappings possible!
340 leaf.setPlaceholder(false);
341 leaf.setElement(nam);
// Presumably the no-match branch (condition not visible): install a placeholder unless the leaf already is one.
345 if (!leaf.isPlaceholder())
347 // Construct a new placeholder sequence object for this leaf
348 leaf.setElement(new Sequence(leaf.getName(),
349 "THISISAPLACEHLDER"));
351 leaf.setPlaceholder(true);
359 * rename any nodes according to their associated sequence. This will modify
360 * the tree's metadata! (ie the original NewickFile or newly generated
361 * BinaryTree's label data)
363 public void renameAssociatedNodes()
365 applyToNodes(new NodeTransformI()
369 public void transform(BinaryNode nd)
371 Object el = nd.element();
372 if (el != null && el instanceof SequenceI)
374 nd.setName(((SequenceI) el).getName());
// Drives the clustering: repeatedly joins the closest pair, then joins the last two
// unfinished entries and makes the resulting node the root. Loop headers are not visible here.
383 public void cluster()
// Chooses the NJ or the plain minimum-distance search (the calls themselves are not visible here).
387 if (type.equals("NJ"))
396 Cluster c = joinClusters(mini, minj);
// The merged cluster takes over slot mini; slot minj is retired.
400 cluster.setElementAt(null, minj);
401 cluster.setElementAt(c, mini);
// Locate the two remaining entries, held in 'one' and 'two' (assignments not visible here).
406 boolean onefound = false;
411 for (int i = 0; i < noseqs; i++)
415 if (onefound == false)
// Final join; the node left in slot 'one' becomes the top of the tree.
427 joinClusters(one, two);
428 top = (node.elementAt(one));
443 * @return DOCUMENT ME!
// Merges clusters i and j: builds the combined Cluster, refreshes the distance matrix, and
// creates a parent SequenceNode over the two sub-trees which replaces node slot i.
445 public Cluster joinClusters(int i, int j)
// Capture the i-j distance before the matrix rows are rewritten below.
447 float dist = distance[i][j];
449 int noi = cluster.elementAt(i).value.length;
450 int noj = cluster.elementAt(j).value.length;
// Concatenate the member indices: i's members first, then j's.
452 int[] value = new int[noi + noj];
454 for (int ii = 0; ii < noi; ii++)
456 value[ii] = cluster.elementAt(i).value[ii];
459 for (int ii = noi; ii < (noi + noj); ii++)
461 value[ii] = cluster.elementAt(j).value[ii - noi];
464 Cluster c = new Cluster(value);
// Recompute distances from the merged cluster to the others using the rule for this tree type.
469 if (type.equals("NJ"))
471 findClusterNJDistance(i, j);
475 findClusterDistance(i, j);
478 SequenceNode sn = new SequenceNode();
480 sn.setLeft((node.elementAt(i)));
481 sn.setRight((node.elementAt(j)));
483 SequenceNode tmpi = (node.elementAt(i));
484 SequenceNode tmpj = (node.elementAt(j));
// Assign branch lengths to the two children from the captured distance.
486 if (type.equals("NJ"))
488 findNewNJDistances(tmpi, tmpj, dist);
492 findNewDistances(tmpi, tmpj, dist);
// The new parent node now represents the merged cluster in slot i.
498 node.setElementAt(sn, i);
// Sets the branch lengths of the two nodes being joined under neighbour joining.
// ri and rj are computed on lines not visible in this view.
513 public void findNewNJDistances(SequenceNode tmpi, SequenceNode tmpj,
517 tmpi.dist = ((dist + ri) - rj) / 2;
// The two branch lengths always sum to dist.
518 tmpj.dist = (dist - tmpi.dist);
// Sets the branch lengths of the two nodes being joined for the non-NJ case:
// each child gets half the join distance minus ih / jh.
// NOTE(review): ih and jh are accumulated while walking the left() links on lines not visible here.
541 public void findNewDistances(SequenceNode tmpi, SequenceNode tmpj,
547 SequenceNode sni = tmpi;
548 SequenceNode snj = tmpj;
553 sni = (SequenceNode) sni.left();
559 snj = (SequenceNode) snj.left();
562 tmpi.dist = ((dist / 2) - ih);
563 tmpj.dist = ((dist / 2) - jh);
// Replaces row and column i of the distance matrix with the distances of the merged (i + j) cluster,
// combining the old values weighted by the two cluster sizes.
574 public void findClusterDistance(int i, int j)
576 int noi = cluster.elementAt(i).value.length;
577 int noj = cluster.elementAt(j).value.length;
579 // New distances from cluster to others
580 float[] newdist = new float[noseqs];
582 for (int l = 0; l < noseqs; l++)
584 if ((l != i) && (l != j))
// Size-weighted sum of the two old distances (the remainder of this expression is not visible here).
586 newdist[l] = ((distance[i][l] * noi) + (distance[j][l] * noj))
// Write the new values back symmetrically.
595 for (int ii = 0; ii < noseqs; ii++)
597 distance[i][ii] = newdist[ii];
598 distance[ii][i] = newdist[ii];
// Replaces row and column i of the distance matrix with the neighbour-joining distances of the merged (i + j) cluster.
610 public void findClusterNJDistance(int i, int j)
613 // New distances from cluster to others
614 float[] newdist = new float[noseqs];
616 for (int l = 0; l < noseqs; l++)
618 if ((l != i) && (l != j))
// NJ update: d(new, l) = (d(i, l) + d(j, l) - d(i, j)) / 2
620 newdist[l] = ((distance[i][l] + distance[j][l]) - distance[i][j]) / 2;
// Write the new values back symmetrically.
628 for (int ii = 0; ii < noseqs; ii++)
630 distance[i][ii] = newdist[ii];
631 distance[ii][i] = newdist[ii];
643 * @return DOCUMENT ME!
// Accumulates distance[i][k] over every k other than i and j that is not marked done, then divides by (noClus - 2).
// NOTE(review): the initial value of tmp and any guard on noClus are on lines not visible here.
645 public float findr(int i, int j)
649 for (int k = 0; k < noseqs; k++)
651 if ((k != i) && (k != j) && (done[k] != 1))
653 tmp = tmp + distance[i][k];
659 tmp = tmp / (noClus - 2);
668 * @return DOCUMENT ME!
// Scans every unfinished pair (i < j) and evaluates the NJ criterion distance[i][j] - (findr(i, j) + findr(j, i)).
// The tracking of the minimum and of the winning indices is on lines not visible here.
670 public float findMinNJDistance()
674 for (int i = 0; i < (noseqs - 1); i++)
676 for (int j = i + 1; j < noseqs; j++)
678 if ((done[i] != 1) && (done[j] != 1))
680 float tmp = distance[i][j] - (findr(i, j) + findr(j, i));
699 * @return DOCUMENT ME!
// Scans every unfinished pair (i < j) for the smallest raw entry in the distance matrix.
// The initial value of min and the recording of the winning indices are on lines not visible here.
701 public float findMinDistance()
705 for (int i = 0; i < (noseqs - 1); i++)
707 for (int j = i + 1; j < noseqs; j++)
709 if ((done[i] != 1) && (done[j] != 1))
711 if (distance[i][j] < min)
716 min = distance[i][j];
726 * Calculate a distance matrix given the sequence input data and score model
728 * @return similarity matrix used to compute tree
730 public float[][] findDistances(ScoreModelI _pwmatrix)
733 float[][] dist = new float[noseqs][noseqs];
734 if (_pwmatrix == null)
736 // Resolve substitution model
737 _pwmatrix = ResidueProperties.getScoreModel(pwtype);
738 if (_pwmatrix == null)
740 _pwmatrix = ResidueProperties.getScoreMatrix("BLOSUM62");
743 dist = _pwmatrix.findDistances(seqData);
// Creates one leaf SequenceNode and one single-member Cluster per input sequence.
751 public void makeLeaves()
753 cluster = new Vector<Cluster>();
755 for (int i = 0; i < noseqs; i++)
757 SequenceNode sn = new SequenceNode();
// The leaf carries the sequence and takes its name from it.
759 sn.setElement(sequence[i]);
760 sn.setName(sequence[i].getName());
// One-element member list for the initial cluster (its content is set on a line not visible here).
763 int[] value = new int[1];
766 Cluster c = new Cluster(value);
767 cluster.addElement(c);
772 * Search for leaf nodes below (or at) the given node
775 * root node to search from
// Convenience entry point: collects the leaves under nd into a fresh Vector via the two-argument overload.
779 public Vector<SequenceNode> findLeaves(SequenceNode nd)
781 Vector<SequenceNode> leaves = new Vector<SequenceNode>();
782 findLeaves(nd, leaves);
787 * Search for leaf nodes.
790 * root node to search from
792 * Vector of leaves to add leaf node objects to.
794 * @return Vector of leaf nodes on binary tree
// Recursive worker: appends every childless node under nd to leaves, visiting the left subtree before the right.
796 Vector<SequenceNode> findLeaves(SequenceNode nd,
797 Vector<SequenceNode> leaves)
804 if ((nd.left() == null) && (nd.right() == null)) // Leaf node: it has no children
807 leaves.addElement(nd);
814 * TODO: Identify internal nodes... if (node.isSequenceLabel()) {
815 * leaves.addElement(node); }
817 findLeaves((SequenceNode) nd.left(), leaves);
818 findLeaves((SequenceNode) nd.right(), leaves);
825 * Find the leaf node with a particular ycount
828 * initial point on tree to search from
830 * value to search for
832 * @return null or the node with ycount=count
// Public entry point for the ycount search: stores the result of _findLeaf in the shared 'found' field.
// NOTE(review): because the result goes through a field, this looks unsafe for concurrent callers - confirm.
834 public Object findLeaf(SequenceNode nd, int count)
836 found = _findLeaf(nd, count);
842 * @see #findLeaf(SequenceNode, int)
// Recursive worker for findLeaf: when a node's ycount equals count its element is recorded in 'found';
// the left and right children are then searched, and the values those recursive calls return are ignored.
844 public Object _findLeaf(SequenceNode nd, int count)
851 if (nd.ycount == count)
853 found = nd.element();
859 _findLeaf((SequenceNode) nd.left(), count);
860 _findLeaf((SequenceNode) nd.right(), count);
867 * printNode is mainly for debugging purposes.
// Debug dump to System.out: leaves print name, distance and bootstrap; other nodes print their distance and recurse.
872 public void printNode(SequenceNode nd)
// Childless node: treated as a leaf whose element is cast to SequenceI.
879 if ((nd.left() == null) && (nd.right() == null))
881 System.out.println("Leaf = " + ((SequenceI) nd.element()).getName());
882 System.out.println("Dist " + nd.dist);
883 System.out.println("Boot " + nd.getBootstrap());
887 System.out.println("Dist " + nd.dist);
888 printNode((SequenceNode) nd.left());
889 printNode((SequenceNode) nd.right());
// Walks the tree and compares each leaf's branch length against maxDistValue.
// The update performed when a larger value is found is on lines not visible here.
899 public void findMaxDist(SequenceNode nd)
906 if ((nd.left() == null) && (nd.right() == null))
908 float dist = nd.dist;
910 if (dist > maxDistValue)
918 findMaxDist((SequenceNode) nd.left());
919 findMaxDist((SequenceNode) nd.right());
926 * @return DOCUMENT ME!
// NOTE(review): body not visible in this view; presumably returns the 'groups' vector filled by groupNodes() - confirm.
928 public Vector<SequenceNode> getGroups()
936 * @return DOCUMENT ME!
// NOTE(review): body not visible in this view; presumably returns 'maxheight' as maintained by findHeight() - confirm.
938 public float getMaxHeight()
// Collects into 'groups' the nodes whose height, as a fraction of maxheight, exceeds threshold.
// Nodes at or below the threshold are presumably descended into via the two recursive calls (branch structure not visible).
951 public void groupNodes(SequenceNode nd, float threshold)
958 if ((nd.height / maxheight) > threshold)
960 groups.addElement(nd);
964 groupNodes((SequenceNode) nd.left(), threshold);
965 groupNodes((SequenceNode) nd.right(), threshold);
975 * @return DOCUMENT ME!
// Assigns every node its height (parent's height plus own branch length, 0.0 for a parentless node)
// and tracks the largest leaf height in maxheight.
977 public float findHeight(SequenceNode nd)
984 if ((nd.left() == null) && (nd.right() == null))
// NOTE(review): dereferences nd.parent() without a null check, so a tree consisting of a single leaf looks like it would fail here - confirm.
986 nd.height = ((SequenceNode) nd.parent()).height + nd.dist;
988 if (nd.height > maxheight)
999 if (nd.parent() != null)
1001 nd.height = ((SequenceNode) nd.parent()).height + nd.dist;
1006 nd.height = (float) 0.0;
// Recurse into both children, storing each call's result in maxheight.
1009 maxheight = findHeight((SequenceNode) (nd.left()));
1010 maxheight = findHeight((SequenceNode) (nd.right()));
1019 * @return DOCUMENT ME!
// Re-roots the tree on the branch above the 'maxdist' node: a new node sn is inserted on that branch
// and the branch length is split evenly between its two sides. Does nothing visible when maxdist is null.
1021 public SequenceNode reRoot()
1023 if (maxdist != null)
1027 float tmpdist = maxdist.dist;
1030 SequenceNode sn = new SequenceNode();
1033 // New right hand of top
1034 SequenceNode snr = (SequenceNode) maxdist.parent();
// Reverse the parent/child links on the path from the old root down to snr.
1035 changeDirection(snr, maxdist);
// NOTE(review): diagnostic output written straight to System.out.
1036 System.out.println("Printing reversed tree");
// Split the original branch length evenly between the two sides of the new root.
1038 snr.dist = tmpdist / 2;
1039 maxdist.dist = tmpdist / 2;
1042 maxdist.setParent(sn);
1045 sn.setLeft(maxdist);
1059 * @return true if original sequence data can be recovered
1061 public boolean hasOriginalSequenceData()
1063 return seqData != null;
1067 * Returns original alignment data used for calculation - or null where not
1070 * @return null or cut'n'pasteable alignment
// Formats the stored alignment as one line per sequence: a label left-justified in 15 columns
// (taken from sequence[i]), a space, then the sequence string rendered with gapChar.
1072 public String printOriginalSequenceData(char gapChar)
// No stored alignment: handled on lines not visible here (the Javadoc says null is returned).
1074 if (seqData == null)
1079 StringBuffer sb = new StringBuffer();
1080 String[] seqdatas = seqData.getSequenceStrings(gapChar);
1081 for (int i = 0; i < seqdatas.length; i++)
// "%-15s" left-justifies the label; the rest of this call is not visible here.
1083 sb.append(new jalview.util.Format("%-" + 15 + "s").form(sequence[i]
1085 sb.append(" " + seqdatas[i] + "\n");
1087 return sb.toString();
// Debug dump to System.out: recurses through nodes that have both children, prints a name taken from the
// node's element, and prints dist/count details. The exact branch structure is not visible in this view.
1096 public void printN(SequenceNode nd)
1103 if ((nd.left() != null) && (nd.right() != null))
1105 printN((SequenceNode) nd.left());
1106 printN((SequenceNode) nd.right());
1110 System.out.println(" name = " + ((SequenceI) nd.element()).getName());
1113 System.out.println(" dist = " + nd.dist + " " + nd.count + " "
// NOTE(review): body not visible in this view; presumably resets the counters and delegates to _reCount(nd) - confirm.
1123 public void reCount(SequenceNode nd)
1127 // _lylimit = this.node.size();
// Counters referenced only by commented-out recursion-depth checks in the lines visible here.
1131 private long _lycount = 0, _lylimit = 0;
// Recursive worker: recomputes count and ycount for every node under nd.
// A node with two children gets the sum of its children's counts and the mean of their ycounts;
// the other case assigns the next value of the running 'ycount' counter.
1139 public void _reCount(SequenceNode nd)
1141 // if (_lycount<_lylimit)
1143 // System.err.println("Warning: depth of _recount greater than number of nodes.");
1151 if ((nd.left() != null) && (nd.right() != null))
// Children first, so their counts are up to date before they are combined.
1154 _reCount((SequenceNode) nd.left());
1155 _reCount((SequenceNode) nd.right());
1157 SequenceNode l = (SequenceNode) nd.left();
1158 SequenceNode r = (SequenceNode) nd.right();
1160 nd.count = l.count + r.count;
1161 nd.ycount = (l.ycount + r.ycount) / 2;
1166 nd.ycount = ycount++;
// Exchanges the left and right children of nd, using tmp to hold the old left child.
// The assignment of tmp to the right side is on a line not visible here.
1177 public void swapNodes(SequenceNode nd)
1184 SequenceNode tmp = (SequenceNode) nd.left();
1186 nd.setLeft(nd.right());
// Reverses parent/child links for re-rooting: 'dir' is the child of nd that will become nd's parent.
// Ancestors are processed first through the recursive call.
1198 public void changeDirection(SequenceNode nd, SequenceNode dir)
// nd is not directly below the current root: flip the ancestors first.
1205 if (nd.parent() != top)
1207 changeDirection((SequenceNode) nd.parent(), nd);
1209 SequenceNode tmp = (SequenceNode) nd.parent();
// What happens to tmp in the two branches below is on lines not visible here.
1211 if (dir == nd.left())
1216 else if (dir == nd.right())
// Presumably the case where nd sits directly below top (branch structure not visible):
// nd takes top's other child in place of the child that becomes its parent.
1224 if (dir == nd.left())
1226 nd.setParent(nd.left());
1228 if (top.left() == nd)
1230 nd.setRight(top.right());
1234 nd.setRight(top.left());
1239 nd.setParent(nd.right());
1241 if (top.left() == nd)
1243 nd.setLeft(top.right());
1247 nd.setLeft(top.left());
1256 * @return DOCUMENT ME!
// NOTE(review): body not visible in this view; presumably returns the 'maxdist' node used by reRoot() - confirm.
1258 public SequenceNode getMaxDist()
1266 * @return DOCUMENT ME!
// NOTE(review): body not visible in this view; presumably returns 'top', the current root node - confirm.
1268 public SequenceNode getTopNode()
1275 * @return true if tree has real distances
1277 public boolean isHasDistances()
1279 return hasDistances;
1284 * @return true if tree has real bootstrap values
1286 public boolean isHasBootstrap()
1288 return hasBootstrap;
1291 public boolean isHasRootDistance()
1293 return hasRootDistance;
1297 * apply the given transform to all the nodes in the tree.
1299 * @param nodeTransformI
// Applies nodeTransformI to the entries of the 'node' vector; the transform call sits in the for-loop's update clause.
// NOTE(review): only the 'node' vector is enumerated here, and the visible file-import constructor never assigns 'node' - confirm this is safe for imported trees.
1301 public void applyToNodes(NodeTransformI nodeTransformI)
1303 for (Enumeration<SequenceNode> nodes = node.elements(); nodes
1304 .hasMoreElements(); nodeTransformI.transform(nodes
1316 * @version $Revision$
1323 * Creates a new Cluster object.
// NOTE(review): body not visible in this view; callers read the array back through the 'value' field, so this presumably stores it - confirm.
1328 public Cluster(int[] value)