package fr.orsay.lri.varna.models.treealign; import java.util.*; /** * Tree alignment algorithm. * This class implements the tree alignment algorithm * for ordered trees explained in the article: * T. Jiang, L. Wang, K. Zhang, * Alignment of trees - an alternative to tree edit, * Theoret. Comput. Sci. 143 (1995). * Other references: * - Claire Herrbach, Alain Denise and Serge Dulucq. * Average complexity of the Jiang-Wang-Zhang pairwise tree alignment * algorithm and of a RNA secondary structure alignment algorithm. * Theoretical Computer Science 411 (2010) 2423-2432. * * Our implementation supposes that the trees will never have more * than 32000 nodes and that the total distance will never require more * significant digits than a float (single precision) has. * * @author Raphael Champeimont * @param The type of values on nodes in the first tree. * @param The type of values on nodes in the second tree. */ public class TreeAlign { private class TreeData { /** * The tree. */ public Tree tree; /** * The tree size (number of nodes). */ public int size = -1; /** * The number of children of a node is called the node degree. * This variable is the maximum node degree in the tree. */ public int degree = -1; /** * The number of children of a node is called the node degree. * degrees[i] is the degree of node i, with i being an index in nodes. */ public int[] degrees; /** * The tree as an array of its nodes (subtrees rooted at each node * in fact), in postorder. */ public Tree[] nodes; /** * children[i] is the array of children (as indexes in nodes) * of i (an index in nodes) */ public int[][] children; /** * Values of nodes. */ public ValueType[] values; } /** * The distance function between labels. */ private TreeAlignLabelDistanceAsymmetric labelDist; /** * Create a TreeAlign object, which can align trees. * The distance function will be called only once on every pair * of nodes. 
The result is then kept in a matrix, so you do not need to manage * a cache of f(value1, value2) yourself. * Note that it is permitted to have null values on nodes, * so comparing a node with a non-null value with a node with a null * value will give the same cost as to insert the first node. * This can be useful if your tree has "fake" nodes. * @param labelDist The label distance. */ public TreeAlign(TreeAlignLabelDistanceAsymmetric labelDist) { this.labelDist = labelDist; } private class ConvertTreeToArray { private int nextNodeIndex = 0; private TreeData treeData; public ConvertTreeToArray(TreeData treeData) { this.treeData = treeData; } private void convertTreeToArrayAux( Tree subtree, int[] siblingIndexes, int siblingNumber) throws TreeAlignException { // We want it in postorder, so first we put the children List> children = subtree.getChildren(); int numberOfChildren = children.size(); int[] childrenIndexes = new int[numberOfChildren]; int myIndex = -1; { int i = 0; for (Tree child: children) { convertTreeToArrayAux(child, childrenIndexes, i); i++; } } // Compute the maximum degree if (numberOfChildren > treeData.degree) { treeData.degree = numberOfChildren; } // Now we add the node (root of the given subtree). myIndex = nextNodeIndex; nextNodeIndex++; treeData.nodes[myIndex] = subtree; // Record how many children I have treeData.degrees[myIndex] = numberOfChildren; // Store my value in an array ValueType v = subtree.getValue(); treeData.values[myIndex] = v; // Tell the caller my index siblingIndexes[siblingNumber] = myIndex; // Record my children indexes treeData.children[myIndex] = childrenIndexes; } /** * Reads: treeData.tree * Computes: treeData.nodes, treeData.degree, treeData.degrees, * treeData.children, treeData.size, treeData.values * Converts a tree to an array of nodes, in postorder. * We also compute the maximum node degree in the tree. 
* @throws TreeAlignException */ @SuppressWarnings("unchecked") public void convert() throws TreeAlignException { treeData.degree = 0; treeData.size = treeData.tree.countNodes(); // we didn't write new Tree[treeData.size] because // java does not support generics with arrays treeData.nodes = new Tree[treeData.size]; treeData.children = new int[treeData.size][]; treeData.degrees = new int[treeData.size]; treeData.values = (ValueType[]) new Object[treeData.size]; int rootIndex[] = new int[1]; convertTreeToArrayAux(treeData.tree, rootIndex, 0); } } /** * For arrays that take at least O(|T1|*|T2|) we take care * not to use needlessly large data types. */ private class Aligner { /** * The first tree. */ private TreeData treeData1; /** * The second tree. */ private TreeData treeData2; /** * DF1[i][j_t] is DFL for (i,j,s,t) with s=0. * See description of DFL in Aligner.computeAlignmentP1(). * DF1 and DF2 are the "big" arrays, ie. those that make the space * complexity what it is. */ private float[][][][] DF1; /** * DF2[j][i_s] is DFL for (i,j,s,t) with t=0. * See description of DFL in Aligner.computeAlignmentP1(). */ private float[][][][] DF2; /** * These arrays have the same shape as DF1. * They are used to remember which term in the minimum won, so that * we can compute the alignment. * Decision1 is a case number (< 10) * and Decision2 is a child index, hence the types. */ private byte[][][][] DF1Decisions1; private short[][][][] DF1Decisions2; /** * These arrays have the same shape as DF2. * They are used to remember which term in the minimum won, so that * we can compute the alignment. */ private byte[][][][] DF2Decisions1; private short[][][][] DF2Decisions2; /** * Distances between subtrees. * DT[i][j] is the distance between the subtree rooted at i in the first tree * and the subtree rooted at j in the second tree. 
*/ private float[][] DT; /** * This array has the same shape as DT, but is used to remember which * case gave the minimum, so that we can later compute the alignment. */ private byte[][] DTDecisions1; private short[][] DTDecisions2; /** * Distances between labels. * DL[i][j] is the distance labelDist.f(value(T1[i]), value(T2[j])). * By convention, we say that value(T1[|T1|]) = null * and value(T2[|T2|]) = null */ private float[][] DL; /** * DET1[i] is the distance between the empty tree and T1[i] * (the subtree rooted at node i in the first tree). */ private float[] DET1; /** * Same as DET1, but for the second tree. */ private float[] DET2; /** * DEF1[i] is the distance between the empty forest and F1[i] * (the forest of children of node i in the first tree). */ private float[] DEF1; /** * Same as DEF1, but for the second tree. */ private float[] DEF2; /** * Computes the local forest-distance table DFL (described inside the * method) for a fixed value of (i,j,s,t). * @param i node in T1 * @param s number of first child of i to consider * @param m_i degree of i * @param j node in T2 * @param t number of first child of j to consider * @param n_j degree of j * @param DFx which array to fill (DF1 or DF2) */ private void computeAlignmentP1(int i, int s, int m_i, int j, int t, int n_j, int DFx) { /** * DFL[pr][qr] is D(F1[i_s, i_p], F2[j_t, j_q]) * where p=s+pr-1 and q=t+qr-1 (ie. pr=p-s+1 and qr=q-t+1) * By convention, F1[i_s, i_{s-1}] and F2[j_t, j_{t-1}] are the * empty forests. * Said differently, DFL[pr][qr] is the distance between the forest * of the pr first children of i, starting with child s * (first child is s = 0), and the forest of the qr first children * of j, starting with child t (first child is t = 0). * This array is allocated for a fixed value of (i,j,s,t). */ float[][] DFL; /** * Same shape as DFL, but to remember which term gave the min, * so that we can later compute the alignment. 
*/ byte[][] DFLDecisions1; short[][] DFLDecisions2; DFL = new float[m_i-s+2][n_j-t+2]; DFL[0][0] = 0; // D(empty forest, empty forest) = 0 DFLDecisions1 = new byte[m_i-s+2][n_j-t+2]; DFLDecisions2 = new short[m_i-s+2][n_j-t+2]; // Compute indexes of i_s and j_t because we will need them int i_s = m_i != 0 ? treeData1.children[i][s] : -1; int j_t = n_j != 0 ? treeData2.children[j][t] : -1; for (int p=s; p(treeData1)).convert(); (new ConvertTreeToArray(treeData2)).convert(); // Allocate necessary arrays DT = new float[treeData1.size][treeData2.size]; DTDecisions1 = new byte[treeData1.size][treeData2.size]; DTDecisions2 = new short[treeData1.size][treeData2.size]; DL = new float[treeData1.size+1][treeData2.size+1]; DET1 = new float[treeData1.size]; DET2 = new float[treeData2.size]; DEF1 = new float[treeData1.size]; DEF2 = new float[treeData2.size]; DF1 = new float[treeData1.size][treeData2.size][][]; DF1Decisions1 = new byte[treeData1.size][treeData2.size][][]; DF1Decisions2 = new short[treeData1.size][treeData2.size][][]; DF2 = new float[treeData2.size][treeData1.size][][]; DF2Decisions1 = new byte[treeData2.size][treeData1.size][][]; DF2Decisions2 = new short[treeData2.size][treeData1.size][][]; DL[treeData1.size][treeData2.size] = (float) labelDist.f(null, null); for (int i=0; i T1, Tree T2) { treeData1 = new TreeData(); treeData1.tree = T1; treeData2 = new TreeData(); treeData2.tree = T2; } /** Align F1[i_s,i_p] with F2[j_t,j_q]. * If p = s-1, by convention it means F1[i_s,i_p] = empty forest. * Likewise for q = t-1 and F2[j_t,j_q]. 
*/ private List>> computeForestAlignment(int i, int s, int p, int j, int t, int q) { if (p == s-1) { // left forest is the empty forest List>> result = new ArrayList>>(); for (int k=t; k<=q; k++) { result.add(treeInserted(treeData2.children[j][k])); } return result; } else { if (q == t-1) { // right forest is the empty forest List>> result = new ArrayList>>(); for (int k=s; k<=p; k++) { result.add(treeDeleted(treeData1.children[i][k])); } return result; } else { // both forests are non-empty int decision1, k; if (s == 0) { decision1 = DF1Decisions1 [i] [treeData2.children[j][t]] [p-s+1] [q-t+1]; k = DF1Decisions2 [i] [treeData2.children[j][t]] [p-s+1] [q-t+1]; } else if (t == 0) { decision1 = DF2Decisions1 [j] [treeData1.children[i][s]] [p-s+1] [q-t+1]; k = DF2Decisions2 [j] [treeData1.children[i][s]] [p-s+1] [q-t+1]; } else { throw (new Error("TreeAlignSymmetric bug: both s and t are non-zero")); } switch (decision1) { case 1: { List>> result; result = computeForestAlignment(i, s, p-1, j, t, q); result.add(treeDeleted(treeData1.children[i][p])); return result; } case 2: { List>> result; result = computeForestAlignment(i, s, p, j, t, q-1); result.add(treeInserted(treeData2.children[j][q])); return result; } case 3: { List>> result; result = computeForestAlignment(i, s, p-1, j, t, q-1); result.add(computeTreeAlignment(treeData1.children[i][p], treeData2.children[j][q])); return result; } case 4: { List>> result; result = computeForestAlignment(i, s, k-1, j, t, q-1); int j_q = treeData2.children[j][q]; Tree> insertedNode = new Tree>(); AlignedNode insertedNodeValue = new AlignedNode(); insertedNodeValue.setLeftNode(null); insertedNodeValue.setRightNode((Tree) treeData2.nodes[j_q]); insertedNode.setValue(insertedNodeValue); insertedNode.replaceChildrenListBy(computeForestAlignment(i, k, p, j_q, 0, treeData2.degrees[j_q]-1)); result.add(insertedNode); return result; } case 5: { List>> result; result = computeForestAlignment(i, s, p-1, j, t, k-1); int i_p = 
treeData1.children[i][p]; Tree> deletedNode = new Tree>(); AlignedNode deletedNodeValue = new AlignedNode(); deletedNodeValue.setLeftNode((Tree) treeData1.nodes[i_p]); deletedNodeValue.setRightNode(null); deletedNode.setValue(deletedNodeValue); deletedNode.replaceChildrenListBy(computeForestAlignment(i_p, 0, treeData1.degrees[i_p]-1, j, k, q)); result.add(deletedNode); return result; } default: throw (new Error("TreeAlign: decision1 = " + decision1)); } } } } /** * Align T1[i] with the empty tree. * @return the alignment */ private Tree> treeDeleted(int i) { Tree> root = new Tree>(); AlignedNode alignedNode = new AlignedNode(); alignedNode.setLeftNode(treeData1.nodes[i]); alignedNode.setRightNode(null); root.setValue(alignedNode); for (int r = 0; r> treeInserted(int j) { Tree> root = new Tree>(); AlignedNode alignedNode = new AlignedNode(); alignedNode.setLeftNode(null); alignedNode.setRightNode(treeData2.nodes[j]); root.setValue(alignedNode); for (int r = 0; r> computeTreeAlignment(int i, int j) { switch (DTDecisions1[i][j]) { case 1: { Tree> root = new Tree>(); // Compute the value of the node AlignedNode alignedNode = new AlignedNode(); alignedNode.setLeftNode(null); alignedNode.setRightNode(treeData2.nodes[j]); root.setValue(alignedNode); // Compute the children for (int r = 0; r> root = new Tree>(); // Compute the value of the node AlignedNode alignedNode = new AlignedNode(); alignedNode.setLeftNode(treeData1.nodes[i]); alignedNode.setRightNode(null); root.setValue(alignedNode); // Compute the children for (int r = 0; r> root = new Tree>(); // Compute the value of the node AlignedNode alignedNode = new AlignedNode(); alignedNode.setLeftNode(treeData1.nodes[i]); alignedNode.setRightNode(treeData2.nodes[j]); root.setValue(alignedNode); // Compute the children List>> children = computeForestAlignment(i, 0, treeData1.degrees[i]-1, j, 0, treeData2.degrees[j]-1); root.replaceChildrenListBy(children); return root; } default: throw (new Error("TreeAlign: 
DTDecisions1[i][j] = " + DTDecisions1[i][j])); } } public Tree> computeAlignment() { return computeTreeAlignment(treeData1.size-1, treeData2.size-1); } } /** * Align T1 with T2, computing both the distance and the alignment. * Time: O(|T1|*|T2|*(deg(T1)+deg(T2))^2) * Space: O(|T1|*|T2|*(deg(T1)+deg(T2))) * Average (over possible trees) time: O(|T1|*|T2|) * @param T1 The first tree. * @param T2 The second tree. * @return The distance and the alignment. * @throws TreeAlignException */ public TreeAlignResult align(Tree T1, Tree T2) throws TreeAlignException { TreeAlignResult result = new TreeAlignResult(); Aligner aligner = new Aligner(T1, T2); result.setDistance(aligner.align()); result.setAlignment(aligner.computeAlignment()); return result; } /** * Takes an alignment and computes the distance between the two * original trees. If you have called align(), the result object already * contains the distance D and the alignment A. If you call * distanceFromAlignment on the alignment A it will compute the distance D. * The distance is the sum, over every node of the alignment tree, of * labelDist.f(left value, right value), where a missing left or right * node contributes a null value. * @param alignment The root of the alignment tree. * @return The accumulated label distance over the whole alignment. */ public float distanceFromAlignment(Tree> alignment) { Tree originalT1Node; Tree originalT2Node; originalT1Node = alignment.getValue().getLeftNode(); originalT2Node = alignment.getValue().getRightNode(); float d = (float) labelDist.f( originalT1Node != null ? originalT1Node.getValue() : null, originalT2Node != null ? originalT2Node.getValue() : null); for (Tree> child: alignment.getChildren()) { d += distanceFromAlignment(child); } return d; } }