X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fphylogeny%2FPhylogenyMethods.java;h=3c33301fb5f4fdbf52e69dae0d62c27ea8305121;hb=1542f37267c30e690b840d4552234f1f70598a46;hp=6569c810d982d6e7ff9c5e524da01b730d352721;hpb=48f7a89be9d34f1930a1f863e608235cc27184c5;p=jalview.git diff --git a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java index 6569c81..3c33301 100644 --- a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java +++ b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java @@ -5,7 +5,7 @@ // Copyright (C) 2008-2009 Christian M. Zmasek // Copyright (C) 2008-2009 Burnham Institute for Medical Research // All rights reserved -// +// // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either @@ -15,7 +15,7 @@ // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. -// +// // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA @@ -26,8 +26,13 @@ package org.forester.phylogeny; import java.awt.Color; +import java.io.File; +import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; @@ -35,21 +40,32 @@ import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; +import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; +import org.forester.io.parsers.phyloxml.PhyloXmlUtil; +import org.forester.io.parsers.util.PhylogenyParserException; import org.forester.phylogeny.data.BranchColor; import org.forester.phylogeny.data.BranchWidth; import org.forester.phylogeny.data.Confidence; import org.forester.phylogeny.data.DomainArchitecture; +import org.forester.phylogeny.data.Event; +import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.data.PhylogenyDataUtil; +import org.forester.phylogeny.data.Sequence; import org.forester.phylogeny.data.Taxonomy; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.BasicDescriptiveStatistics; +import org.forester.util.DescriptiveStatistics; import org.forester.util.FailedConditionCheckException; import org.forester.util.ForesterUtil; public class PhylogenyMethods { - private static PhylogenyMethods _instance = null; - private final Set _temp_hash_set = new HashSet(); - private PhylogenyNode _farthest_1 = null; - private PhylogenyNode _farthest_2 = null; + private static PhylogenyMethods _instance = null; + private PhylogenyNode _farthest_1 = null; + private PhylogenyNode _farthest_2 = null; private PhylogenyMethods() { // Hidden constructor. @@ -64,7 +80,7 @@ public class PhylogenyMethods { * @return distance between node1 and node2 */ public double calculateDistance( final PhylogenyNode node1, final PhylogenyNode node2 ) { - final PhylogenyNode lca = obtainLCA( node1, node2 ); + final PhylogenyNode lca = calculateLCA( node1, node2 ); final PhylogenyNode n1 = node1; final PhylogenyNode n2 = node2; return ( PhylogenyMethods.getDistance( n1, lca ) + PhylogenyMethods.getDistance( n2, lca ) ); @@ -99,6 +115,10 @@ public class PhylogenyMethods { return farthest_d; } + final public static Event getEventAtLCA( final PhylogenyNode n1, final PhylogenyNode n2 ) { + return calculateLCA( n1, n2 ).getNodeData().getEvent(); + } + @Override public Object clone() throws CloneNotSupportedException { throw new CloneNotSupportedException(); @@ -112,6 +132,24 @@ public class PhylogenyMethods { return _farthest_2; } + final public static void deleteNonOrthologousExternalNodes( final Phylogeny phy, final PhylogenyNode n ) { + if ( n.isInternal() ) { + throw new IllegalArgumentException( "node is not external" ); + } + final ArrayList to_delete = new ArrayList(); + for( final PhylogenyNodeIterator it = phy.iteratorExternalForward(); it.hasNext(); ) { + final PhylogenyNode i = it.next(); + if ( !PhylogenyMethods.getEventAtLCA( n, i ).isSpeciation() ) { + to_delete.add( i ); + } + } + for( final PhylogenyNode d : to_delete ) { + phy.deleteSubtree( d, true ); + } + phy.clearHashIdToNodeMap(); + phy.externalNodesHaveChanged(); + } + /** * Returns the LCA of PhylogenyNodes node1 and node2. * @@ -120,22 +158,68 @@ public class PhylogenyMethods { * @param node2 * @return LCA of node1 and node2 */ - public PhylogenyNode obtainLCA( final PhylogenyNode node1, final PhylogenyNode node2 ) { - _temp_hash_set.clear(); - PhylogenyNode n1 = node1; - PhylogenyNode n2 = node2; - _temp_hash_set.add( n1.getId() ); - while ( !n1.isRoot() ) { - n1 = n1.getParent(); - _temp_hash_set.add( n1.getId() ); + public final static PhylogenyNode calculateLCA( PhylogenyNode node1, PhylogenyNode node2 ) { + if ( node1 == node2 ) { + return node1; + } + if ( ( node1.getParent() == node2.getParent() ) ) { + return node1.getParent(); + } + int depth1 = node1.calculateDepth(); + int depth2 = node2.calculateDepth(); + while ( ( depth1 > -1 ) && ( depth2 > -1 ) ) { + if ( depth1 > depth2 ) { + node1 = node1.getParent(); + depth1--; + } + else if ( depth2 > depth1 ) { + node2 = node2.getParent(); + depth2--; + } + else { + if ( node1 == node2 ) { + return node1; + } + node1 = node1.getParent(); + node2 = node2.getParent(); + depth1--; + depth2--; + } + } + throw new IllegalArgumentException( "illegal attempt to calculate LCA of two nodes which do not share a common root" ); + } + + public static final void preOrderReId( final Phylogeny phy ) { + if ( phy.isEmpty() ) { + return; } - while ( !_temp_hash_set.contains( n2.getId() ) && !n2.isRoot() ) { - n2 = n2.getParent(); + phy.setIdToNodeMap( null ); + int i = PhylogenyNode.getNodeCount(); + for( final PhylogenyNodeIterator it = phy.iteratorPreorder(); it.hasNext(); ) { + it.next().setId( i++ ); } - if ( !_temp_hash_set.contains( n2.getId() ) ) { - throw new IllegalArgumentException( "attempt to get LCA of two nodes which do not share a common root" ); + PhylogenyNode.setNodeCount( i ); + } + + /** + * Returns the LCA of PhylogenyNodes node1 and node2. + * Precondition: ids are in pre-order (or level-order). + * + * + * @param node1 + * @param node2 + * @return LCA of node1 and node2 + */ + public final static PhylogenyNode calculateLCAonTreeWithIdsInPreOrder( PhylogenyNode node1, PhylogenyNode node2 ) { + while ( node1 != node2 ) { + if ( node1.getId() > node2.getId() ) { + node1 = node1.getParent(); + } + else { + node2 = node2.getParent(); + } } - return n2; + return node1; } /** @@ -152,20 +236,338 @@ public class PhylogenyMethods { * of this Phylogeny, null if this Phylogeny is empty or if n is * internal */ - public List getOrthologousNodes( final Phylogeny phy, final PhylogenyNode node ) { + public final static List getOrthologousNodes( final Phylogeny phy, final PhylogenyNode node ) { final List nodes = new ArrayList(); + PhylogenyMethods.preOrderReId( phy ); final PhylogenyNodeIterator it = phy.iteratorExternalForward(); while ( it.hasNext() ) { final PhylogenyNode temp_node = it.next(); - if ( ( temp_node != node ) && isAreOrthologous( node, temp_node ) ) { + if ( ( temp_node != node ) && !calculateLCAonTreeWithIdsInPreOrder( node, temp_node ).isDuplication() ) { nodes.add( temp_node ); } } return nodes; } - public boolean isAreOrthologous( final PhylogenyNode node1, final PhylogenyNode node2 ) { - return !obtainLCA( node1, node2 ).isDuplication(); + public static final HashMap createNameToExtNodeMap( final Phylogeny phy ) { + final HashMap nodes = new HashMap(); + for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode n = iter.next(); + nodes.put( n.getName(), n ); + } + return nodes; + } + + public final static Phylogeny[] readPhylogenies( final PhylogenyParser parser, final File file ) throws IOException { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny[] trees = factory.create( file, parser ); + if ( ( trees == null ) || ( trees.length == 0 ) ) { + throw new PhylogenyParserException( "Unable to parse phylogeny from file: " + file ); + } + return trees; + } + + public final static Phylogeny[] readPhylogenies( final PhylogenyParser parser, final List files ) + throws IOException { + final List tree_list = new ArrayList(); + for( final File file : files ) { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny[] trees = factory.create( file, parser ); + if ( ( trees == null ) || ( trees.length == 0 ) ) { + throw new PhylogenyParserException( "Unable to parse phylogeny from file: " + file ); + } + tree_list.addAll( Arrays.asList( trees ) ); + } + return tree_list.toArray( new Phylogeny[ tree_list.size() ] ); + } + + final static public void transferInternalNodeNamesToConfidence( final Phylogeny phy ) { + final PhylogenyNodeIterator it = phy.iteratorPostorder(); + while ( it.hasNext() ) { + final PhylogenyNode n = it.next(); + if ( !n.isExternal() && !n.getBranchData().isHasConfidences() ) { + if ( !ForesterUtil.isEmpty( n.getName() ) ) { + double d = -1.0; + try { + d = Double.parseDouble( n.getName() ); + } + catch ( final Exception e ) { + d = -1.0; + } + if ( d >= 0.0 ) { + n.getBranchData().addConfidence( new Confidence( d, "" ) ); + n.setName( "" ); + } + } + } + } + } + + final static public void transferInternalNamesToBootstrapSupport( final Phylogeny phy ) { + final PhylogenyNodeIterator it = phy.iteratorPostorder(); + while ( it.hasNext() ) { + final PhylogenyNode n = it.next(); + if ( !n.isExternal() && !ForesterUtil.isEmpty( n.getName() ) ) { + double value = -1; + try { + value = Double.parseDouble( n.getName() ); + } + catch ( final NumberFormatException e ) { + throw new IllegalArgumentException( "failed to parse number from [" + n.getName() + "]: " + + e.getLocalizedMessage() ); + } + if ( value >= 0.0 ) { + n.getBranchData().addConfidence( new Confidence( value, "bootstrap" ) ); + n.setName( "" ); + } + } + } + } + + final static public void sortNodeDescendents( final PhylogenyNode node, final DESCENDANT_SORT_PRIORITY pri ) { + class PhylogenyNodeSortTaxonomyPriority implements Comparator { + + @Override + public int compare( final PhylogenyNode n1, final PhylogenyNode n2 ) { + if ( n1.getNodeData().isHasTaxonomy() && n2.getNodeData().isHasTaxonomy() ) { + if ( ( !ForesterUtil.isEmpty( n1.getNodeData().getTaxonomy().getScientificName() ) ) + && ( !ForesterUtil.isEmpty( n2.getNodeData().getTaxonomy().getScientificName() ) ) ) { + return n1.getNodeData().getTaxonomy().getScientificName().toLowerCase() + .compareTo( n2.getNodeData().getTaxonomy().getScientificName().toLowerCase() ); + } + if ( ( !ForesterUtil.isEmpty( n1.getNodeData().getTaxonomy().getTaxonomyCode() ) ) + && ( !ForesterUtil.isEmpty( n2.getNodeData().getTaxonomy().getTaxonomyCode() ) ) ) { + return n1.getNodeData().getTaxonomy().getTaxonomyCode() + .compareTo( n2.getNodeData().getTaxonomy().getTaxonomyCode() ); + } + if ( ( !ForesterUtil.isEmpty( n1.getNodeData().getTaxonomy().getCommonName() ) ) + && ( !ForesterUtil.isEmpty( n2.getNodeData().getTaxonomy().getCommonName() ) ) ) { + return n1.getNodeData().getTaxonomy().getCommonName().toLowerCase() + .compareTo( n2.getNodeData().getTaxonomy().getCommonName().toLowerCase() ); + } + } + if ( n1.getNodeData().isHasSequence() && n2.getNodeData().isHasSequence() ) { + if ( ( !ForesterUtil.isEmpty( n1.getNodeData().getSequence().getName() ) ) + && ( !ForesterUtil.isEmpty( n2.getNodeData().getSequence().getName() ) ) ) { + return n1.getNodeData().getSequence().getName().toLowerCase() + .compareTo( n2.getNodeData().getSequence().getName().toLowerCase() ); + } + if ( ( !ForesterUtil.isEmpty( n1.getNodeData().getSequence().getSymbol() ) ) + && ( !ForesterUtil.isEmpty( n2.getNodeData().getSequence().getSymbol() ) ) ) { + return n1.getNodeData().getSequence().getSymbol() + .compareTo( n2.getNodeData().getSequence().getSymbol() ); + } + if ( ( n1.getNodeData().getSequence().getAccession() != null ) + && ( n2.getNodeData().getSequence().getAccession() != null ) + && !ForesterUtil.isEmpty( n1.getNodeData().getSequence().getAccession().getValue() ) + && !ForesterUtil.isEmpty( n2.getNodeData().getSequence().getAccession().getValue() ) ) { + return n1.getNodeData().getSequence().getAccession().getValue() + .compareTo( n2.getNodeData().getSequence().getAccession().getValue() ); + } + } + if ( ( !ForesterUtil.isEmpty( n1.getName() ) ) && ( !ForesterUtil.isEmpty( n2.getName() ) ) ) { + return n1.getName().toLowerCase().compareTo( n2.getName().toLowerCase() ); + } + return 0; + } + } + class PhylogenyNodeSortSequencePriority implements Comparator { + + @Override + public int compare( final PhylogenyNode n1, final PhylogenyNode n2 ) { + if ( n1.getNodeData().isHasSequence() && n2.getNodeData().isHasSequence() ) { + if ( ( !ForesterUtil.isEmpty( n1.getNodeData().getSequence().getName() ) ) + && ( !ForesterUtil.isEmpty( n2.getNodeData().getSequence().getName() ) ) ) { + return n1.getNodeData().getSequence().getName().toLowerCase() + .compareTo( n2.getNodeData().getSequence().getName().toLowerCase() ); + } + if ( ( !ForesterUtil.isEmpty( n1.getNodeData().getSequence().getSymbol() ) ) + && ( !ForesterUtil.isEmpty( n2.getNodeData().getSequence().getSymbol() ) ) ) { + return n1.getNodeData().getSequence().getSymbol() + .compareTo( n2.getNodeData().getSequence().getSymbol() ); + } + if ( ( n1.getNodeData().getSequence().getAccession() != null ) + && ( n2.getNodeData().getSequence().getAccession() != null ) + && !ForesterUtil.isEmpty( n1.getNodeData().getSequence().getAccession().getValue() ) + && !ForesterUtil.isEmpty( n2.getNodeData().getSequence().getAccession().getValue() ) ) { + return n1.getNodeData().getSequence().getAccession().getValue() + .compareTo( n2.getNodeData().getSequence().getAccession().getValue() ); + } + } + if ( n1.getNodeData().isHasTaxonomy() && n2.getNodeData().isHasTaxonomy() ) { + if ( ( !ForesterUtil.isEmpty( n1.getNodeData().getTaxonomy().getScientificName() ) ) + && ( !ForesterUtil.isEmpty( n2.getNodeData().getTaxonomy().getScientificName() ) ) ) { + return n1.getNodeData().getTaxonomy().getScientificName().toLowerCase() + .compareTo( n2.getNodeData().getTaxonomy().getScientificName().toLowerCase() ); + } + if ( ( !ForesterUtil.isEmpty( n1.getNodeData().getTaxonomy().getTaxonomyCode() ) ) + && ( !ForesterUtil.isEmpty( n2.getNodeData().getTaxonomy().getTaxonomyCode() ) ) ) { + return n1.getNodeData().getTaxonomy().getTaxonomyCode() + .compareTo( n2.getNodeData().getTaxonomy().getTaxonomyCode() ); + } + if ( ( !ForesterUtil.isEmpty( n1.getNodeData().getTaxonomy().getCommonName() ) ) + && ( !ForesterUtil.isEmpty( n2.getNodeData().getTaxonomy().getCommonName() ) ) ) { + return n1.getNodeData().getTaxonomy().getCommonName().toLowerCase() + .compareTo( n2.getNodeData().getTaxonomy().getCommonName().toLowerCase() ); + } + } + if ( ( !ForesterUtil.isEmpty( n1.getName() ) ) && ( !ForesterUtil.isEmpty( n2.getName() ) ) ) { + return n1.getName().toLowerCase().compareTo( n2.getName().toLowerCase() ); + } + return 0; + } + } + class PhylogenyNodeSortNodeNamePriority implements Comparator { + + @Override + public int compare( final PhylogenyNode n1, final PhylogenyNode n2 ) { + if ( ( !ForesterUtil.isEmpty( n1.getName() ) ) && ( !ForesterUtil.isEmpty( n2.getName() ) ) ) { + return n1.getName().toLowerCase().compareTo( n2.getName().toLowerCase() ); + } + if ( n1.getNodeData().isHasTaxonomy() && n2.getNodeData().isHasTaxonomy() ) { + if ( ( !ForesterUtil.isEmpty( n1.getNodeData().getTaxonomy().getScientificName() ) ) + && ( !ForesterUtil.isEmpty( n2.getNodeData().getTaxonomy().getScientificName() ) ) ) { + return n1.getNodeData().getTaxonomy().getScientificName().toLowerCase() + .compareTo( n2.getNodeData().getTaxonomy().getScientificName().toLowerCase() ); + } + if ( ( !ForesterUtil.isEmpty( n1.getNodeData().getTaxonomy().getTaxonomyCode() ) ) + && ( !ForesterUtil.isEmpty( n2.getNodeData().getTaxonomy().getTaxonomyCode() ) ) ) { + return n1.getNodeData().getTaxonomy().getTaxonomyCode() + .compareTo( n2.getNodeData().getTaxonomy().getTaxonomyCode() ); + } + if ( ( !ForesterUtil.isEmpty( n1.getNodeData().getTaxonomy().getCommonName() ) ) + && ( !ForesterUtil.isEmpty( n2.getNodeData().getTaxonomy().getCommonName() ) ) ) { + return n1.getNodeData().getTaxonomy().getCommonName().toLowerCase() + .compareTo( n2.getNodeData().getTaxonomy().getCommonName().toLowerCase() ); + } + } + if ( n1.getNodeData().isHasSequence() && n2.getNodeData().isHasSequence() ) { + if ( ( !ForesterUtil.isEmpty( n1.getNodeData().getSequence().getName() ) ) + && ( !ForesterUtil.isEmpty( n2.getNodeData().getSequence().getName() ) ) ) { + return n1.getNodeData().getSequence().getName().toLowerCase() + .compareTo( n2.getNodeData().getSequence().getName().toLowerCase() ); + } + if ( ( !ForesterUtil.isEmpty( n1.getNodeData().getSequence().getSymbol() ) ) + && ( !ForesterUtil.isEmpty( n2.getNodeData().getSequence().getSymbol() ) ) ) { + return n1.getNodeData().getSequence().getSymbol() + .compareTo( n2.getNodeData().getSequence().getSymbol() ); + } + if ( ( n1.getNodeData().getSequence().getAccession() != null ) + && ( n2.getNodeData().getSequence().getAccession() != null ) + && !ForesterUtil.isEmpty( n1.getNodeData().getSequence().getAccession().getValue() ) + && !ForesterUtil.isEmpty( n2.getNodeData().getSequence().getAccession().getValue() ) ) { + return n1.getNodeData().getSequence().getAccession().getValue() + .compareTo( n2.getNodeData().getSequence().getAccession().getValue() ); + } + } + return 0; + } + } + Comparator c; + switch ( pri ) { + case SEQUENCE: + c = new PhylogenyNodeSortSequencePriority(); + break; + case NODE_NAME: + c = new PhylogenyNodeSortNodeNamePriority(); + break; + default: + c = new PhylogenyNodeSortTaxonomyPriority(); + } + final List descs = node.getDescendants(); + Collections.sort( descs, c ); + int i = 0; + for( final PhylogenyNode desc : descs ) { + node.setChildNode( i++, desc ); + } + } + + final static public void transferNodeNameToField( final Phylogeny phy, + final PhylogenyMethods.PhylogenyNodeField field, + final boolean external_only ) throws PhyloXmlDataFormatException { + final PhylogenyNodeIterator it = phy.iteratorPostorder(); + while ( it.hasNext() ) { + final PhylogenyNode n = it.next(); + if ( external_only && n.isInternal() ) { + continue; + } + final String name = n.getName().trim(); + if ( !ForesterUtil.isEmpty( name ) ) { + switch ( field ) { + case TAXONOMY_CODE: + n.setName( "" ); + setTaxonomyCode( n, name ); + break; + case TAXONOMY_SCIENTIFIC_NAME: + n.setName( "" ); + if ( !n.getNodeData().isHasTaxonomy() ) { + n.getNodeData().setTaxonomy( new Taxonomy() ); + } + n.getNodeData().getTaxonomy().setScientificName( name ); + break; + case TAXONOMY_COMMON_NAME: + n.setName( "" ); + if ( !n.getNodeData().isHasTaxonomy() ) { + n.getNodeData().setTaxonomy( new Taxonomy() ); + } + n.getNodeData().getTaxonomy().setCommonName( name ); + break; + case SEQUENCE_SYMBOL: + n.setName( "" ); + if ( !n.getNodeData().isHasSequence() ) { + n.getNodeData().setSequence( new Sequence() ); + } + n.getNodeData().getSequence().setSymbol( name ); + break; + case SEQUENCE_NAME: + n.setName( "" ); + if ( !n.getNodeData().isHasSequence() ) { + n.getNodeData().setSequence( new Sequence() ); + } + n.getNodeData().getSequence().setName( name ); + break; + case TAXONOMY_ID_UNIPROT_1: { + if ( !n.getNodeData().isHasTaxonomy() ) { + n.getNodeData().setTaxonomy( new Taxonomy() ); + } + String id = name; + final int i = name.indexOf( '_' ); + if ( i > 0 ) { + id = name.substring( 0, i ); + } + else { + n.setName( "" ); + } + n.getNodeData().getTaxonomy() + .setIdentifier( new Identifier( id, PhyloXmlUtil.UNIPROT_TAX_PROVIDER ) ); + break; + } + case TAXONOMY_ID_UNIPROT_2: { + if ( !n.getNodeData().isHasTaxonomy() ) { + n.getNodeData().setTaxonomy( new Taxonomy() ); + } + String id = name; + final int i = name.indexOf( '_' ); + if ( i > 0 ) { + id = name.substring( i + 1, name.length() ); + } + else { + n.setName( "" ); + } + n.getNodeData().getTaxonomy() + .setIdentifier( new Identifier( id, PhyloXmlUtil.UNIPROT_TAX_PROVIDER ) ); + break; + } + case TAXONOMY_ID: { + if ( !n.getNodeData().isHasTaxonomy() ) { + n.getNodeData().setTaxonomy( new Taxonomy() ); + } + n.getNodeData().getTaxonomy().setIdentifier( new Identifier( name ) ); + break; + } + } + } + } } static double addPhylogenyDistances( final double a, final double b ) { @@ -178,19 +580,17 @@ public class PhylogenyMethods { else if ( b >= 0.0 ) { return b; } - return PhylogenyNode.DISTANCE_DEFAULT; + return PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT; } - // Helper for getUltraParalogousNodes( PhylogenyNode ). - public static boolean areAllChildrenDuplications( final PhylogenyNode n ) { + public final static boolean isAllDecendentsAreDuplications( final PhylogenyNode n ) { if ( n.isExternal() ) { - return false; + return true; } else { if ( n.isDuplication() ) { - //FIXME test me! for( final PhylogenyNode desc : n.getDescendants() ) { - if ( !areAllChildrenDuplications( desc ) ) { + if ( !isAllDecendentsAreDuplications( desc ) ) { return false; } } @@ -202,28 +602,6 @@ public class PhylogenyMethods { } } - public static int calculateDepth( final PhylogenyNode node ) { - PhylogenyNode n = node; - int steps = 0; - while ( !n.isRoot() ) { - steps++; - n = n.getParent(); - } - return steps; - } - - public static double calculateDistanceToRoot( final PhylogenyNode node ) { - PhylogenyNode n = node; - double d = 0.0; - while ( !n.isRoot() ) { - if ( n.getDistanceToParent() > 0.0 ) { - d += n.getDistanceToParent(); - } - n = n.getParent(); - } - return d; - } - public static short calculateMaxBranchesToLeaf( final PhylogenyNode node ) { if ( node.isExternal() ) { return 0; @@ -251,7 +629,7 @@ public class PhylogenyMethods { int max = 0; for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) { final PhylogenyNode node = iter.next(); - final int steps = calculateDepth( node ); + final int steps = node.calculateDepth(); if ( steps > max ) { max = steps; } @@ -263,7 +641,7 @@ public class PhylogenyMethods { double max = 0.0; for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) { final PhylogenyNode node = iter.next(); - final double d = calculateDistanceToRoot( node ); + final double d = node.calculateDistanceToRoot(); if ( d > max ) { max = d; } @@ -271,15 +649,65 @@ public class PhylogenyMethods { return max; } - public static int calculateMaximumNumberOfDescendantsPerNode( final Phylogeny phy ) { - int max = 0; + public static int countNumberOfPolytomies( final Phylogeny phy ) { + int count = 0; for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { - final PhylogenyNode node = iter.next(); - if ( node.getNumberOfDescendants() > max ) { - max = node.getNumberOfDescendants(); + final PhylogenyNode n = iter.next(); + if ( !n.isExternal() && ( n.getNumberOfDescendants() > 2 ) ) { + count++; } } - return max; + return count; + } + + public static DescriptiveStatistics calculatNumberOfDescendantsPerNodeStatistics( final Phylogeny phy ) { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { + final PhylogenyNode n = iter.next(); + if ( !n.isExternal() ) { + stats.addValue( n.getNumberOfDescendants() ); + } + } + return stats; + } + + public static DescriptiveStatistics calculatBranchLengthStatistics( final Phylogeny phy ) { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { + final PhylogenyNode n = iter.next(); + if ( !n.isRoot() && ( n.getDistanceToParent() >= 0.0 ) ) { + stats.addValue( n.getDistanceToParent() ); + } + } + return stats; + } + + public static List calculatConfidenceStatistics( final Phylogeny phy ) { + final List stats = new ArrayList(); + for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { + final PhylogenyNode n = iter.next(); + if ( !n.isExternal() && !n.isRoot() ) { + if ( n.getBranchData().isHasConfidences() ) { + for( int i = 0; i < n.getBranchData().getConfidences().size(); ++i ) { + final Confidence c = n.getBranchData().getConfidences().get( i ); + if ( ( i > ( stats.size() - 1 ) ) || ( stats.get( i ) == null ) ) { + stats.add( i, new BasicDescriptiveStatistics() ); + } + if ( !ForesterUtil.isEmpty( c.getType() ) ) { + if ( !ForesterUtil.isEmpty( stats.get( i ).getDescription() ) ) { + if ( !stats.get( i ).getDescription().equalsIgnoreCase( c.getType() ) ) { + throw new IllegalArgumentException( "support values in node [" + n.toString() + + "] appear inconsistently ordered" ); + } + } + stats.get( i ).setDescription( c.getType() ); + } + stats.get( i ).addValue( ( ( c != null ) && ( c.getValue() >= 0 ) ) ? c.getValue() : 0 ); + } + } + } + } + return stats; } /** @@ -374,31 +802,33 @@ public class PhylogenyMethods { } public static void deleteExternalNodesNegativeSelection( final Set to_delete, final Phylogeny phy ) { - phy.hashIDs(); + phy.clearHashIdToNodeMap(); for( final Integer id : to_delete ) { phy.deleteSubtree( phy.getNode( id ), true ); } - phy.hashIDs(); + phy.clearHashIdToNodeMap(); + phy.externalNodesHaveChanged(); } public static void deleteExternalNodesNegativeSelection( final String[] node_names_to_delete, final Phylogeny p ) throws IllegalArgumentException { - for( int i = 0; i < node_names_to_delete.length; ++i ) { - if ( ForesterUtil.isEmpty( node_names_to_delete[ i ] ) ) { + for( final String element : node_names_to_delete ) { + if ( ForesterUtil.isEmpty( element ) ) { continue; } List nodes = null; - nodes = p.getNodes( node_names_to_delete[ i ] ); + nodes = p.getNodes( element ); final Iterator it = nodes.iterator(); while ( it.hasNext() ) { final PhylogenyNode n = it.next(); if ( !n.isExternal() ) { - throw new IllegalArgumentException( "attempt to delete non-external node \"" - + node_names_to_delete[ i ] + "\"" ); + throw new IllegalArgumentException( "attempt to delete non-external node \"" + element + "\"" ); } p.deleteSubtree( n, true ); } } + p.clearHashIdToNodeMap(); + p.externalNodesHaveChanged(); } public static void deleteExternalNodesPositiveSelection( final Set species_to_keep, final Phylogeny phy ) { @@ -415,9 +845,8 @@ public class PhylogenyMethods { throw new IllegalArgumentException( "node " + n.getId() + " has no taxonomic data" ); } } - phy.hashIDs(); + phy.clearHashIdToNodeMap(); phy.externalNodesHaveChanged(); - // deleteExternalNodesNegativeSelection( to_delete, phy ); } public static List deleteExternalNodesPositiveSelection( final String[] node_names_to_keep, @@ -611,12 +1040,12 @@ public class PhylogenyMethods { if ( !node.getNodeData().isHasTaxonomy() ) { return ""; } - if ( !ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getTaxonomyCode() ) ) { - return node.getNodeData().getTaxonomy().getTaxonomyCode(); - } else if ( !ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getScientificName() ) ) { return node.getNodeData().getTaxonomy().getScientificName(); } + if ( !ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getTaxonomyCode() ) ) { + return node.getNodeData().getTaxonomy().getTaxonomyCode(); + } else { return node.getNodeData().getTaxonomy().getCommonName(); } @@ -717,9 +1146,9 @@ public class PhylogenyMethods { // FIXME test me PhylogenyNode node = n; if ( !node.isExternal() ) { - return null; + throw new IllegalArgumentException( "attempt to get ultra-paralogous nodes of internal node" ); } - while ( !node.isRoot() && node.getParent().isDuplication() && areAllChildrenDuplications( node.getParent() ) ) { + while ( !node.isRoot() && node.getParent().isDuplication() && isAllDecendentsAreDuplications( node.getParent() ) ) { node = node.getParent(); } final List nodes = node.getAllExternalDescendants(); @@ -884,8 +1313,9 @@ public class PhylogenyMethods { double blue = 0.0; int n = 0; if ( node.isInternal() ) { - for( final PhylogenyNodeIterator iterator = node.iterateChildNodesForward(); iterator.hasNext(); ) { - final PhylogenyNode child_node = iterator.next(); + //for( final PhylogenyNodeIterator iterator = node.iterateChildNodesForward(); iterator.hasNext(); ) { + for( int i = 0; i < node.getNumberOfDescendants(); ++i ) { + final PhylogenyNode child_node = node.getChildNode( i ); final Color child_color = getBranchColorValue( child_node ); if ( child_color != null ) { ++n; @@ -894,8 +1324,10 @@ public class PhylogenyMethods { blue += child_color.getBlue(); } } - setBranchColorValue( node, new Color( ForesterUtil.roundToInt( red / n ), ForesterUtil - .roundToInt( green / n ), ForesterUtil.roundToInt( blue / n ) ) ); + setBranchColorValue( node, + new Color( ForesterUtil.roundToInt( red / n ), + ForesterUtil.roundToInt( green / n ), + ForesterUtil.roundToInt( blue / n ) ) ); } } } @@ -906,6 +1338,8 @@ public class PhylogenyMethods { } if ( remove_me.isExternal() ) { phylogeny.deleteSubtree( remove_me, false ); + phylogeny.clearHashIdToNodeMap(); + phylogeny.externalNodesHaveChanged(); } else { final PhylogenyNode parent = remove_me.getParent(); @@ -913,11 +1347,11 @@ public class PhylogenyMethods { parent.removeChildNode( remove_me ); for( final PhylogenyNode desc : descs ) { parent.addAsChild( desc ); - desc.setDistanceToParent( addPhylogenyDistances( remove_me.getDistanceToParent(), desc - .getDistanceToParent() ) ); + desc.setDistanceToParent( addPhylogenyDistances( remove_me.getDistanceToParent(), + desc.getDistanceToParent() ) ); } remove_me.setParent( null ); - phylogeny.setIdHash( null ); + phylogeny.clearHashIdToNodeMap(); phylogeny.externalNodesHaveChanged(); } } @@ -925,7 +1359,8 @@ public class PhylogenyMethods { public static List searchData( final String query, final Phylogeny phy, final boolean case_sensitive, - final boolean partial ) { + final boolean partial, + final boolean search_domains ) { final List nodes = new ArrayList(); if ( phy.isEmpty() || ( query == null ) ) { return nodes; @@ -968,15 +1403,16 @@ public class PhylogenyMethods { } } } - else if ( node.getNodeData().isHasSequence() + if ( !match && node.getNodeData().isHasSequence() && match( node.getNodeData().getSequence().getName(), query, case_sensitive, partial ) ) { match = true; } - else if ( node.getNodeData().isHasSequence() + if ( !match && node.getNodeData().isHasSequence() && match( node.getNodeData().getSequence().getSymbol(), query, case_sensitive, partial ) ) { match = true; } - else if ( node.getNodeData().isHasSequence() + if ( !match + && node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null ) && match( node.getNodeData().getSequence().getAccession().getValue(), query, @@ -984,7 +1420,7 @@ public class PhylogenyMethods { partial ) ) { match = true; } - else if ( node.getNodeData().isHasSequence() + if ( search_domains && !match && node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getDomainArchitecture() != null ) ) { final DomainArchitecture da = node.getNodeData().getSequence().getDomainArchitecture(); I: for( int i = 0; i < da.getNumberOfDomains(); ++i ) { @@ -994,6 +1430,22 @@ public class PhylogenyMethods { } } } + if ( !match && ( node.getNodeData().getBinaryCharacters() != null ) ) { + Iterator it = node.getNodeData().getBinaryCharacters().getPresentCharacters().iterator(); + I: while ( it.hasNext() ) { + if ( match( it.next(), query, case_sensitive, partial ) ) { + match = true; + break I; + } + } + it = node.getNodeData().getBinaryCharacters().getGainedCharacters().iterator(); + I: while ( it.hasNext() ) { + if ( match( it.next(), query, case_sensitive, partial ) ) { + match = true; + break I; + } + } + } if ( match ) { nodes.add( node ); } @@ -1004,7 +1456,8 @@ public class PhylogenyMethods { public static List searchDataLogicalAnd( final String[] queries, final Phylogeny phy, final boolean case_sensitive, - final boolean partial ) { + final boolean partial, + final boolean search_domains ) { final List nodes = new ArrayList(); if ( phy.isEmpty() || ( queries == null ) || ( queries.length < 1 ) ) { return nodes; @@ -1050,15 +1503,16 @@ public class PhylogenyMethods { } } } - else if ( node.getNodeData().isHasSequence() + if ( !match && node.getNodeData().isHasSequence() && match( node.getNodeData().getSequence().getName(), query, case_sensitive, partial ) ) { match = true; } - else if ( node.getNodeData().isHasSequence() + if ( !match && node.getNodeData().isHasSequence() && match( node.getNodeData().getSequence().getSymbol(), query, case_sensitive, partial ) ) { match = true; } - else if ( node.getNodeData().isHasSequence() + if ( !match + && node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null ) && match( node.getNodeData().getSequence().getAccession().getValue(), query, @@ -1066,7 +1520,7 @@ public class PhylogenyMethods { partial ) ) { match = true; } - else if ( node.getNodeData().isHasSequence() + if ( search_domains && !match && node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getDomainArchitecture() != null ) ) { final DomainArchitecture da = node.getNodeData().getSequence().getDomainArchitecture(); I: for( int i = 0; i < da.getNumberOfDomains(); ++i ) { @@ -1076,6 +1530,22 @@ public class PhylogenyMethods { } } } + if ( !match && ( node.getNodeData().getBinaryCharacters() != null ) ) { + Iterator it = node.getNodeData().getBinaryCharacters().getPresentCharacters().iterator(); + I: while ( it.hasNext() ) { + if ( match( it.next(), query, case_sensitive, partial ) ) { + match = true; + break I; + } + } + it = node.getNodeData().getBinaryCharacters().getGainedCharacters().iterator(); + I: while ( it.hasNext() ) { + if ( match( it.next(), query, case_sensitive, partial ) ) { + match = true; + break I; + } + } + } if ( !match ) { all_matched = false; break; @@ -1148,8 +1618,10 @@ public class PhylogenyMethods { * * @param node * @param taxonomy_code + * @throws PhyloXmlDataFormatException */ - public static void setTaxonomyCode( final PhylogenyNode node, final String taxonomy_code ) { + public static void setTaxonomyCode( final PhylogenyNode node, final String taxonomy_code ) + throws PhyloXmlDataFormatException { if ( !node.getNodeData().isHasTaxonomy() ) { node.getNodeData().setTaxonomy( new Taxonomy() ); } @@ -1168,19 +1640,98 @@ public class PhylogenyMethods { */ public static int taxonomyBasedDeletionOfExternalNodes( final Phylogeny reference, final Phylogeny to_be_stripped ) { final Set ref_ext_taxo = new HashSet(); - final ArrayList nodes_to_delete = new ArrayList(); for( final PhylogenyNodeIterator it = reference.iteratorExternalForward(); it.hasNext(); ) { - ref_ext_taxo.add( getSpecies( it.next() ) ); + final PhylogenyNode n = it.next(); + if ( !n.getNodeData().isHasTaxonomy() ) { + throw new IllegalArgumentException( "no taxonomic data in node: " + n ); + } + // ref_ext_taxo.add( getSpecies( n ) ); + if ( !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getScientificName() ) ) { + ref_ext_taxo.add( n.getNodeData().getTaxonomy().getScientificName() ); + } + if ( !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getTaxonomyCode() ) ) { + ref_ext_taxo.add( n.getNodeData().getTaxonomy().getTaxonomyCode() ); + } } + final ArrayList nodes_to_delete = new ArrayList(); for( final PhylogenyNodeIterator it = to_be_stripped.iteratorExternalForward(); it.hasNext(); ) { final PhylogenyNode n = it.next(); - if ( !ref_ext_taxo.contains( getSpecies( n ) ) ) { + if ( !n.getNodeData().isHasTaxonomy() ) { + nodes_to_delete.add( n ); + } + else if ( !( ref_ext_taxo.contains( n.getNodeData().getTaxonomy().getScientificName() ) ) + && !( ref_ext_taxo.contains( n.getNodeData().getTaxonomy().getTaxonomyCode() ) ) ) { nodes_to_delete.add( n ); } } for( final PhylogenyNode phylogenyNode : nodes_to_delete ) { to_be_stripped.deleteSubtree( phylogenyNode, true ); } + to_be_stripped.clearHashIdToNodeMap(); + to_be_stripped.externalNodesHaveChanged(); return nodes_to_delete.size(); } + + /** + * Arranges the order of childern for each node of this Phylogeny in such a + * way that either the branch with more children is on top (right) or on + * bottom (left), dependent on the value of boolean order. + * + * @param order + * decides in which direction to order + * @param pri + */ + public static void orderAppearance( final PhylogenyNode n, + final boolean order, + final boolean order_ext_alphabetically, + final DESCENDANT_SORT_PRIORITY pri ) { + if ( n.isExternal() ) { + return; + } + else { + PhylogenyNode temp = null; + if ( ( n.getNumberOfDescendants() == 2 ) + && ( n.getChildNode1().getNumberOfExternalNodes() != n.getChildNode2().getNumberOfExternalNodes() ) + && ( ( n.getChildNode1().getNumberOfExternalNodes() < n.getChildNode2().getNumberOfExternalNodes() ) == order ) ) { + temp = n.getChildNode1(); + n.setChild1( n.getChildNode2() ); + n.setChild2( temp ); + } + else if ( order_ext_alphabetically ) { + boolean all_ext = true; + for( final PhylogenyNode i : n.getDescendants() ) { + if ( !i.isExternal() ) { + all_ext = false; + break; + } + } + if ( all_ext ) { + PhylogenyMethods.sortNodeDescendents( n, pri ); + } + } + for( int i = 0; i < n.getNumberOfDescendants(); ++i ) { + orderAppearance( n.getChildNode( i ), order, order_ext_alphabetically, pri ); + } + } + } + + public static enum PhylogenyNodeField { + CLADE_NAME, + TAXONOMY_CODE, + TAXONOMY_SCIENTIFIC_NAME, + TAXONOMY_COMMON_NAME, + SEQUENCE_SYMBOL, + SEQUENCE_NAME, + TAXONOMY_ID_UNIPROT_1, + TAXONOMY_ID_UNIPROT_2, + TAXONOMY_ID; + } + + public static enum TAXONOMY_EXTRACTION { + NO, YES, PFAM_STYLE_ONLY; + } + + public static enum DESCENDANT_SORT_PRIORITY { + TAXONOMY, SEQUENCE, NODE_NAME; + } }