From f4f1b0cf648f352e9d3dcaa62aef22355cc801ad Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Fri, 18 Aug 2017 14:49:00 -0700 Subject: [PATCH] in progress... --- .../src/org/forester/clade_analysis/Analysis2.java | 121 +++++++++------- .../forester/clade_analysis/CladeAnalysisTest.java | 50 +++++++ .../src/org/forester/clade_analysis/Result2.java | 19 ++- .../java/src/org/forester/util/ForesterUtil.java | 4 + forester/test_data/pplacer_2.tre | 148 ++++++++++++++++++++ 5 files changed, 294 insertions(+), 48 deletions(-) create mode 100644 forester/test_data/pplacer_2.tre diff --git a/forester/java/src/org/forester/clade_analysis/Analysis2.java b/forester/java/src/org/forester/clade_analysis/Analysis2.java index 4c1d368..f3b8cae 100644 --- a/forester/java/src/org/forester/clade_analysis/Analysis2.java +++ b/forester/java/src/org/forester/clade_analysis/Analysis2.java @@ -33,6 +33,8 @@ package org.forester.clade_analysis; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyNode; @@ -41,29 +43,62 @@ import org.forester.util.ForesterUtil; public final class Analysis2 { - public static Result2 execute( final Phylogeny p, final String query, final String separator ) { - final PhylogenyNode qnode = p.getNode( query ); - if ( qnode.isRoot() ) { - throw new IllegalStateException( "Unexpected error: Query " + query - + " is root. This should have never happened" ); - } - if ( qnode.getParent().isRoot() ) { - throw new IllegalStateException( "Unexpected error: Parent of query " + query - + " is root. This should have never happened" ); - } - PhylogenyNode qnode_p = qnode.getParent(); - PhylogenyNode qnode_pp = qnode.getParent().getParent(); - while ( qnode_p.getNumberOfDescendants() == 1 ) { - qnode_p = qnode_p.getParent(); - } - while ( qnode_pp.getNumberOfDescendants() == 1 ) { - qnode_pp = qnode_pp.getParent(); + public static Result2 execute( final Phylogeny p, final Pattern query, final String separator ) { + final List qnodes = p.getNodes( query ); + final Result2 res = new Result2(); + for( int i = 0; i < qnodes.size(); ++i ) { + final PhylogenyNode qnode = qnodes.get( i ); + System.out.println( ">>" + qnode.getName() ); + if ( qnode.isRoot() ) { + throw new IllegalArgumentException( "Query " + query + " is root." ); + } + if ( qnode.getParent().isRoot() ) { + throw new IllegalArgumentException( "Parent of query " + query + " is root." ); + } + PhylogenyNode qnode_p = qnode.getParent(); + PhylogenyNode qnode_pp = qnode.getParent().getParent(); + //This is to deal with internal nodes with 1 descendant. + while ( qnode_p.getNumberOfDescendants() == 1 ) { + qnode_p = qnode_p.getParent(); + } + while ( qnode_pp.getNumberOfDescendants() == 1 ) { + qnode_pp = qnode_pp.getParent(); + } + // final List qnode_ext_nodes = new ArrayList(); + final List qnode_ext_nodes_names = new ArrayList<>(); + for( final PhylogenyNode qnode_ext_node : qnode_pp.getAllExternalDescendants() ) { + final String name = qnode_ext_node.getName(); + if ( ForesterUtil.isEmptyTrimmed( name ) ) { + throw new IllegalArgumentException( "external node(s) with empty names found" ); + } + final Matcher m = query.matcher( name ); + if ( !m.find() ) { + qnode_ext_nodes_names.add( name ); + } + } + final int lec_ext_nodes = qnode_ext_nodes_names.size(); + final int p_ext_nodes = p.getNumberOfExternalNodes() - 1; + final String greatest_common_prefix = ForesterUtil.greatestCommonPrefix( qnode_ext_nodes_names, separator ); + System.out.println( greatest_common_prefix ); + Matcher matcher = query.matcher( qnode.getName() ); + String conf_str = null; + if ( matcher.find() ) { + conf_str = matcher.group( 1 ); + } + else { + throw new IllegalStateException( "pattern did not match -- this should have never happened!" ); + } + res.setLeastEncompassingCladeSize( lec_ext_nodes ); + res.setTreeSize( p_ext_nodes ); + final double conf = Double.parseDouble( conf_str ); + if ( !ForesterUtil.isEmpty( greatest_common_prefix ) ) { + res.addGreatestCommonPrefix( greatest_common_prefix, conf ); + } + else { + res.addGreatestCommonPrefix( "?", conf ); + } } - final List qnode_ext_nodes = qnode_pp.getAllExternalDescendants(); - final int lec_ext_nodes = qnode_ext_nodes.size() - 1; - final int p_ext_nodes = p.getNumberOfExternalNodes() - 1; - final List qnode_ext_nodes_names = new ArrayList<>(); - for( final PhylogenyNode qnode_ext_node : qnode_ext_nodes ) { + /* for( final PhylogenyNode qnode_ext_node : qnode_ext_nodes ) { String name = qnode_ext_node.getName(); if ( ForesterUtil.isEmptyTrimmed( name ) ) { throw new IllegalArgumentException( "external node(s) with empty names found" ); @@ -72,29 +107,23 @@ public final class Analysis2 { if ( !name.equals( query ) ) { qnode_ext_nodes_names.add( name ); } - } - final String greatest_common_prefix = ForesterUtil.greatestCommonPrefix( qnode_ext_nodes_names, separator ); - final Result2 res = new Result2(); - if ( greatest_common_prefix.length() < 1 ) { - res.addWarning( "No greatest common prefix" ); - //res.setGreatestCommonPrefix( "" ); - } - else { - // res.setGreatestCommonPrefix( greatest_common_prefix ); - // res.addGreatestCommonPrefix( prefix, confidence, separator ); //TODO - } - if ( qnode_pp.isRoot() ) { - res.addWarning( "Least Encompassing Clade is entire tree" ); - } - res.setLeastEncompassingCladeSize( lec_ext_nodes ); - res.setTreeSize( p_ext_nodes ); - - final String conf = obtainConfidence( qnode_pp ); + }*/ + // if ( greatest_common_prefix.length() < 1 ) { + // res.addWarning( "No greatest common prefix" ); + //res.setGreatestCommonPrefix( "" ); + // } + // else { + // // res.setGreatestCommonPrefix( greatest_common_prefix ); + // res.addGreatestCommonPrefix( prefix, confidence, separator ); //TODO + // } + // if ( qnode_pp.isRoot() ) { + // res.addWarning( "Least Encompassing Clade is entire tree" ); + // } + /* final String conf = obtainConfidence( qnode_pp ); if ( conf != null ) { res.setGreatestCommonCladeSubtreeConfidence(conf); - } - - final String greatest_common_prefix_up[] = analyzeSiblings( qnode_p, qnode_pp, separator ); + }*/ + /* final String greatest_common_prefix_up[] = analyzeSiblings( qnode_p, qnode_pp, separator ); res.setGreatestCommonPrefixUp( greatest_common_prefix_up[ 0 ] ); if ( greatest_common_prefix_up[ 1 ] != null ) { res.setGreatestCommonCladeUpSubtreeConfidence( greatest_common_prefix_up[ 1 ] ); @@ -103,12 +132,10 @@ public final class Analysis2 { res.setGreatestCommonPrefixDown( greatest_common_prefix_down[ 0 ] ); if ( greatest_common_prefix_down[ 1 ] != null ) { res.setGreatestCommonCladeDownSubtreeConfidence( greatest_common_prefix_down[ 1 ] ); - } + }*/ return res; } - - private final static String[] analyzeSiblings( final PhylogenyNode child, final PhylogenyNode parent, final String separator ) { @@ -134,7 +161,7 @@ public final class Analysis2 { final String greatest_common_prefix = ForesterUtil.greatestCommonPrefix( ext_nodes_names, separator ); return new String[] { greatest_common_prefix, conf }; } - + private final static String obtainConfidence( final PhylogenyNode n ) { if ( n.getBranchData().getConfidences() != null && n.getBranchData().getConfidences().size() > 0 ) { final List confidences = n.getBranchData().getConfidences(); diff --git a/forester/java/src/org/forester/clade_analysis/CladeAnalysisTest.java b/forester/java/src/org/forester/clade_analysis/CladeAnalysisTest.java index 3d1f4b9..207b57c 100644 --- a/forester/java/src/org/forester/clade_analysis/CladeAnalysisTest.java +++ b/forester/java/src/org/forester/clade_analysis/CladeAnalysisTest.java @@ -3,6 +3,7 @@ package org.forester.clade_analysis; import java.io.File; import java.util.List; +import java.util.regex.Pattern; import org.forester.io.parsers.PhylogenyParser; import org.forester.io.parsers.util.ParserUtils; @@ -30,6 +31,10 @@ public class CladeAnalysisTest { System.out.println( "Clade analysis 3 failed" ); failed = true; } + if ( !testCladeAnalysis4() ) { + System.out.println( "Clade analysis 3 failed" ); + failed = true; + } if ( !failed ) { System.out.println( "OK" ); } @@ -45,6 +50,9 @@ public class CladeAnalysisTest { if ( !testCladeAnalysis3() ) { return false; } + if ( !testCladeAnalysis4() ) { + return false; + } return true; } @@ -711,4 +719,46 @@ public class CladeAnalysisTest { } return true; } + + private static boolean testCladeAnalysis4() { + try { + final File intreefile1 = new File( PATH_TO_TEST_DATA + "pplacer_2.tre" ); + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final PhylogenyParser pp = ParserUtils.createParserDependingOnFileType( intreefile1, true ); + final Phylogeny p1 = factory.create( intreefile1, pp )[ 0 ]; + Pattern query = Pattern.compile(".+#\\d+_M=(.+)"); + Result2 res = Analysis2.execute( p1, query, "." ); + + res.analyzeGreatestCommonPrefixes( 0.3 ); + System.out.print( res.toString()); + System.out.println( "------------------------- "); + System.out.println(); + + // Result res = Analysis.execute( p1, "A.1.1.1", "." ); + /* if ( !res.getGreatestCommonPrefix().equals( "A.1" ) ) { + return false; + } + if ( !res.getGreatestCommonPrefixDown().equals( "A.1.1" ) ) { + return false; + } + if ( !res.getGreatestCommonPrefixUp().equals( "A.1.2.1" ) ) { + return false; + } + if ( res.getLeastEncompassingCladeSize() != 4 ) { + return false; + } + if ( res.getTreeSize() != 25 ) { + return false; + } + if ( res.getWarnings().size() != 0 ) { + return false; + }*/ + + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } } diff --git a/forester/java/src/org/forester/clade_analysis/Result2.java b/forester/java/src/org/forester/clade_analysis/Result2.java index 81353c3..2bd4911 100644 --- a/forester/java/src/org/forester/clade_analysis/Result2.java +++ b/forester/java/src/org/forester/clade_analysis/Result2.java @@ -62,6 +62,23 @@ public final class Result2 { _separator = ".";//TODO make const somewhere } + public List getAllMultiHitPrefixes() { + return _all; + } + + public List getCollapsedMultiHitPrefixes() { + return _collapsed; + } + + public List getSpecificMultiHitPrefixes() { + return _cleaned_spec; + } + + public boolean isHasSpecificMultiHitsPrefixes() { + return _has_specifics; + } + + void addWarning( final String warning ) { _warnings.add( warning ); } @@ -201,7 +218,7 @@ public final class Result2 { confidence_sum += prefix.getConfidence(); } } - if ( !ForesterUtil.isEqual( confidence_sum, 1.0 ) ) { + if ( !ForesterUtil.isEqual( confidence_sum, 1.0, 1E-5 ) ) { throw new IllegalArgumentException( "Confidences add up to " + confidence_sum + " instead of 1.0" ); } return collapsed; diff --git a/forester/java/src/org/forester/util/ForesterUtil.java b/forester/java/src/org/forester/util/ForesterUtil.java index 3d9888b..e453dad 100644 --- a/forester/java/src/org/forester/util/ForesterUtil.java +++ b/forester/java/src/org/forester/util/ForesterUtil.java @@ -621,6 +621,10 @@ public final class ForesterUtil { final public static boolean isEqual( final double a, final double b ) { return ( ( Math.abs( a - b ) ) < ZERO_DIFF ); } + + final public static boolean isEqual( final double a, final double b, final double tolerance ) { + return ( ( Math.abs( a - b ) ) < tolerance ); + } final public static boolean isEven( final int n ) { return ( n % 2 ) == 0; diff --git a/forester/test_data/pplacer_2.tre b/forester/test_data/pplacer_2.tre new file mode 100644 index 0000000..f28c394 --- /dev/null +++ b/forester/test_data/pplacer_2.tre @@ -0,0 +1,148 @@ + + + + + 0.0 + + 0.0679195 + + 0.21174 + + 0.477305 + + 0.309716 + + 0.0152436 + + 0.0857918 + + 0.162176 + + 9.756E-6 + + 0.0802987 + + 0.0684959 + + 0.0761231 + + 0.107021 + + 1.14092 + + A.1.1.1 + 1.0E-6 + + + A.1.1.2 + 0.043972 + + + + CED9_CAEBR_#5_M=0.0277996 + 1.49689 + + + + A.1.1.3 + 1.11622 + + + + CED9_CAEBR_#6_M=0.0273544 + 1.58319 + + + + 0.760242 + + A.1.2.1 + 0.130667 + + + A.1.2.2 + 0.127953 + + + + + CED9_CAEBR_#4_M=0.0552666 + 1.60222 + + + + 7.591E-6 + + A.2.1.1 + 1.00994 + + + CED9_CAEBR_#3_M=0.0552703 + 1.60221 + + + + + CED9_CAEBR_#0_M=0.380211 + 1.54796 + + + + 1.257E-5 + + 1.11517 + + 0.0852309 + + A.3.1.1 + 0.022644 + + + A.3.1.1 + 0.017626 + + + + A.3.2.1 + 0.156409 + + + + CED9_CAEBR_#2_M=0.224819 + 1.56994 + + + + + CED9_CAEBR_#1_M=0.229279 + 1.56987 + + + + C.5 + 0.367867 + + + + + + A.6 + 0.030507 + + + + A.7 + 0.026535 + + + + A.8 + 0.035019 + + + B.9 + 1.0E-6 + + + + \ No newline at end of file -- 1.7.10.2