From 8298cc0323b6f4d4a905f31512f0b3adcc76f925 Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Thu, 10 Aug 2017 17:57:42 -0700 Subject: [PATCH] in progress... --- .../src/org/forester/application/cladinator.java | 77 +++++++++++--- .../src/org/forester/clade_analysis/Analysis.java | 35 +++++-- .../src/org/forester/clade_analysis/Result.java | 39 +++++-- forester/java/src/org/forester/test/Test.java | 106 ++++++++++++++++++++ .../java/src/org/forester/util/ForesterUtil.java | 68 +++++++++++-- 5 files changed, 288 insertions(+), 37 deletions(-) diff --git a/forester/java/src/org/forester/application/cladinator.java b/forester/java/src/org/forester/application/cladinator.java index d482f7a..5a31966 100644 --- a/forester/java/src/org/forester/application/cladinator.java +++ b/forester/java/src/org/forester/application/cladinator.java @@ -26,7 +26,10 @@ package org.forester.application; import java.io.File; +import java.io.IOException; import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.List; import org.forester.clade_analysis.Analysis; import org.forester.clade_analysis.Result; @@ -41,13 +44,14 @@ import org.forester.util.ForesterUtil; public final class cladinator { final static private String PRG_NAME = "cladinator"; - final static private String PRG_VERSION = "0.100"; - final static private String PRG_DATE = "170721"; + final static private String PRG_VERSION = "0.101"; + final static private String PRG_DATE = "170810"; final static private String PRG_DESC = "clades within clades -- analysis of pplacer type outputs"; final static private String E_MAIL = "phyloxml@gmail.com"; final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"; final static private String HELP_OPTION_1 = "help"; final static private String HELP_OPTION_2 = "h"; + final static private String SEP_OPTION = "s"; private final static DecimalFormat df2 = new DecimalFormat( ".##" ); public static void main( final String args[] ) { @@ 
-71,51 +75,96 @@ public final class cladinator { print_help(); System.exit( 0 ); } - else if ( ( args.length != 2 ) ) { + else if ( ( args.length != 2 && args.length != 3 ) ) { System.out.println(); System.out.println( "Wrong number of arguments." ); System.out.println(); print_help(); System.exit( -1 ); } - //final List allowed_options = new ArrayList<>(); + final List allowed_options = new ArrayList(); + allowed_options.add( SEP_OPTION ); + final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); + if ( dissallowed_options.length() > 0 ) { + ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options ); + } + final String separator; + if ( cla.isOptionSet( SEP_OPTION ) ) { + separator = cla.getOptionValue( SEP_OPTION ); + } + else { + separator = null; + } final File intreefile = cla.getFile( 0 ); final String query = cla.getName( 1 ); System.out.println( "Input tree: " + intreefile ); - System.out.println( "Query: " + query ); + System.out.println( "Query : " + query ); + if ( !ForesterUtil.isEmpty( separator ) ) { + System.out.println( "Separator : " + separator ); + } + else { + System.out.println( "Separator : none" ); + } Phylogeny p = null; try { final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); final PhylogenyParser pp = ParserUtils.createParserDependingOnFileType( intreefile, true ); p = factory.create( intreefile, pp )[ 0 ]; } - catch ( final Exception e ) { + catch ( final IOException e ) { System.out.println( "\nCould not read \"" + intreefile + "\" [" + e.getMessage() + "]\n" ); System.exit( -1 ); } - final Result res = Analysis.execute( p, query ); + final Result res = Analysis.execute( p, query, separator ); System.out.println(); System.out.println( "Result:" ); - System.out.println( "Greatest common prefix : " + res.getGreatestCommonPrefix() ); - System.out.println( "Greatest common prefix up : " + res.getGreatestCommonPrefixUp() ); - System.out.println( "Greatest common 
prefix down: " + res.getGreatestCommonPrefixDown() ); + System.out.println( "Greatest Common Prefix : " + res.getGreatestCommonPrefix() ); + System.out.println( "Greatest Common Prefix Up : " + res.getGreatestCommonPrefixUp() ); + System.out.println( "Greatest Common Prefix Down : " + res.getGreatestCommonPrefixDown() ); + + if ( !ForesterUtil.isEmpty( res.getGreatestCommonCladeConfidence() ) ) { + System.out.println( "Greatest Common Clade Conf : " + res.getGreatestCommonCladeConfidence() ); + } + if ( !ForesterUtil.isEmpty( res.getGreatestCommonCladeUpConfidence() ) ) { + System.out.println( "Greatest Common Clade Up Conf: " + res.getGreatestCommonCladeUpConfidence() ); + } + if ( !ForesterUtil.isEmpty( res.getGreatestCommonCladeDownConfidence() ) ) { + System.out.println( "Greatest Common Clade Down Conf: " + res.getGreatestCommonCladeDownConfidence() ); + } + + System.out.println( "Least Encompassing Clade size: " + res.getLeastEncompassingCladeSize() + + " external nodes" ); final double lec_ratio = ( 100.0 * res.getLeastEncompassingCladeSize() ) / res.getTreeSize(); - System.out.println( "Least Encompassing Clade has " + res.getLeastEncompassingCladeSize() - + " external nodes (" + df2.format( lec_ratio ) + "% of a total of " + res.getTreeSize() + ")" ); + System.out.println( "Least Encompassing Clade size: " + df2.format( lec_ratio ) + "%" ); + System.out.println( "Total tree size : " + res.getTreeSize() + " external nodes" ); if ( res.getWarnings().size() > 0 ) { System.out.println( "Warnings:" ); for( final String s : res.getWarnings() ) { System.out.println( s ); } } + System.out.println(); } - catch ( final Exception e ) { + catch ( final IllegalArgumentException e ) { ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); } + catch ( final Exception e ) { + e.printStackTrace(); + ForesterUtil.fatalError( PRG_NAME, "Unexpected error!" 
); + } } private final static void print_help() { - System.out.println( "Usage: " + PRG_NAME + " " ); + System.out.println( "Usage:" ); + System.out.println(); + System.out.println( PRG_NAME + " [options] " ); + System.out.println(); + System.out.println( " options:" ); + System.out.println( " -" + SEP_OPTION + "=: the separator to be used" ); + System.out.println(); + System.out.println( "Example:" ); + System.out.println(); + System.out.println( " " + PRG_NAME + " -s=. my_tree.xml A.1.1.1" ); System.out.println(); } } diff --git a/forester/java/src/org/forester/clade_analysis/Analysis.java b/forester/java/src/org/forester/clade_analysis/Analysis.java index bddd518..3b67725 100644 --- a/forester/java/src/org/forester/clade_analysis/Analysis.java +++ b/forester/java/src/org/forester/clade_analysis/Analysis.java @@ -27,8 +27,6 @@ // * Multiple "hits" with different "M" values // * More tests (including multiple children per node), especially on edge cases // * Utilize relevant support values for warnings -// * Better system for "clade label creation" (e.g. 1.3.4 + 1.3.6 -> 1.3), use -// specific separator (eg . | _ ) package org.forester.clade_analysis; @@ -37,11 +35,12 @@ import java.util.List; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Confidence; import org.forester.util.ForesterUtil; public final class Analysis { - public static Result execute( final Phylogeny p, final String query ) { + public static Result execute( final Phylogeny p, final String query, final String separator ) { final PhylogenyNode qnode = p.getNode( query ); if ( qnode.isRoot() ) { throw new IllegalStateException( "Unexpected error: Query " + query @@ -51,8 +50,14 @@ public final class Analysis { throw new IllegalStateException( "Unexpected error: Parent of query " + query + " is root. 
This should have never happened" ); } - final PhylogenyNode qnode_p = qnode.getParent(); - final PhylogenyNode qnode_pp = qnode.getParent().getParent(); + PhylogenyNode qnode_p = qnode.getParent(); + PhylogenyNode qnode_pp = qnode.getParent().getParent(); + while ( qnode_p.getNumberOfDescendants() == 1 ) { + qnode_p = qnode_p.getParent(); + } + while ( qnode_pp.getNumberOfDescendants() == 1 ) { + qnode_pp = qnode_pp.getParent(); + } final List qnode_ext_nodes = qnode_pp.getAllExternalDescendants(); final int lec_ext_nodes = qnode_ext_nodes.size() - 1; final int p_ext_nodes = p.getNumberOfExternalNodes() - 1; @@ -67,7 +72,7 @@ public final class Analysis { qnode_ext_nodes_names.add( name ); } } - final String greatest_common_prefix = ForesterUtil.greatestCommonPrefix( qnode_ext_nodes_names ); + final String greatest_common_prefix = ForesterUtil.greatestCommonPrefix( qnode_ext_nodes_names, separator ); final Result res = new Result(); if ( greatest_common_prefix.length() < 1 ) { res.addWarning( "No greatest common prefix" ); @@ -81,14 +86,24 @@ public final class Analysis { } res.setLeastEncompassingCladeSize( lec_ext_nodes ); res.setTreeSize( p_ext_nodes ); - final String greatest_common_prefix_a = analyzeSiblings( qnode_p, qnode_pp ); + if ( qnode_pp.getBranchData().getConfidences() != null + && qnode_pp.getBranchData().getConfidences().size() > 0 ) { + final Confidence conf = qnode_pp.getBranchData().getConfidence( 0 ); + if ( conf != null ) { + res.setGreatestCommonCladeConfidence( conf.getValue() + + ( ForesterUtil.isEmpty( conf.getType() ) ? 
"" : " [" + conf.getType() + "]" ) ); + } + } + final String greatest_common_prefix_a = analyzeSiblings( qnode_p, qnode_pp, separator ); res.setGreatestCommonPrefixUp( greatest_common_prefix_a ); - final String greatest_common_prefix_b = analyzeSiblings( qnode, qnode_p ); + final String greatest_common_prefix_b = analyzeSiblings( qnode, qnode_p, separator ); res.setGreatestCommonPrefixDown( greatest_common_prefix_b ); return res; } - private final static String analyzeSiblings( final PhylogenyNode child, final PhylogenyNode parent ) { + private final static String analyzeSiblings( final PhylogenyNode child, + final PhylogenyNode parent, + final String separator ) { final int child_index = child.getChildNodeIndex(); final List ext_nodes_names = new ArrayList<>(); final List descs = parent.getDescendants(); @@ -104,7 +119,7 @@ public final class Analysis { } } } - final String greatest_common_prefix = ForesterUtil.greatestCommonPrefix( ext_nodes_names ); + final String greatest_common_prefix = ForesterUtil.greatestCommonPrefix( ext_nodes_names, separator ); return greatest_common_prefix; } } diff --git a/forester/java/src/org/forester/clade_analysis/Result.java b/forester/java/src/org/forester/clade_analysis/Result.java index 1f3af94..e7686e1 100644 --- a/forester/java/src/org/forester/clade_analysis/Result.java +++ b/forester/java/src/org/forester/clade_analysis/Result.java @@ -30,12 +30,15 @@ import java.util.List; public final class Result { - private String _greatest_common_prefix = ""; - private String _greatest_common_prefix_up = ""; - private String _greatest_common_prefix_down = ""; - private final List _warnings = new ArrayList<>(); - private int _lec_ext_nodes = 0; - private int _p_ext_nodes = 0; + private String _greatest_common_prefix = ""; + private String _greatest_common_prefix_up = ""; + private String _greatest_common_prefix_down = ""; + private final List _warnings = new ArrayList<>(); + private int _lec_ext_nodes = 0; + private int _p_ext_nodes = 
0; + private String _greatest_common_clade_confidence = ""; + private String _greatest_common_clade_confidence_up = ""; + private String _greatest_common_clade_confidence_down = ""; void addWarning( final String warning ) { _warnings.add( warning ); @@ -53,6 +56,18 @@ public final class Result { _greatest_common_prefix_down = greatest_common_prefix_down; } + void setGreatestCommonCladeConfidence( final String greatest_common_clade_confidence ) { + _greatest_common_clade_confidence = greatest_common_clade_confidence; + } + + void setGreatestCommonCladeUpConfidence( final String greatest_common_clade_confidence_up ) { + _greatest_common_clade_confidence_up = greatest_common_clade_confidence_up; + } + + void setGreatestCommonCladeDownConfidence( final String greatest_common_clade_confidence_down ) { + _greatest_common_clade_confidence_down = greatest_common_clade_confidence_down; + } + public String getGreatestCommonPrefix() { return _greatest_common_prefix; } @@ -65,6 +80,18 @@ public final class Result { return _greatest_common_prefix_down; } + public String getGreatestCommonCladeConfidence() { + return _greatest_common_clade_confidence; + } + + public String getGreatestCommonCladeUpConfidence() { + return _greatest_common_clade_confidence_up; + } + + public String getGreatestCommonCladeDownConfidence() { + return _greatest_common_clade_confidence_down; + } + public List getWarnings() { return _warnings; } diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index 0a070c0..1a8a216 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -44,6 +44,7 @@ import org.forester.application.support_transfer; import org.forester.archaeopteryx.AptxUtil; import org.forester.archaeopteryx.TreePanelUtil; import org.forester.archaeopteryx.webservices.WebserviceUtil; +import org.forester.clade_analysis.CladeAnalysisTest; import org.forester.development.DevelopmentTools; 
import org.forester.evoinference.TestPhylogenyReconstruction; import org.forester.evoinference.matrix.character.CharacterStateMatrix; @@ -229,6 +230,17 @@ public final class Test { } System.out.println( "OK." ); + System.out.print( "Common prefix sep: " ); + if ( !testCommonPrefixSep() ) { + System.out.println( "failed." ); + failed++; + } + else { + succeeded++; + } + System.out.println( "OK." ); + + System.out.print( "Sequence writer: " ); if ( testSequenceWriter() ) { System.out.println( "OK." ); @@ -787,6 +799,15 @@ public final class Test { System.out.println( "failed." ); failed++; } + System.out.print( "Clade analysis: " ); + if ( CladeAnalysisTest.test() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } System.out.print( "Phylogeny reconstruction:" ); System.out.println(); if ( TestPhylogenyReconstruction.test( new File( PATH_TO_TEST_DATA ) ) ) { @@ -1925,6 +1946,91 @@ public final class Test { } return true; } + + private static boolean testCommonPrefixSep() { + final List l0 = new ArrayList(); + l0.add( "a.b.c" ); + if ( !ForesterUtil.greatestCommonPrefix( l0, ".").equals( "a.b.c" ) ) { + return false; + } + + final List l1 = new ArrayList(); + l1.add( "a.b.c" ); + l1.add( "a.b.X" ); + if ( !ForesterUtil.greatestCommonPrefix( l1 , ".").equals( "a.b" ) ) { + return false; + } + + final List l2 = new ArrayList(); + l2.add( "a.b.c." ); + l2.add( "a.b.X." ); + l2.add( "a.x.y." 
); + if ( !ForesterUtil.greatestCommonPrefix( l2, ".").equals( "a" ) ) { + return false; + } + + final List l3 = new ArrayList(); + l3.add( "a/b/X/s/d/f/s/d/f/s/d/f/s/d/f/s/d/f/s/d/" ); + l3.add( "a/b/X/s/d/f/s/d/f/s/d/f/s/d/f/s/d/f/s/d" ); + l3.add( "a/b/c" ); + l3.add( "a/b/X/s/d/f/s/d/f/s/d/f/s/d/f/s/d/f/s/d/" ); + l3.add( "a/b/" ); + l3.add( "a/b/c/" ); + l3.add( "a/b////////" ); + if ( !ForesterUtil.greatestCommonPrefix( l3, "/" ).equals( "a/b" ) ) { + return false; + } + + final List l4 = new ArrayList(); + l4.add( "a.b.X.s.d.f.s.d.f.s.d.f.s.d.f.s.d.f.s.d" ); + l4.add( "a.b.X.s.d.f.s.d.f.s.d.f.s.d.f.s.d.f.s.d" ); + l4.add( "a.b.c" ); + l4.add( "X.s.d.f.s.d.f.s.d.f.s.d.f.s.d.f.s.d..." ); + l4.add( "a.b" ); + l4.add( "a.b.c" ); + if ( !ForesterUtil.greatestCommonPrefix( l4, "." ).equals( "" ) ) { + return false; + } + + final List l5 = new ArrayList(); + l5.add( "" ); + if ( !ForesterUtil.greatestCommonPrefix( l5, "_" ).equals( "" ) ) { + return false; + } + + final List l6 = new ArrayList(); + l6.add( "_" ); + l6.add( "__" ); + if ( !ForesterUtil.greatestCommonPrefix( l6, "_" ).equals( "" ) ) { + return false; + } + + final List l7 = new ArrayList(); + l7.add( "a,b,c" ); + l7.add( "a,b,X" ); + l7.add( "" ); + l7.add( ",,,,,,,,,," ); + if ( !ForesterUtil.greatestCommonPrefix( l7, "," ).equals( "" ) ) { + return false; + } + + final List l8 = new ArrayList(); + l8.add( "123.304.403.04" ); + l8.add( "123.304.403.04.02" ); + l8.add( "123.304.403.03.03" ); + if ( !ForesterUtil.greatestCommonPrefix( l8, "." ).equals( "123.304.403" ) ) { + return false; + } + + final List l9 = new ArrayList(); + l9.add( "123.304.403.04" ); + l9.add( "123.304.403.04.02" ); + l9.add( "123.304.402.03.03" ); + if ( !ForesterUtil.greatestCommonPrefix( l9, "." 
).equals( "123.304" ) ) { + return false; + } + return true; + } private static boolean testUTF8ParsingFromFile() { diff --git a/forester/java/src/org/forester/util/ForesterUtil.java b/forester/java/src/org/forester/util/ForesterUtil.java index 3ba3887..971032d 100644 --- a/forester/java/src/org/forester/util/ForesterUtil.java +++ b/forester/java/src/org/forester/util/ForesterUtil.java @@ -593,12 +593,12 @@ public final class ForesterUtil { final public static boolean isEmpty( final String s ) { return ( ( s == null ) || ( s.length() < 1 ) ); } - + final public static boolean isEmptyTrimmed( final String s ) { - if ( s == null ) { - return true; - } - return ( ( s.trim().length() < 1 ) ); + if ( s == null ) { + return true; + } + return ( ( s.trim().length() < 1 ) ); } /** @@ -1589,12 +1589,49 @@ public final class ForesterUtil { return a.substring( 0, min_length ); } + public final static String greatestCommonPrefix( final String a, final String b, final String separator ) { + if ( ForesterUtil.isEmpty( separator ) ) { + throw new IllegalArgumentException( "separator must not be null or empty" ); + } + final String[] as = a.split( Pattern.quote( separator ) ); + final String[] bs = b.split( Pattern.quote( separator ) ); + final int min_length = Math.min( as.length, bs.length ); + for( int i = 0; i < min_length; ++i ) { + if ( !( as[ i ].equals( bs[ i ] ) ) ) { + StringBuilder sb = new StringBuilder(); + boolean first = true; + for( int j = 0; j < i; ++j ) { + if ( first ) { + first = false; + } + else { + sb.append( separator ); + } + sb.append( as[ j ] ); + } + return sb.toString(); + } + } + StringBuilder sb = new StringBuilder(); + boolean first = true; + for( int j = 0; j < min_length; ++j ) { + if ( first ) { + first = false; + } + else { + sb.append( separator ); + } + sb.append( as[ j ] ); + } + return sb.toString(); + } + public final static String greatestCommonPrefix( final List strings ) { if ( strings == null ) { - throw new IllegalArgumentException( 
"list is null" ); + throw new IllegalArgumentException( "list of strings is null" ); } if ( strings.isEmpty() ) { - throw new IllegalArgumentException( "list is empty" ); + throw new IllegalArgumentException( "list of strings is empty" ); } String common = strings.get( 0 ); for( int i = 1; i < strings.size(); ++i ) { @@ -1603,6 +1640,23 @@ public final class ForesterUtil { return common; } + public final static String greatestCommonPrefix( final List strings, final String separator ) { + if ( ForesterUtil.isEmpty( separator ) ) { + return greatestCommonPrefix( strings ); + } + if ( strings == null ) { + throw new IllegalArgumentException( "list of strings is null" ); + } + if ( strings.isEmpty() ) { + throw new IllegalArgumentException( "list of strings is empty" ); + } + String common = strings.get( 0 ); + for( int i = 1; i < strings.size(); ++i ) { + common = greatestCommonPrefix( common, strings.get( i ), separator ); + } + return common; + } + private ForesterUtil() { } } -- 1.7.10.2