in progress...
[jalview.git] / forester / java / src / org / forester / application / cladinator.java
index d1b4f44..6dc17e3 100644 (file)
@@ -31,7 +31,6 @@ import java.text.DecimalFormat;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.SortedMap;
-import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;
 
@@ -41,34 +40,40 @@ import org.forester.clade_analysis.ResultMulti;
 import org.forester.io.parsers.PhylogenyParser;
 import org.forester.io.parsers.util.ParserUtils;
 import org.forester.phylogeny.Phylogeny;
-import org.forester.phylogeny.PhylogenyNode;
 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
 import org.forester.phylogeny.factories.PhylogenyFactory;
-import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
 import org.forester.util.BasicTable;
 import org.forester.util.BasicTableParser;
 import org.forester.util.CommandLineArguments;
 import org.forester.util.EasyWriter;
 import org.forester.util.ForesterUtil;
+import org.forester.util.UserException;
 
 public final class cladinator {
 
-    final static private String        PRG_NAME                 = "cladinator";
-    final static private String        PRG_VERSION              = "1.01";
-    final static private String        PRG_DATE                 = "170906";
-    final static private String        PRG_DESC                 = "clades within clades of annotated labels -- analysis of pplacer-type outputs";
-    final static private String        E_MAIL                   = "phyloxml@gmail.com";
-    final static private String        WWW                      = "https://sites.google.com/site/cmzmasek/home/software/forester";
-    final static private String        HELP_OPTION_1            = "help";
-    final static private String        HELP_OPTION_2            = "h";
-    final static private String        SEP_OPTION               = "s";
-    final static private String        QUERY_PATTERN_OPTION     = "q";
-    final static private String        SPECIFICS_CUTOFF_OPTION  = "c";
-    final static private String        MAPPING_FILE_OPTION      = "m";
-    final static private double        SPECIFICS_CUTOFF_DEFAULT = 0.8;
-    final static private String        SEP_DEFAULT              = ".";
-    final static private Pattern       QUERY_PATTERN_DEFAULT    = AnalysisMulti.DEFAULT_QUERY_PATTERN_FOR_PPLACER_TYPE;
-    private final static DecimalFormat df                       = new DecimalFormat( "0.0#######" );
+    final static private String        PRG_NAME                             = "cladinator";
+    final static private String        PRG_VERSION                          = "1.04";
+    final static private String        PRG_DATE                             = "170915";
+    final static private String        PRG_DESC                             = "clades within clades of annotated labels -- analysis of pplacer-type outputs";
+    final static private String        E_MAIL                               = "phyloxml@gmail.com";
+    final static private String        WWW                                  = "https://sites.google.com/site/cmzmasek/home/software/forester";
+    final static private String        HELP_OPTION_1                        = "help";
+    final static private String        HELP_OPTION_2                        = "h";
+    final static private String        SEP_OPTION                           = "s";
+    final static private String        QUERY_PATTERN_OPTION                 = "q";
+    final static private String        SPECIFICS_CUTOFF_OPTION              = "c";
+    final static private String        MAPPING_FILE_OPTION                  = "m";
+    final static private String        EXTRA_PROCESSING_OPTION1             = "x";
+    final static private String        EXTRA_PROCESSING1_SEP_OPTION         = "xs";
+    final static private String        EXTRA_PROCESSING1_KEEP_EXTRA_OPTION  = "xk";
+    final static private String        QUIET_OPTION                         = "Q";
+    final static private String        VERBOSE_OPTION                       = "v";
+    final static private double        SPECIFICS_CUTOFF_DEFAULT             = 0.8;
+    final static private String        SEP_DEFAULT                          = ".";
+    final static private Pattern       QUERY_PATTERN_DEFAULT                = AnalysisMulti.DEFAULT_QUERY_PATTERN_FOR_PPLACER_TYPE;
+    final static private String        EXTRA_PROCESSING1_SEP_DEFAULT        = "|";
+    final static private boolean       EXTRA_PROCESSING1_KEEP_EXTRA_DEFAULT = false;
+    private final static DecimalFormat df                                   = new DecimalFormat( "0.0###" );
 
     public static void main( final String args[] ) {
         try {
@@ -100,6 +105,11 @@ public final class cladinator {
             allowed_options.add( QUERY_PATTERN_OPTION );
             allowed_options.add( SPECIFICS_CUTOFF_OPTION );
             allowed_options.add( MAPPING_FILE_OPTION );
+            allowed_options.add( EXTRA_PROCESSING_OPTION1 );
+            allowed_options.add( EXTRA_PROCESSING1_SEP_OPTION );
+            allowed_options.add( EXTRA_PROCESSING1_KEEP_EXTRA_OPTION );
+            allowed_options.add( VERBOSE_OPTION );
+            allowed_options.add( QUIET_OPTION );
             final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
             if ( dissallowed_options.length() > 0 ) {
                 ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options );
@@ -186,6 +196,54 @@ public final class cladinator {
                 t = null;
                 map = null;
             }
+            final boolean extra_processing1;
+            if ( cla.isOptionSet( EXTRA_PROCESSING_OPTION1 ) ) {
+                extra_processing1 = true;
+            }
+            else {
+                extra_processing1 = false;
+            }
+            String extra_processing1_sep = EXTRA_PROCESSING1_SEP_DEFAULT;
+            if ( cla.isOptionSet( EXTRA_PROCESSING1_SEP_OPTION ) ) {
+                if ( !extra_processing1 ) {
+                    ForesterUtil.fatalError( PRG_NAME,
+                                             "extra processing is not enabled, cannot set -"
+                                                     + EXTRA_PROCESSING1_SEP_OPTION + " option" );
+                }
+                if ( cla.isOptionValueSet( EXTRA_PROCESSING1_SEP_OPTION ) ) {
+                    extra_processing1_sep = cla.getOptionValue( EXTRA_PROCESSING1_SEP_OPTION );
+                }
+                else {
+                    ForesterUtil.fatalError( PRG_NAME, "no value for extra processing separator" );
+                }
+            }
+            if ( ( extra_processing1_sep != null ) && extra_processing1_sep.equals( separator ) ) {
+                ForesterUtil.fatalError( PRG_NAME,
+                                         "extra processing separator must not be the same the annotation-separator" );
+            }
+            boolean extra_processing1_keep = EXTRA_PROCESSING1_KEEP_EXTRA_DEFAULT;
+            if ( cla.isOptionSet( EXTRA_PROCESSING1_KEEP_EXTRA_OPTION ) ) {
+                if ( !extra_processing1 ) {
+                    ForesterUtil.fatalError( PRG_NAME,
+                                             "extra processing is not enabled, cannot set -"
+                                                     + EXTRA_PROCESSING1_KEEP_EXTRA_OPTION + " option" );
+                }
+                extra_processing1_keep = true;
+            }
+            final boolean verbose;
+            if ( cla.isOptionSet( VERBOSE_OPTION ) ) {
+                verbose = true;
+            }
+            else {
+                verbose = false;
+            }
+            final boolean quit;
+            if ( cla.isOptionSet( QUIET_OPTION ) ) {
+                quit = true;
+            }
+            else {
+                quit = false;
+            }
             System.out.println( "Input tree                 : " + intreefile );
             System.out.println( "Specific-hit support cutoff: " + cutoff_specifics );
             if ( mapping_file != null ) {
@@ -194,30 +252,74 @@ public final class cladinator {
             }
             System.out.println( "Annotation-separator       : " + separator );
             System.out.println( "Query pattern              : " + pattern );
+            System.out.println( "Extra processing           : " + extra_processing1 );
+            if ( extra_processing1 ) {
+                System.out.println( "Extra processing separator : " + extra_processing1_sep );
+                System.out.println( "Keep extra annotations     : " + extra_processing1_keep );
+            }
             if ( outtablefile != null ) {
                 System.out.println( "Output table               : " + outtablefile );
             }
-            Phylogeny p = null;
+            Phylogeny phys[] = null;
             try {
                 final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
                 final PhylogenyParser pp = ParserUtils.createParserDependingOnFileType( intreefile, true );
-                p = factory.create( intreefile, pp )[ 0 ];
+                phys = factory.create( intreefile, pp );
             }
             catch ( final IOException e ) {
                 ForesterUtil.fatalError( PRG_NAME, "Could not read \"" + intreefile + "\" [" + e.getMessage() + "]" );
-                System.exit( -1 );
             }
-            System.out.println( "Ext. nodes in input tree   : " + p.getNumberOfExternalNodes() );
-            if ( map != null ) {
-                performMapping( pattern, map, p );
+            if ( phys.length == 0 ) {
+                ForesterUtil.fatalError( PRG_NAME, "\"" + intreefile + "\" does not contain any trees" );
             }
-            final ResultMulti res = AnalysisMulti.execute( p, pattern, separator, cutoff_specifics );
-            printResult( res );
+            System.out.println( "Number of input trees      : " + phys.length );
+            if ( phys.length == 1 ) {
+                System.out.println( "Ext. nodes in input tree   : " + phys[ 0 ].getNumberOfExternalNodes() );
+            }
+            else {
+                System.out.println( "Ext. nodes in input tree 1 : " + phys[ 0 ].getNumberOfExternalNodes() );
+            }
+            final EasyWriter outtable_writer;
             if ( outtablefile != null ) {
-                writeResultToTable( res, outtablefile );
+                outtable_writer = ForesterUtil.createEasyWriter( outtablefile );
+            }
+            else {
+                outtable_writer = null;
+            }
+            int counter = 0;
+            for( final Phylogeny phy : phys ) {
+                if ( map != null ) {
+                    AnalysisMulti.performMapping( pattern, map, phy, verbose );
+                }
+                if ( extra_processing1 ) {
+                    AnalysisMulti.performExtraProcessing1( pattern,
+                                                           phy,
+                                                           extra_processing1_sep,
+                                                           extra_processing1_keep,
+                                                           separator,
+                                                           verbose );
+                }
+                final ResultMulti res = AnalysisMulti.execute( phy, pattern, separator, cutoff_specifics );
+                if ( !quit ) {
+                    if ( phys.length == 1 ) {
+                        printResult( res, -1 );
+                    }
+                    else {
+                        printResult( res, counter );
+                    }
+                }
+                if ( outtable_writer != null ) {
+                    writeResultToTable( res, outtable_writer );
+                    outtable_writer.flush();
+                }
+                ++counter;
+            }
+            if ( outtable_writer != null ) {
+                outtable_writer.flush();
+                outtable_writer.close();
             }
         }
-        catch ( final IllegalArgumentException e ) {
+        catch ( final UserException e ) {
             ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
         }
         catch ( final IOException e ) {
@@ -229,136 +331,154 @@ public final class cladinator {
         }
     }
 
-    private final static void performMapping( final Pattern pattern,
-                                              final SortedMap<String, String> map,
-                                              Phylogeny p ) {
-        final PhylogenyNodeIterator it = p.iteratorExternalForward();
-        while ( it.hasNext() ) {
-            final PhylogenyNode node = it.next();
-            final String name = node.getName();
-            if ( ForesterUtil.isEmpty( name ) ) {
-                ForesterUtil.fatalError( PRG_NAME, "external node with empty name found" );
-            }
-            final Matcher m = pattern.matcher( name );
-            if ( !m.find() ) {
-                if ( !map.containsKey( name ) ) {
-                    ForesterUtil.fatalError( PRG_NAME, "no mapping for \"" + name + "\" found" );
-                }
-                node.setName( map.get( name ) );
-            }
-        }
-    }
-
-    private final static void printResult( final ResultMulti res ) {
-        System.out.println();
-        System.out.println( "Result:" );
+    /**
+     * Prints the analysis result for one input tree to standard output.
+     * <p>
+     * Output starts with an empty line and a "Result for &lt;query name prefix&gt;"
+     * heading. If there are no multi-hit prefixes a single "No match" line
+     * follows; otherwise the matching clades, the specific hits (if any), the
+     * down-tree and up-tree bracketing clades (if any) and the total number of
+     * matches are listed, each section indented by one space.
+     *
+     * @param res
+     *            the result to print
+     * @param counter
+     *            the tree number shown as " [tree &lt;counter&gt;]" in the heading,
+     *            or -1 to omit the tree label (main passes -1 when the input
+     *            file contains exactly one tree)
+     */
+    private final static void printResult( final ResultMulti res, final int counter ) {
         System.out.println();
-        if ( ( res.getAllMultiHitPrefixes() == null ) | ( res.getAllMultiHitPrefixes().size() < 1 ) ) {
-            System.out.println( "No match to query pattern!" );
+        if ( counter == -1 ) {
+            System.out.println( "Result for " + res.getQueryNamePrefix() );
+        }
+        else {
+            System.out.println( "Result for " + res.getQueryNamePrefix() + " [tree " + counter + "]" );
+        }
+        // "||" (not "|") so that size() is never called on a null list.
+        if ( ( res.getAllMultiHitPrefixes() == null ) || ( res.getAllMultiHitPrefixes().size() < 1 ) ) {
+            System.out.println( " No match to query pattern!" );
         }
         else {
-            System.out.println( "Matching Clade(s):" );
+            System.out.println( " Matching Clade(s):" );
             for( final Prefix prefix : res.getCollapsedMultiHitPrefixes() ) {
-                System.out.println( prefix );
+                System.out.println( " " + prefix );
             }
             if ( res.isHasSpecificMultiHitsPrefixes() ) {
                 System.out.println();
-                System.out.println( "Specific-hit(s):" );
+                System.out.println( " Specific-hit(s):" );
                 for( final Prefix prefix : res.getSpecificMultiHitPrefixes() ) {
-                    System.out.println( prefix );
+                    System.out.println( " " + prefix );
                 }
                 System.out.println();
-                System.out.println( "Matching Clade(s) with Specific-hit(s):" );
+                System.out.println( " Matching Clade(s) with Specific-hit(s):" );
                 for( final Prefix prefix : res.getCollapsedMultiHitPrefixes() ) {
-                    System.out.println( prefix );
+                    System.out.println( " " + prefix );
                     for( final Prefix spec : res.getSpecificMultiHitPrefixes() ) {
                         if ( spec.getPrefix().startsWith( prefix.getPrefix() ) ) {
-                            System.out.println( "    " + spec );
+                            System.out.println( "     " + spec );
                         }
                     }
                 }
             }
             if ( !ForesterUtil.isEmpty( res.getAllMultiHitPrefixesDown() ) ) {
                 System.out.println();
-                System.out.println( "Matching Down-tree Bracketing Clade(s):" );
+                System.out.println( " Matching Down-tree Bracketing Clade(s):" );
                 for( final Prefix prefix : res.getCollapsedMultiHitPrefixesDown() ) {
-                    System.out.println( prefix );
+                    System.out.println( " " + prefix );
                 }
             }
             if ( !ForesterUtil.isEmpty( res.getAllMultiHitPrefixesUp() ) ) {
                 System.out.println();
-                System.out.println( "Matching Up-tree Bracketing Clade(s):" );
+                System.out.println( " Matching Up-tree Bracketing Clade(s):" );
                 for( final Prefix prefix : res.getCollapsedMultiHitPrefixesUp() ) {
-                    System.out.println( prefix );
+                    System.out.println( " " + prefix );
                 }
             }
+            System.out.println();
+            System.out.println( " Total Number of Matches: " + res.getNumberOfMatches() + "/"
+                    + res.getReferenceTreeNumberOfExternalNodes() );
         }
         System.out.println();
     }
 
-    private final static void writeResultToTable( final ResultMulti res, final File outtablefile ) throws IOException {
-        final EasyWriter w = ForesterUtil.createEasyWriter( outtablefile );
-        if ( ( res.getAllMultiHitPrefixes() == null ) | ( res.getAllMultiHitPrefixes().size() < 1 ) ) {
+    /**
+     * Appends the analysis result for one input tree to an already open table
+     * writer as tab-separated rows.
+     * <p>
+     * If there are no multi-hit prefixes, a single row consisting of the query
+     * name prefix and "No match to query pattern!" is written. Otherwise one
+     * row is written per prefix with the columns: query name prefix, category
+     * ("Matching Clades", "Specific-hits", "Matching Down-tree Bracketing
+     * Clades" or "Matching Up-tree Bracketing Clades"), prefix, formatted
+     * confidence, number of matches, and the value of
+     * res.getReferenceTreeNumberOfExternalNodes().
+     * <p>
+     * The writer is neither flushed nor closed here; the caller does that, so
+     * the same writer can receive the rows of several trees.
+     *
+     * @param res
+     *            the result to write
+     * @param w
+     *            the writer the rows are appended to
+     * @throws IOException
+     *             propagated from the writer
+     */
+    private final static void writeResultToTable( final ResultMulti res, final EasyWriter w ) throws IOException {
+        // "||" (not "|") so that size() is never called on a null list.
+        if ( ( res.getAllMultiHitPrefixes() == null ) || ( res.getAllMultiHitPrefixes().size() < 1 ) ) {
+            w.print( res.getQueryNamePrefix() );
+            w.print( "\t" );
             w.println( "No match to query pattern!" );
         }
         else {
             for( final Prefix prefix : res.getCollapsedMultiHitPrefixes() ) {
+                w.print( res.getQueryNamePrefix() );
+                w.print( "\t" );
                 w.print( "Matching Clades" );
                 w.print( "\t" );
                 w.print( prefix.getPrefix() );
                 w.print( "\t" );
                 w.print( df.format( prefix.getConfidence() ) );
+                w.print( "\t" );
+                w.print( String.valueOf( res.getNumberOfMatches() ) );
+                w.print( "\t" );
+                w.print( String.valueOf( res.getReferenceTreeNumberOfExternalNodes() ) );
                 w.println();
             }
             if ( res.isHasSpecificMultiHitsPrefixes() ) {
                 for( final Prefix prefix : res.getSpecificMultiHitPrefixes() ) {
+                    w.print( res.getQueryNamePrefix() );
+                    w.print( "\t" );
                     w.print( "Specific-hits" );
                     w.print( "\t" );
                     w.print( prefix.getPrefix() );
                     w.print( "\t" );
                     w.print( df.format( prefix.getConfidence() ) );
+                    w.print( "\t" );
+                    w.print( String.valueOf( res.getNumberOfMatches() ) );
+                    w.print( "\t" );
+                    w.print( String.valueOf( res.getReferenceTreeNumberOfExternalNodes() ) );
                     w.println();
                 }
             }
             if ( !ForesterUtil.isEmpty( res.getAllMultiHitPrefixesDown() ) ) {
                 for( final Prefix prefix : res.getCollapsedMultiHitPrefixesDown() ) {
+                    w.print( res.getQueryNamePrefix() );
+                    w.print( "\t" );
                     w.print( "Matching Down-tree Bracketing Clades" );
                     w.print( "\t" );
                     w.print( prefix.getPrefix() );
                     w.print( "\t" );
                     w.print( df.format( prefix.getConfidence() ) );
+                    w.print( "\t" );
+                    w.print( String.valueOf( res.getNumberOfMatches() ) );
+                    w.print( "\t" );
+                    w.print( String.valueOf( res.getReferenceTreeNumberOfExternalNodes() ) );
                     w.println();
                 }
             }
             if ( !ForesterUtil.isEmpty( res.getAllMultiHitPrefixesUp() ) ) {
                 for( final Prefix prefix : res.getCollapsedMultiHitPrefixesUp() ) {
+                    w.print( res.getQueryNamePrefix() );
+                    w.print( "\t" );
                     w.print( "Matching Up-tree Bracketing Clades" );
                     w.print( "\t" );
                     w.print( prefix.getPrefix() );
                     w.print( "\t" );
                     w.print( df.format( prefix.getConfidence() ) );
+                    w.print( "\t" );
+                    w.print( String.valueOf( res.getNumberOfMatches() ) );
+                    w.print( "\t" );
+                    w.print( String.valueOf( res.getReferenceTreeNumberOfExternalNodes() ) );
                     w.println();
                 }
             }
         }
-        w.flush();
-        w.close();
     }
 
     private final static void print_help() {
         System.out.println( "Usage:" );
         System.out.println();
-        System.out.println( PRG_NAME + " [options] <input tree file> [output table file]" );
+        System.out.println( PRG_NAME + " [options] <input tree(s) file> [output table file]" );
         System.out.println();
         System.out.println( " options:" );
         System.out.println( "  -" + SPECIFICS_CUTOFF_OPTION
-                + "=<double>: the cutoff for \"specific-hit\" support values (default: " + SPECIFICS_CUTOFF_DEFAULT
-                + ")" );
-        System.out.println( "  -" + SEP_OPTION + "=<separator>: the annotation-separator to be used (default: "
+                + "=<double>        : the cutoff for \"specific-hit\" support values (default: "
+                + SPECIFICS_CUTOFF_DEFAULT + ")" );
+        System.out.println( "  -" + SEP_OPTION + "=<separator>     : the annotation-separator to be used (default: "
                 + SEP_DEFAULT + ")" );
         System.out.println( "  -" + MAPPING_FILE_OPTION
-                + "=<mapping table>: to map node names to appropriate annotations (tab-separated, two columns) (default: no mapping)" );
-        System.out.println( "  -" + QUERY_PATTERN_OPTION
+                + "=<mapping table> : to map node names to appropriate annotations (tab-separated, two columns) (default: no mapping)" );
+        System.out.println( "  -" + EXTRA_PROCESSING_OPTION1
+                + "                 : to enable extra processing of annotations (e.g. \"Q16611|A.1.1\" becomes \"A.1.1\")" );
+        System.out.println( "  -" + EXTRA_PROCESSING1_SEP_OPTION
+                + "=<separator>    : the separator for extra annotations (default: \"" + EXTRA_PROCESSING1_SEP_DEFAULT
+                + "\")" );
+        System.out.println( "  -" + EXTRA_PROCESSING1_KEEP_EXTRA_OPTION
+                + "                : to keep extra annotations (e.g. \"Q16611|A.1.1\" becomes \"A.1.1.Q16611\")" );
+        System.out.println( "  -" + VERBOSE_OPTION + "                 : verbose" );
+        System.out.println( "  --" + QUERY_PATTERN_OPTION
                 + "=<query pattern>: the regular expression for the query (default: \"" + QUERY_PATTERN_DEFAULT
                 + "\" for pplacer output)" );
         System.out.println();
@@ -367,6 +487,9 @@ public final class cladinator {
         System.out.println( " " + PRG_NAME + " my_tree.nh result.tsv" );
         System.out.println( " " + PRG_NAME + " -c=0.5 -s=. my_tree.nh result.tsv" );
         System.out.println( " " + PRG_NAME + " -c=0.9 -s=_ -m=map.tsv my_tree.nh result.tsv" );
+        System.out.println( " " + PRG_NAME + " -x -xs=& -xk my_tree.nh result.tsv" );
+        System.out.println( " " + PRG_NAME + " -x -xs=\"|\" my_tree.nh result.tsv" );
+        System.out.println( " " + PRG_NAME + " -x -xk -m=map.tsv pplacer_out_trees.sing.tre result.tsv" );
         System.out.println();
     }
 }