in progress...
[jalview.git] / forester / java / src / org / forester / application / decorator.java
index acf8c50..9b75dae 100644 (file)
 package org.forester.application;
 
 import java.io.File;
+import java.io.FileInputStream;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
 
+import org.forester.io.parsers.FastaParser;
 import org.forester.io.parsers.PhylogenyParser;
 import org.forester.io.parsers.util.ParserUtils;
 import org.forester.io.writers.PhylogenyWriter;
 import org.forester.phylogeny.Phylogeny;
+import org.forester.phylogeny.PhylogenyMethods;
+import org.forester.phylogeny.PhylogenyMethods.DESCENDANT_SORT_PRIORITY;
 import org.forester.phylogeny.data.Identifier;
 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
 import org.forester.phylogeny.factories.PhylogenyFactory;
+import org.forester.sequence.MolecularSequence;
 import org.forester.tools.PhylogenyDecorator;
 import org.forester.tools.PhylogenyDecorator.FIELD;
 import org.forester.util.BasicTable;
@@ -48,6 +56,7 @@ import org.forester.util.ForesterUtil;
 public final class decorator {
 
     private static final String SEQUENCE_NAME_FIELD                     = "s";
+    private static final String MOL_SEQ                                 = "m";
     private static final String TAXONOMY_CODE_FIELD                     = "c";
     private static final String TAXONOMY_SCIENTIFIC_NAME_FIELD          = "sn";
     private static final String DS_FILED                                = "d";
@@ -56,27 +65,28 @@ public final class decorator {
     final static private String PICKY_OPTION                            = "p";
     final static private String FIELD_OPTION                            = "f";
     final static private String TRIM_AFTER_TILDE_OPTION                 = "t";
+    final static private String VERBOSE_OPTION                          = "ve";
     final static private String TREE_NAME_OPTION                        = "pn";
     final static private String TREE_ID_OPTION                          = "pi";
     final static private String TREE_DESC_OPTION                        = "pd";
+    final static private String MIDPOINT_ROOT_OPTION                    = "mp";
+    final static private String ORDER_TREE_OPTION                       = "or";
     final static private String EXTRACT_BRACKETED_SCIENTIC_NAME_OPTION  = "sn";
     final static private String EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION = "tc";
-    final static private String PROCESS_NAME_INTELLIGENTLY_OPTION       = "x";
-    final static private String PROCESS_SIMILAR_TO_OPTION               = "xs";
     final static private String CUT_NAME_AFTER_FIRST_SPACE_OPTION       = "c";
-    final static private String ALLOW_REMOVAL_OF_CHARS_OPTION           = "r";
     final static private String ADVANCED_TABLE_OPTION                   = "table";
     final static private String KEY_COLUMN                              = "k";
     final static private String VALUE_COLUMN                            = "v";
     final static private String MAPPING_FILE_SEPARATOR_OPTION           = "s";
     final static private char   MAPPING_FILE_SEPARATOR_DEFAULT          = '\t';
     final static private String PRG_NAME                                = "decorator";
-    final static private String PRG_VERSION                             = "1.13";
-    final static private String PRG_DATE                                = "2013.01.19";
+    final static private String PRG_VERSION                             = "1.16";
+    final static private String PRG_DATE                                = "131113";
 
     public static void main( final String args[] ) {
         ForesterUtil.printProgramInformation( decorator.PRG_NAME, decorator.PRG_VERSION, decorator.PRG_DATE );
-        if ( ( args.length < 4 ) || ( args.length > 12 ) ) {
+        System.out.println();
+        if ( ( args.length < 4 ) || ( args.length > 13 ) ) {
             decorator.argumentsError();
         }
         CommandLineArguments cla = null;
@@ -95,14 +105,19 @@ public final class decorator {
         if ( phylogenies_outfile.exists() ) {
             ForesterUtil.fatalError( PRG_NAME, "[" + phylogenies_outfile + "] already exists" );
         }
+        String err = ForesterUtil.isReadableFile( phylogenies_infile );
+        if ( !ForesterUtil.isEmpty( err ) ) {
+            ForesterUtil.fatalError( PRG_NAME, err );
+        }
+        err = ForesterUtil.isReadableFile( mapping_infile );
+        if ( !ForesterUtil.isEmpty( err ) ) {
+            ForesterUtil.fatalError( PRG_NAME, err );
+        }
         final List<String> allowed_options = new ArrayList<String>();
         allowed_options.add( decorator.ADVANCED_TABLE_OPTION );
         allowed_options.add( decorator.PICKY_OPTION );
         allowed_options.add( decorator.FIELD_OPTION );
-        allowed_options.add( decorator.PROCESS_NAME_INTELLIGENTLY_OPTION );
-        allowed_options.add( decorator.PROCESS_SIMILAR_TO_OPTION );
         allowed_options.add( decorator.CUT_NAME_AFTER_FIRST_SPACE_OPTION );
-        allowed_options.add( decorator.ALLOW_REMOVAL_OF_CHARS_OPTION );
         allowed_options.add( decorator.KEY_COLUMN );
         allowed_options.add( decorator.VALUE_COLUMN );
         allowed_options.add( decorator.MAPPING_FILE_SEPARATOR_OPTION );
@@ -112,6 +127,9 @@ public final class decorator {
         allowed_options.add( decorator.TREE_ID_OPTION );
         allowed_options.add( decorator.TREE_DESC_OPTION );
         allowed_options.add( decorator.TRIM_AFTER_TILDE_OPTION );
+        allowed_options.add( decorator.ORDER_TREE_OPTION );
+        allowed_options.add( decorator.MIDPOINT_ROOT_OPTION );
+        allowed_options.add( decorator.VERBOSE_OPTION );
         final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
         if ( dissallowed_options.length() > 0 ) {
             ForesterUtil.fatalError( decorator.PRG_NAME, "unknown option(s): " + dissallowed_options );
@@ -137,13 +155,13 @@ public final class decorator {
         int value_column = 1;
         String field_str = "";
         FIELD field = FIELD.NODE_NAME;
-        int numbers_of_chars_allowed_to_remove_if_not_found_in_map = -1;
         boolean cut_name_after_space = false;
-        boolean process_name_intelligently = false;
-        boolean process_similar_to = false;
         boolean extract_bracketed_scientific_name = false;
         boolean extract_bracketed_tax_code = false;
         boolean trim_after_tilde = false;
+        boolean order_tree = false;
+        boolean midpoint_root = false;
+        boolean verbose = false;
         String tree_name = "";
         String tree_id = "";
         String tree_desc = "";
@@ -187,27 +205,20 @@ public final class decorator {
                 }
                 cut_name_after_space = true;
             }
-            if ( cla.isOptionSet( decorator.PROCESS_NAME_INTELLIGENTLY_OPTION ) ) {
-                if ( advanced_table ) {
-                    argumentsError();
-                }
-                process_name_intelligently = true;
-            }
-            if ( cla.isOptionSet( decorator.PROCESS_SIMILAR_TO_OPTION ) ) {
-                if ( advanced_table ) {
-                    argumentsError();
-                }
-                process_similar_to = true;
-            }
             if ( cla.isOptionSet( decorator.TRIM_AFTER_TILDE_OPTION ) ) {
                 if ( advanced_table ) {
                     argumentsError();
                 }
                 trim_after_tilde = true;
             }
-            if ( cla.isOptionSet( decorator.ALLOW_REMOVAL_OF_CHARS_OPTION ) ) {
-                numbers_of_chars_allowed_to_remove_if_not_found_in_map = cla
-                        .getOptionValueAsInt( decorator.ALLOW_REMOVAL_OF_CHARS_OPTION );
+            if ( cla.isOptionSet( decorator.MIDPOINT_ROOT_OPTION ) ) {
+                midpoint_root = true;
+            }
+            if ( cla.isOptionSet( decorator.ORDER_TREE_OPTION ) ) {
+                order_tree = true;
+            }
+            if ( cla.isOptionSet( decorator.VERBOSE_OPTION ) ) {
+                verbose = true;
             }
             if ( cla.isOptionSet( decorator.FIELD_OPTION ) ) {
                 field_str = cla.getOptionValue( decorator.FIELD_OPTION );
@@ -228,6 +239,9 @@ public final class decorator {
                 else if ( field_str.equals( SEQUENCE_NAME_FIELD ) ) {
                     field = FIELD.SEQUENCE_NAME;
                 }
+                else if ( field_str.equals( MOL_SEQ ) ) {
+                    field = FIELD.MOL_SEQ;
+                }
                 else if ( field_str.equals( TAXONOMY_SCIENTIFIC_NAME_FIELD ) ) {
                     field = FIELD.TAXONOMY_SCIENTIFIC_NAME;
                     extract_bracketed_scientific_name = false;
@@ -235,34 +249,20 @@ public final class decorator {
                 }
                 else {
                     ForesterUtil.fatalError( decorator.PRG_NAME, "unknown value for \"" + decorator.FIELD_OPTION
-                            + "\" option: \"" + field_str + "\"" );
+                                             + "\" option: \"" + field_str + "\"" );
                 }
             }
         }
         catch ( final Exception e ) {
             ForesterUtil.fatalError( decorator.PRG_NAME, "error in command line: " + e.getMessage() );
         }
-        if ( ( field != FIELD.NODE_NAME ) && ( cut_name_after_space || process_name_intelligently ) ) {
-            ForesterUtil.fatalError( decorator.PRG_NAME, "attempt to use -x or -c option without -f=n" );
-        }
-        if ( ( field != FIELD.NODE_NAME ) && process_similar_to ) {
-            ForesterUtil.fatalError( decorator.PRG_NAME, "attempt to use -" + decorator.PROCESS_SIMILAR_TO_OPTION
-                    + " option without -f=n" );
-        }
-        if ( cut_name_after_space && process_name_intelligently ) {
-            ForesterUtil.fatalError( decorator.PRG_NAME, "attempt to use -x and -c option together" );
-        }
-        if ( process_similar_to && process_name_intelligently ) {
-            ForesterUtil.fatalError( decorator.PRG_NAME, "attempt to use -" + decorator.PROCESS_SIMILAR_TO_OPTION
-                    + " and -x option together" );
-        }
-        if ( process_similar_to && cut_name_after_space ) {
-            ForesterUtil.fatalError( decorator.PRG_NAME, "attempt to use -" + decorator.PROCESS_SIMILAR_TO_OPTION
-                    + " and -c option together" );
-        }
         if ( extract_bracketed_scientific_name && extract_bracketed_tax_code ) {
             argumentsError();
         }
+        ForesterUtil.programMessage( PRG_NAME, "input tree(s) : " + phylogenies_infile );
+        ForesterUtil.programMessage( PRG_NAME, "map           : " + mapping_infile );
+        ForesterUtil.programMessage( PRG_NAME, "output tree(s): " + phylogenies_outfile );
+        System.out.println();
         Phylogeny[] phylogenies = null;
         try {
             final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
@@ -271,32 +271,62 @@ public final class decorator {
         }
         catch ( final Exception e ) {
             ForesterUtil.fatalError( decorator.PRG_NAME, "failed to read phylgenies from [" + phylogenies_infile
-                    + "] [" + e.getMessage() + "]" );
+                                     + "] [" + e.getMessage() + "]" );
         }
         Map<String, String> map = null;
         if ( !advanced_table ) {
-            BasicTable<String> mapping_table = null;
-            try {
-                mapping_table = BasicTableParser.parse( mapping_infile, separator, true, false );
-            }
-            catch ( final Exception e ) {
-                ForesterUtil.fatalError( decorator.PRG_NAME,
-                                         "failed to read [" + mapping_infile + "] [" + e.getMessage() + "]" );
-            }
-            if ( ( key_column < 0 ) || ( key_column >= mapping_table.getNumberOfColumns() ) ) {
-                ForesterUtil.fatalError( decorator.PRG_NAME, "illegal value for key column" );
+            if ( field != FIELD.MOL_SEQ ) {
+                BasicTable<String> mapping_table = null;
+                try {
+                    mapping_table = BasicTableParser.parse( mapping_infile, separator, true, false );
+                }
+                catch ( final Exception e ) {
+                    ForesterUtil.fatalError( decorator.PRG_NAME,
+                                             "failed to read [" + mapping_infile + "] [" + e.getMessage() + "]" );
+                }
+                if ( ( key_column < 0 ) || ( key_column >= mapping_table.getNumberOfColumns() ) ) {
+                    ForesterUtil.fatalError( decorator.PRG_NAME, "illegal value for key column" );
+                }
+                if ( ( value_column < 0 ) || ( value_column >= mapping_table.getNumberOfColumns() ) ) {
+                    ForesterUtil.fatalError( decorator.PRG_NAME, "illegal value for value column" );
+                }
+                if ( mapping_table.isEmpty() || ( mapping_table.getNumberOfColumns() < 1 ) ) {
+                    ForesterUtil.fatalError( decorator.PRG_NAME, "mapping table is empty" );
+                }
+                if ( mapping_table.getNumberOfColumns() == 1 ) {
+                    ForesterUtil.fatalError( decorator.PRG_NAME, "mapping table has only one column" );
+                }
+                map = mapping_table.getColumnsAsMap( key_column, value_column );
+                final Iterator<Entry<String, String>> iter = map.entrySet().iterator();
+                if ( verbose ) {
+                    System.out.println();
+                }
+                while ( iter.hasNext() ) {
+                    final Entry<String, String> e = iter.next();
+                    if ( ForesterUtil.isEmpty( e.getKey() ) ) {
+                        ForesterUtil.fatalError( decorator.PRG_NAME, "mapping table contains empty key" );
+                    }
+                    if ( ForesterUtil.isEmpty( e.getValue() ) ) {
+                        ForesterUtil.fatalError( decorator.PRG_NAME, "mapping table contains empty value for key \"" + e.getKey() + "\"");
+                    }
+                    if ( verbose ) {
+                        System.out.println( e.getKey() + " => " + e.getValue() );
+                    }
+                }
+                if ( verbose ) {
+                    System.out.println();
+                }
             }
-            if ( ( value_column < 0 ) || ( value_column >= mapping_table.getNumberOfColumns() ) ) {
-                ForesterUtil.fatalError( decorator.PRG_NAME, "illegal value for value column" );
+            else {
+                map = readFastaFileIntoMap( mapping_infile, verbose );
             }
-            map = mapping_table.getColumnsAsMap( key_column, value_column );
         }
         if ( !ForesterUtil.isEmpty( tree_name ) || !ForesterUtil.isEmpty( tree_id )
                 || !ForesterUtil.isEmpty( tree_desc ) ) {
             if ( ( phylogenies.length > 1 )
                     && ( !ForesterUtil.isEmpty( tree_name ) || !ForesterUtil.isEmpty( tree_id ) ) ) {
                 ForesterUtil.fatalError( decorator.PRG_NAME,
-                                         "attempt to set same name or id on more than one phylogeny" );
+                        "attempt to set same name or id on more than one phylogeny" );
             }
             if ( !ForesterUtil.isEmpty( tree_name ) ) {
                 phylogenies[ 0 ].setName( tree_name );
@@ -321,23 +351,23 @@ public final class decorator {
                     ForesterUtil.fatalError( decorator.PRG_NAME,
                                              "failed to read \"" + mapping_infile + "\" [" + e.getMessage() + "]" );
                 }
-                PhylogenyDecorator.decorate( phylogenies,
-                                             table,
-                                             picky,
-                                             numbers_of_chars_allowed_to_remove_if_not_found_in_map );
+                for( final Phylogeny phylogenie : phylogenies ) {
+                    PhylogenyDecorator.decorate( phylogenie, table, picky );
+                }
             }
             else {
-                PhylogenyDecorator.decorate( phylogenies,
-                                             map,
-                                             field,
-                                             extract_bracketed_scientific_name,
-                                             extract_bracketed_tax_code,
-                                             picky,
-                                             cut_name_after_space,
-                                             process_name_intelligently,
-                                             process_similar_to,
-                                             numbers_of_chars_allowed_to_remove_if_not_found_in_map,
-                                             trim_after_tilde );
+                for( final Phylogeny phylogenie : phylogenies ) {
+                    final String msg = PhylogenyDecorator.decorate( phylogenie,
+                                                                    map,
+                                                                    field,
+                                                                    extract_bracketed_scientific_name,
+                                                                    extract_bracketed_tax_code,
+                                                                    picky,
+                                                                    cut_name_after_space,
+                                                                    trim_after_tilde,
+                                                                    verbose );
+                    ForesterUtil.programMessage( PRG_NAME, msg );
+                }
             }
         }
         catch ( final NullPointerException e ) {
@@ -346,6 +376,16 @@ public final class decorator {
         catch ( final Exception e ) {
             ForesterUtil.fatalError( decorator.PRG_NAME, e.getLocalizedMessage() );
         }
+        if ( midpoint_root || order_tree ) {
+            for( final Phylogeny phy : phylogenies ) {
+                if ( midpoint_root ) {
+                    PhylogenyMethods.midpointRoot( phy );
+                }
+                if ( order_tree ) {
+                    PhylogenyMethods.orderAppearance( phy.getRoot(), true, true, DESCENDANT_SORT_PRIORITY.TAXONOMY );
+                }
+            }
+        }
         try {
             final PhylogenyWriter w = new PhylogenyWriter();
             w.toPhyloXML( phylogenies, 0, phylogenies_outfile, ForesterUtil.getLineSeparator() );
@@ -358,16 +398,48 @@ public final class decorator {
         ForesterUtil.programMessage( PRG_NAME, "OK." );
     }
 
+    /**
+     * Reads a fasta-file and returns its sequences as a map from sequence
+     * identifier to molecular sequence string.
+     * <p>
+     * {@code ForesterUtil.fatalError} is called when the file cannot be read,
+     * when it yields no sequences, or when any sequence has an empty
+     * identifier, a duplicate identifier, or a length below one.
+     *
+     * @param mapping_infile the fasta-file to read
+     * @param verbose        if true, each "identifier => sequence" pair is
+     *                       printed to standard output as it is added
+     * @return a map from sequence identifier to sequence string
+     */
+    private static Map<String, String> readFastaFileIntoMap( final File mapping_infile, final boolean verbose ) {
+        List<MolecularSequence> seqs = null;
+        FileInputStream in = null;
+        try {
+            in = new FileInputStream( mapping_infile );
+            seqs = FastaParser.parse( in );
+        }
+        catch ( final IOException e ) {
+            ForesterUtil.fatalError( decorator.PRG_NAME, "failed to read fasta-file from [" + mapping_infile + "] ["
+                    + e.getMessage() + "]" );
+        }
+        finally {
+            // Always release the file handle, whether or not parsing succeeded.
+            // NOTE(review): FastaParser.parse may already close the stream on
+            // success -- closing a FileInputStream a second time is harmless.
+            if ( in != null ) {
+                try {
+                    in.close();
+                }
+                catch ( final IOException ignored ) {
+                    // Nothing useful can be done if closing a stream we only read from fails.
+                }
+            }
+        }
+        if ( ForesterUtil.isEmpty( seqs ) ) {
+            ForesterUtil.fatalError( decorator.PRG_NAME, "fasta-file [" + mapping_infile
+                                     + "] is devoid of fasta-formatted sequences" );
+        }
+        final Map<String, String> map = new HashMap<String, String>();
+        for( final MolecularSequence seq : seqs ) {
+            final String id = seq.getIdentifier();
+            if ( ForesterUtil.isEmpty( id ) ) {
+                ForesterUtil.fatalError( decorator.PRG_NAME, "fasta-file [" + mapping_infile
+                                         + "] contains sequence with empty identifier" );
+            }
+            // Identifiers are the map keys, so each one may occur only once.
+            if ( map.containsKey( id ) ) {
+                ForesterUtil.fatalError( decorator.PRG_NAME, "sequence identifier [" + id + "] is not unique" );
+            }
+            if ( seq.getLength() < 1 ) {
+                ForesterUtil.fatalError( decorator.PRG_NAME, "sequence [" + id + "] is empty" );
+            }
+            final String mol_seq = seq.getMolecularSequenceAsString();
+            map.put( id, mol_seq );
+            if ( verbose ) {
+                System.out.println( id + " => " + mol_seq );
+            }
+        }
+        return map;
+    }
+
+
     private static void argumentsError() {
         System.out.println();
         System.out.println( decorator.PRG_NAME + " -" + ADVANCED_TABLE_OPTION + " | -f=<c> <phylogenies infile> "
-                + "[mapping table file] <phylogenies outfile>" );
+                + "<mapping table file|fasta-file> <phylogenies outfile>" );
         System.out.println();
         System.out.println( "options:" );
         System.out.println();
         System.out.println( " -" + ADVANCED_TABLE_OPTION + " : table instead of one to one map (-f=<c>)" );
-        System.out.println( " -r=<n> : allow to remove up to n characters from the end of the names" );
-        System.out.println( "          in phylogenies infile if not found (in map) otherwise" );
         System.out.println( " -p     : picky, fails if node name not found in mapping table" );
         System.out.println( " -" + TREE_NAME_OPTION + "=<s>: name for the phylogeny" );
         System.out.println( " -" + TREE_ID_OPTION + "=<s>: identifier for the phylogeny (in the form provider:value)" );
@@ -378,28 +450,28 @@ public final class decorator {
         System.out.println();
         System.out.println( " -f=<c> : field to be replaced: " + NODE_NAME_FIELD + " : node name" );
         System.out.println( "                                " + SEQUENCE_ANNOTATION_DESC
-                + " : sequence annotation description" );
+                            + " : sequence annotation description" );
         System.out.println( "                                " + DS_FILED + " : domain structure" );
         System.out.println( "                                " + TAXONOMY_CODE_FIELD + " : taxonomy code" );
         System.out.println( "                                " + TAXONOMY_SCIENTIFIC_NAME_FIELD
-                + ": taxonomy scientific name" );
+                            + ": taxonomy scientific name" );
         System.out.println( "                                " + SEQUENCE_NAME_FIELD + " : sequence name" );
+        System.out.println( "                                " + MOL_SEQ + " : molecular sequence" );
         System.out.println( " -k=<n> : key column in mapping table (0 based)," );
         System.out.println( "          names of the node to be decorated - default is 0" );
         System.out.println( " -v=<n> : value column in mapping table (0 based)," );
         System.out.println( "          data which with to decorate - default is 1" );
         System.out.println( " -" + EXTRACT_BRACKETED_SCIENTIC_NAME_OPTION
-                + "    : to extract bracketed scientific names, e.g. [Nematostella vectensis]" );
+                            + "    : to extract bracketed scientific names, e.g. [Nematostella vectensis]" );
         System.out.println( " -" + EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION
-                + "    : to extract bracketed taxonomic codes, e.g. [NEMVE]" );
-        System.out.println( " -s=<c> : column separator in mapping file, default is \""
-                + decorator.MAPPING_FILE_SEPARATOR_DEFAULT + "\"" );
-        System.out.println( " -x     : process name \"intelligently\" (only for -f=n)" );
-        System.out.println( " -" + decorator.PROCESS_SIMILAR_TO_OPTION
-                + "    : process name \"intelligently\" and process information after \"similar to\" (only for -f=n)" );
+                            + "    : to extract bracketed taxonomic codes, e.g. [NEMVE]" );
+        System.out.println( " -s=<c> : column separator in mapping file, default is tab" );
         System.out.println( " -c     : cut name after first space (only for -f=n)" );
         System.out.println( " -" + decorator.TRIM_AFTER_TILDE_OPTION
-                + "     : trim node name to be replaced after tilde" );
+                            + "     : trim node name to be replaced after tilde" );
+        System.out.println( " -" + decorator.MIDPOINT_ROOT_OPTION + "    : to midpoint-root the tree" );
+        System.out.println( " -" + decorator.ORDER_TREE_OPTION + "    : to order tree branches" );
+        System.out.println( " -" + decorator.VERBOSE_OPTION + "    : verbose" );
         System.out.println();
         System.exit( -1 );
     }