in progress
author cmzmasek <chris.zma@outlook.com>
Wed, 10 May 2017 18:37:08 +0000 (11:37 -0700)
committer cmzmasek <chris.zma@outlook.com>
Wed, 10 May 2017 18:37:08 +0000 (11:37 -0700)
forester/java/src/org/forester/application/phyloxml_converter.java
forester/java/src/org/forester/phylogeny/PhylogenyMethods.java
forester/java/src/org/forester/util/ForesterUtil.java

index 0dac775..b827750 100644 (file)
@@ -40,41 +40,39 @@ import org.forester.io.writers.PhylogenyWriter;
 import org.forester.phylogeny.Phylogeny;
 import org.forester.phylogeny.PhylogenyMethods;
 import org.forester.phylogeny.PhylogenyMethods.DESCENDANT_SORT_PRIORITY;
-import org.forester.phylogeny.PhylogenyNode;
 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
 import org.forester.phylogeny.factories.PhylogenyFactory;
-import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
 import org.forester.util.CommandLineArguments;
 import org.forester.util.ForesterUtil;
 
 public class phyloxml_converter {
 
-    final static private String  HELP_OPTION_1                     = "help";
-    final static private String  HELP_OPTION_2                     = "h";
-    final static private String  FIELD_OPTION                      = "f";
-    final static private String  FIELD_CLADE_NAME                  = "nn";
-    final static private String  FIELD_TAXONOMY_CODE               = "tc";
-    final static private String  FIELD_TAXONOMY_SCI_NAME           = "sn";
-    final static private String  FIELD_TAXONOMY_COMM_NAME          = "cn";
-    final static private String  FIELD_SEQUENCE_GENE_NAME          = "gn";
-    final static private String  FIELD_SEQUENCE_SYMBOL             = "sy";
-    final static private String  FIELD_UNIPROT_TAXONOMY_ID_SPLIT_1 = "i1";
-    final static private String  FIELD_UNIPROT_TAXONOMY_ID_SPLIT_2 = "i2";
-    final static private String  FIELD_DUMMY                       = "dummy";
-    final static private String  INTERNAL_NAMES_ARE_BOOT_SUPPPORT  = "i";
-    final static private String  MIDPOINT_REROOT                   = "m";
-    final static private String  EXTRACT_TAXONOMY                  = "xt";
-    final static private String  EXTRACT_TAXONOMY_PF               = "xp";
-    final static private String  ORDER_SUBTREES                    = "o";
-    final static private String  NO_TREE_LEVEL_INDENDATION         = "ni";
-    final static private String  REPLACE_UNDER_SCORES              = "ru";
-    final static private String  IGNORE_QUOTES                     = "iqs";
-    final static private String  PRG_NAME                          = "phyloxml_converter";
-    final static private String  PRG_VERSION                       = "1.302";
-    final static private String  PRG_DATE                          = "140516";
-    final static private String  E_MAIL                            = "phyloxml@gmail.com";
-    final static private String  WWW                               = "sites.google.com/site/cmzmasek/home/software/forester";
-    final static private boolean SPECIAL                           = false;
+    final static private String HELP_OPTION_1                     = "help";
+    final static private String HELP_OPTION_2                     = "h";
+    final static private String FIELD_OPTION                      = "f";
+    final static private String FIELD_CLADE_NAME                  = "nn";
+    final static private String FIELD_TAXONOMY_CODE               = "tc";
+    final static private String FIELD_TAXONOMY_SCI_NAME           = "sn";
+    final static private String FIELD_TAXONOMY_COMM_NAME          = "cn";
+    final static private String FIELD_SEQUENCE_GENE_NAME          = "gn";
+    final static private String FIELD_SEQUENCE_SYMBOL             = "sy";
+    final static private String FIELD_UNIPROT_TAXONOMY_ID_SPLIT_1 = "i1";
+    final static private String FIELD_UNIPROT_TAXONOMY_ID_SPLIT_2 = "i2";
+    final static private String FIELD_DUMMY                       = "dummy";
+    final static private String INTERNAL_NAMES_ARE_BOOT_SUPPPORT  = "i";
+    final static private String MIDPOINT_REROOT                   = "m";
+    final static private String EXTRACT_TAXONOMY                  = "xt";
+    final static private String EXTRACT_TAXONOMY_PF               = "xp";
+    final static private String ORDER_SUBTREES                    = "o";
+    final static private String NO_TREE_LEVEL_INDENDATION         = "ni";
+    final static private String REPLACE_UNDER_SCORES              = "ru";
+    final static private String IGNORE_QUOTES                     = "iqs";
+    final static private String CONFIDENCE_TYPE                   = "c";
+    final static private String PRG_NAME                          = "phyloxml_converter";
+    final static private String PRG_VERSION                       = "1.303";
+    final static private String PRG_DATE                          = "170510";
+    final static private String E_MAIL                            = "phyloxml@gmail.com";
+    final static private String WWW                               = "sites.google.com/site/cmzmasek/home/software/forester/phyloxml-converter";
 
     public static void main( final String args[] ) throws PhyloXmlDataFormatException {
         ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW );
@@ -106,6 +104,7 @@ public class phyloxml_converter {
         allowed_options.add( EXTRACT_TAXONOMY );
         allowed_options.add( EXTRACT_TAXONOMY_PF );
         allowed_options.add( IGNORE_QUOTES );
+        allowed_options.add( CONFIDENCE_TYPE );
         if ( cla.getNumberOfNames() != 2 ) {
             System.out.println();
             System.out.println( "[" + PRG_NAME + "] incorrect number of arguments" );
@@ -157,8 +156,9 @@ public class phyloxml_converter {
         else if ( field_option_value.equals( FIELD_DUMMY ) ) {
         }
         else {
-            ForesterUtil.fatalError( PRG_NAME, "unknown value for -\"" + FIELD_OPTION + "\" option: \""
-                    + field_option_value + "\"" );
+            ForesterUtil
+                    .fatalError( PRG_NAME,
+                                 "unknown value for -\"" + FIELD_OPTION + "\" option: \"" + field_option_value + "\"" );
         }
         boolean ignore_quotes = false;
         if ( cla.isOptionSet( IGNORE_QUOTES ) ) {
@@ -168,6 +168,13 @@ public class phyloxml_converter {
         if ( cla.isOptionSet( INTERNAL_NAMES_ARE_BOOT_SUPPPORT ) ) {
             int_values_are_boots = true;
         }
+        String conf_type = "unknown";
+        if ( cla.isOptionSet( CONFIDENCE_TYPE ) ) {
+            final String str = cla.getOptionValueAsCleanString( CONFIDENCE_TYPE );
+            if ( !ForesterUtil.isEmpty( str ) ) {
+                conf_type = str;
+            }
+        }
         boolean midpoint_reroot = false;
         if ( cla.isOptionSet( MIDPOINT_REROOT ) ) {
             midpoint_reroot = true;
@@ -210,12 +217,12 @@ public class phyloxml_converter {
                         && ( field != PhylogenyMethods.PhylogenyNodeField.TAXONOMY_SCIENTIFIC_NAME ) ) {
                     if ( extr_taxonomy_pf_only ) {
                         ( ( NHXParser ) parser )
-                        .setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT );
+                                .setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT );
                         replace_underscores = false;
                     }
                     else if ( extr_taxonomy ) {
                         ( ( NHXParser ) parser )
-                        .setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED );
+                                .setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED );
                         replace_underscores = false;
                     }
                 }
@@ -234,14 +241,9 @@ public class phyloxml_converter {
         catch ( final IOException e ) {
             ForesterUtil.fatalError( PRG_NAME, "failed to read phylogeny from [" + infile + "]: " + e.getMessage() );
         }
-        if ( SPECIAL ) {
-            for( final Phylogeny phy : phys ) {
-                performSpecialProcessing( phy );
-            }
-        }
         if ( int_values_are_boots ) {
             for( final Phylogeny phy : phys ) {
-                PhylogenyMethods.transferInternalNamesToBootstrapSupport( phy );
+                PhylogenyMethods.transferInternalNamesToConfidenceValues( phy, conf_type );
             }
         }
         if ( field != null ) {
@@ -283,87 +285,11 @@ public class phyloxml_converter {
         System.out.println();
     }
 
-    private static void performSpecialProcessing( final Phylogeny phy ) {
-        // Can place some kind of custom processing here.
-        //        final List<PhylogenyNode> remove_us = new ArrayList<PhylogenyNode>();
-        //        int counter = 0;
-        //        for( final PhylogenyNodeIterator it = phy.iteratorPostorder(); it.hasNext(); ) {
-        //            final PhylogenyNode node = it.next();
-        //            final String name = node.getNodeName().toLowerCase();
-        //            if ( name.startsWith( "environmental_samples" ) || name.startsWith( "unclassified" )
-        //                    || name.startsWith( "bacteria" ) || name.startsWith( "other" )
-        //                    || name.startsWith( "viroids" ) || name.startsWith( "viruses" ) ) {
-        //                remove_us.add( node );
-        //                System.out.println( counter++ );
-        //            }
-        //        }
-        //        phy.hashIDs();
-        //        for( final PhylogenyNode node : remove_us ) {
-        //            if ( phy.getNode( node.getNodeId() ) != null ) {
-        //                phy.deleteSubtree( node );
-        //                System.out.println( "deleted: " + node );
-        //            }
-        //        }
-        //        phy.hashIDs();
-        //
-        //        for( final PhylogenyNodeIterator it = phy.iteratorPostorder(); it.hasNext(); ) {
-        //            final PhylogenyNode node = it.next();
-        //            node.getNodeData().setTaxonomy( null );
-        //        }
-        //        phy.reRoot( phy.getFirstExternalNode() );
-        //        PhylogenyMethods.midpointRoot( phy );
-        //        phy.orderAppearance( true );
-        for( final PhylogenyNodeIterator it = phy.iteratorPostorder(); it.hasNext(); ) {
-            final PhylogenyNode node = it.next();
-            final String name = node.getName();
-            if ( !ForesterUtil.isEmpty( name ) ) {
-                //                final Taxonomy taxo = new Taxonomy();
-                //                if ( node.isExternal() ) {
-                //                    taxo.setTaxonomyCode( name );
-                //                    node.getNodeData().setTaxonomy( taxo );
-                //                }
-                //                else if ( name.indexOf( '_' ) == -1 || name.length() > 6 ) {
-                //                    taxo.setScientificName( name );
-                //                    node.getNodeData().setTaxonomy( taxo );
-                //                }
-                //                node.setName( "" );
-                //                if ( name.indexOf( "BF" ) >= 0 ) {
-                //                    taxo.setTaxonomyCode( "BACFR" );
-                //                }
-                //                else if ( name.indexOf( "BT" ) >= 0 ) {
-                //                    taxo.setTaxonomyCode( "BACTN" );
-                //                }
-                //                else if ( name.indexOf( "MXAN" ) >= 0 ) {
-                //                    taxo.setTaxonomyCode( "MYXXD" );
-                //                }
-                //                else if ( name.indexOf( "STIAU" ) >= 0 ) {
-                //                    taxo.setTaxonomyCode( "STIAU" );
-                //                }
-                //                else if ( name.indexOf( "BOVA" ) >= 0 ) {
-                //                    taxo.setTaxonomyCode( "BACOV" );
-                //                }
-                //                else if ( name.indexOf( "BUNI" ) >= 0 ) {
-                //                    taxo.setTaxonomyCode( "BACUN" );
-                //                }
-                //                else if ( name.indexOf( "Pgin" ) >= 0 ) {
-                //                    taxo.setTaxonomyCode( "PORGI" );
-                //                }
-                //                else if ( name.equals( "3CGH" ) || name.equals( "3CK7" ) ) {
-                //                    taxo.setTaxonomyCode( "BACTN" );
-                //                }
-                // node.getNodeData().setTaxonomy( taxo );
-            }
-        }
-    }
-
     private static void printHelp() {
         System.out.println( "Usage:" );
         System.out.println();
-        System.out
-        .println( PRG_NAME
-                  + " -"
-                  + FIELD_OPTION
-                  + "=<field option> [options] <infile in New Hamphshire, NHX, Nexus, ToL XML, or phyloXML format> <outfile>" );
+        System.out.println( PRG_NAME + " -" + FIELD_OPTION
+                + "=<field option> [options] <infile in New Hamphshire, NHX, Nexus, ToL XML, or phyloXML format> <outfile>" );
         System.out.println();
         System.out.println( " field options: " );
         System.out.println();
@@ -375,30 +301,28 @@ public class phyloxml_converter {
         System.out.println( "   " + FIELD_SEQUENCE_SYMBOL + ":    transfer name to sequence symbol" );
         System.out.println( "   " + FIELD_DUMMY + ": to convert NHX formatted trees to phyloXML" );
         System.out.println( "   " + FIELD_UNIPROT_TAXONOMY_ID_SPLIT_1
-                            + ":    transfer/split name to taxonomy uniprot identifier" );
+                + ":    transfer/split name to taxonomy uniprot identifier" );
         System.out.println( "          (split at underscore if \"id_name\" pattern, e.g. \"817_SusD\")" );
         System.out.println( "   " + FIELD_UNIPROT_TAXONOMY_ID_SPLIT_2
-                            + ":    transfer/split name to taxonomy uniprot identifier" );
+                + ":    transfer/split name to taxonomy uniprot identifier" );
         System.out.println( "          (split at underscore if \"name_id\" pattern, e.g. \"SusD_817\")" );
         System.out.println();
         System.out.println( " options: " );
         System.out.println( " -" + INTERNAL_NAMES_ARE_BOOT_SUPPPORT
-                            + "  : internal names in NH or NHX tree are bootstrap support values" );
-        System.out.println( " -" + REPLACE_UNDER_SCORES + " : replace all underscores with spaces" );
-        System.out.println( " -" + MIDPOINT_REROOT + "  : midpoint reroot" );
-        System.out.println( " -" + ORDER_SUBTREES + "  : order subtrees" );
-        System.out
-        .println( " -"
-                + EXTRACT_TAXONOMY
-                + " : extract taxonomy to taxonomy code from \"seqname_TAXON\"-style names (cannot be used with the following field options: "
+                + "       : internal names in NH or NHX tree are confidence values" );
+        System.out.println( " -" + CONFIDENCE_TYPE + "=<conf>"
+                + ": confidence type (e.g. \"bootstrap\", default is \"unknown\")" );
+        System.out.println( " -" + REPLACE_UNDER_SCORES + "      : replace all underscores with spaces" );
+        System.out.println( " -" + MIDPOINT_REROOT + "       : midpoint reroot" );
+        System.out.println( " -" + ORDER_SUBTREES + "       : order subtrees" );
+        System.out.println( " -" + EXTRACT_TAXONOMY
+                + "      : extract taxonomy to taxonomy code from \"seqname_TAXON\"-style names (cannot be used with the following field options: "
                 + FIELD_TAXONOMY_CODE + ", " + FIELD_TAXONOMY_COMM_NAME + ", " + FIELD_TAXONOMY_SCI_NAME + ")" );
-        System.out
-        .println( " -"
-                + EXTRACT_TAXONOMY_PF
-                + " : extract taxonomy to taxonomy code from Pfam (\"seqname_TAXON/x-y\") style names only (cannot be used with the following field options: "
+        System.out.println( " -" + EXTRACT_TAXONOMY_PF
+                + "      : extract taxonomy to taxonomy code from Pfam (\"seqname_TAXON/x-y\") style names only (cannot be used with the following field options: "
                 + FIELD_TAXONOMY_CODE + ", " + FIELD_TAXONOMY_COMM_NAME + ", " + FIELD_TAXONOMY_SCI_NAME + ")" );
-        System.out.println( " -" + NO_TREE_LEVEL_INDENDATION + " : no tree level indendation in phyloXML output" );
-        System.out.println( " -" + IGNORE_QUOTES + ": ignore quotes and whitespace (e.g. \"a b\" becomes ab)" );
+        System.out.println( " -" + NO_TREE_LEVEL_INDENDATION + "      : no tree level indendation in phyloXML output" );
+        System.out.println( " -" + IGNORE_QUOTES + "     : ignore quotes and whitespace (e.g. \"a b\" becomes ab)" );
         System.out.println();
     }
 }
index ee17de9..150177b 100644 (file)
@@ -1543,7 +1543,7 @@ public class PhylogenyMethods {
         return nodes_to_delete;
     }
 
-    final static public void transferInternalNamesToBootstrapSupport( final Phylogeny phy ) {
+    final static public void transferInternalNamesToConfidenceValues( final Phylogeny phy, final String confidence_type ) {
         final PhylogenyNodeIterator it = phy.iteratorPostorder();
         while ( it.hasNext() ) {
             final PhylogenyNode n = it.next();
@@ -1557,7 +1557,7 @@ public class PhylogenyMethods {
                             + e.getLocalizedMessage() );
                 }
                 if ( value >= 0.0 ) {
-                    n.getBranchData().addConfidence( new Confidence( value, "bootstrap" ) );
+                    n.getBranchData().addConfidence( new Confidence( value, confidence_type ) );
                     n.setName( "" );
                 }
             }
index 8f0744e..2374654 100644 (file)
@@ -1507,9 +1507,6 @@ public final class ForesterUtil {
         return trees;
     }
 
-    private ForesterUtil() {
-    }
-
     public final static File getMatchingFile( final File dir, final String prefix, final String suffix )
             throws IOException {
         if ( !dir.exists() ) {
@@ -1518,23 +1515,26 @@ public final class ForesterUtil {
         if ( !dir.isDirectory() ) {
             throw new IOException( "[" + dir + "] is not a directory" );
         }
-        final File mapping_files[] = dir.listFiles( new FilenameFilter() {
+        if ( dir.listFiles().length == 0 ) {
+            throw new IOException( "[" + dir + "] is empty" );
+        }
+        final File files[] = dir.listFiles( new FilenameFilter() {
 
             @Override
             public boolean accept( final File dir, final String name ) {
                 return ( name.endsWith( suffix ) );
             }
         } );
-        if ( mapping_files.length == 1 ) {
+        if ( files.length == 0 ) {
             throw new IOException( "no files ending with \"" + suffix + "\" found in [" + dir + "]" );
         }
-        String my_prefix = removeFileExtension( prefix );
+        String my_prefix = prefix;
         boolean done = false;
         boolean more_than_one = false;
         File the_one = null;
         do {
             int matches = 0;
-            for( File file : mapping_files ) {
+            for( File file : files ) {
                 if ( file.getName().startsWith( my_prefix ) ) {
                     matches++;
                     if ( matches > 1 ) {
@@ -1571,4 +1571,7 @@ public final class ForesterUtil {
         }
         return the_one;
     }
+
+    private ForesterUtil() {
+    }
 }