in progress...
author cmzmasek <chris.zma@outlook.com>
Wed, 30 Aug 2017 23:39:29 +0000 (16:39 -0700)
committer cmzmasek <chris.zma@outlook.com>
Wed, 30 Aug 2017 23:39:29 +0000 (16:39 -0700)
forester/java/src/org/forester/application/cladinator.java
forester/java/src/org/forester/application/serin.java [new file with mode: 0644]
forester/java/src/org/forester/application/tap.java [deleted file]
forester/java/src/org/forester/clade_analysis/Prefix.java
forester/java/src/org/forester/sequence/BasicSequence.java
forester/java/src/org/forester/sequence/MolecularSequence.java
forester/java/src/org/forester/util/ForesterConstants.java
forester/java/src/org/forester/util/ForesterUtil.java

index 95322e1..69affd2 100644 (file)
@@ -30,12 +30,11 @@ import java.io.IOException;
 import java.text.DecimalFormat;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.regex.Pattern;
 
 import org.forester.clade_analysis.AnalysisMulti;
-import org.forester.clade_analysis.AnalysisSingle;
 import org.forester.clade_analysis.Prefix;
 import org.forester.clade_analysis.ResultMulti;
-import org.forester.clade_analysis.ResultSingle;
 import org.forester.io.parsers.PhylogenyParser;
 import org.forester.io.parsers.util.ParserUtils;
 import org.forester.phylogeny.Phylogeny;
@@ -78,14 +77,14 @@ public final class cladinator {
                 print_help();
                 System.exit( 0 );
             }
-            else if ( ( args.length != 2 && args.length != 3 ) ) {
+            else if ( ( ( args.length != 2 ) && ( args.length != 3 ) ) ) {
                 System.out.println();
                 System.out.println( "Wrong number of arguments." );
                 System.out.println();
                 print_help();
                 System.exit( -1 );
             }
-            final List<String> allowed_options = new ArrayList<String>();
+            final List<String> allowed_options = new ArrayList<>();
             allowed_options.add( SEP_OPTION );
             final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
             if ( dissallowed_options.length() > 0 ) {
@@ -118,104 +117,44 @@ public final class cladinator {
                 System.out.println( "\nCould not read \"" + intreefile + "\" [" + e.getMessage() + "]\n" );
                 System.exit( -1 );
             }
-          
-            final ResultMulti res = AnalysisMulti.execute( p, query, separator, 0.5 );
-            
+            final Pattern pattern = Pattern.compile( query );
+            final ResultMulti res = AnalysisMulti.execute( p, pattern, separator, 0.5 );
             System.out.println();
             System.out.println( "Result:" );
             System.out.println( "Query                        : " + query );
-            
             ///////////////////
-            
-         
-         
             System.out.println( "Collapsed:" );
-          
-              for( final Prefix prefix : res.getCollapsedMultiHitPrefixes() ) {
-                  System.out.println( prefix );
-              }
-              if ( _has_specifics ) {
-                 
-                  System.out.println( "Specifics:" );
-                 
-                  for( final Prefix prefix : _cleaned_spec ) {
-                      System.out.println( prefix );
-                     
-                  }
-                  
-                  System.out.println( "Collapsed With Specifics:" );
-                 
-                  for( final Prefix prefix : _collapsed ) {
-                      System.out.println( prefix );
-                      
-                      for( final Prefix spec : _cleaned_spec ) {
-                          if ( spec.getPrefix().startsWith( prefix.getPrefix() ) ) {
-                              System.out.println( "    " + spec );
-                             
-                          }
-                      }
-                  }
-              }
-              if ( !ForesterUtil.isEmpty( _all_down ) ) {
-                  
-                  System.out.println( "Collapsed Down:" );
-                  
-                  for( final Prefix prefix : _collapsed_down ) {
-                      System.out.println( prefix );
-                      
-                  }
-              
-              }
-              if ( !ForesterUtil.isEmpty( _all_up ) ) {
-                  
-           
-                  System.out.println( "Collapsed Up:" );
-                 
-                  for( final Prefix prefix : _collapsed_up ) {
-                      System.out.println( prefix );
-                     
-                  }
-             
-              }
-            
-            ///////////////////
-            
-            
-            System.out.print( "Greatest Common Prefix       : " + res.getGreatestCommonPrefix() );
-            if ( !ForesterUtil.isEmpty( res.getGreatestCommonPrefix() )
-                    && !ForesterUtil.isEmpty( res.getGreatestCommonCladeSubtreeConfidence() ) ) {
-                System.out.println( "\t(" + res.getGreatestCommonCladeSubtreeConfidence() + ")" );
-            }
-            else {
-                System.out.println();
-            }
-            System.out.print( "Greatest Common Prefix Up    : " + res.getGreatestCommonPrefixUp() );
-            if ( !ForesterUtil.isEmpty( res.getGreatestCommonPrefixUp() )
-                    && !ForesterUtil.isEmpty( res.getGreatestCommonCladeUpSubtreeConfidence() ) ) {
-                System.out.println( "\t(" + res.getGreatestCommonCladeUpSubtreeConfidence() + ")" );
-            }
-            else {
-                System.out.println();
+            for( final Prefix prefix : res.getCollapsedMultiHitPrefixes() ) {
+                System.out.println( prefix );
             }
-            System.out.print( "Greatest Common Prefix Down  : " + res.getGreatestCommonPrefixDown() );
-            if ( !ForesterUtil.isEmpty( res.getGreatestCommonPrefixDown() )
-                    && !ForesterUtil.isEmpty( res.getGreatestCommonCladeDownSubtreeConfidence() ) ) {
-                System.out.println( "\t(" + res.getGreatestCommonCladeDownSubtreeConfidence() + ")" );
+            if ( res.isHasSpecificMultiHitsPrefixes() ) {
+                System.out.println( "Specifics:" );
+                for( final Prefix prefix : res.getSpecificMultiHitPrefixes() ) {
+                    System.out.println( prefix );
+                }
+                System.out.println( "Collapsed With Specifics:" );
+                for( final Prefix prefix : res.getCollapsedMultiHitPrefixes() ) {
+                    System.out.println( prefix );
+                    for( final Prefix spec : res.getSpecificMultiHitPrefixes() ) {
+                        if ( spec.getPrefix().startsWith( prefix.getPrefix() ) ) {
+                            System.out.println( "    " + spec );
+                        }
+                    }
+                }
             }
-            else {
-                System.out.println();
+            if ( !ForesterUtil.isEmpty( res.getAllMultiHitPrefixesDown() ) ) {
+                System.out.println( "Collapsed Down:" );
+                for( final Prefix prefix : res.getCollapsedMultiHitPrefixesDown() ) {
+                    System.out.println( prefix );
+                }
             }
-            System.out.println( "Least Encompassing Clade size: " + res.getLeastEncompassingCladeSize()
-                    + " external nodes" );
-            final double lec_ratio = ( 100.0 * res.getLeastEncompassingCladeSize() ) / res.getTreeSize();
-            System.out.println( "Least Encompassing Clade size: " + df2.format( lec_ratio ) + "%" );
-            System.out.println( "Total tree size              : " + res.getTreeSize() + " external nodes" );
-            if ( res.getWarnings().size() > 0 ) {
-                System.out.println( "Warnings:" );
-                for( final String s : res.getWarnings() ) {
-                    System.out.println( s );
+            if ( !ForesterUtil.isEmpty( res.getAllMultiHitPrefixesUp() ) ) {
+                System.out.println( "Collapsed Up:" );
+                for( final Prefix prefix : res.getAllMultiHitPrefixesUp() ) {
+                    System.out.println( prefix );
                 }
             }
+            ///////////////////
             System.out.println();
         }
         catch ( final IllegalArgumentException e ) {
diff --git a/forester/java/src/org/forester/application/serin.java b/forester/java/src/org/forester/application/serin.java
new file mode 100644 (file)
index 0000000..a9ee157
--- /dev/null
@@ -0,0 +1,343 @@
+
+package org.forester.application;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.forester.io.parsers.FastaParser;
+import org.forester.io.parsers.GeneralMsaParser;
+import org.forester.io.writers.SequenceWriter;
+import org.forester.io.writers.SequenceWriter.SEQ_FORMAT;
+import org.forester.msa.BasicMsa;
+import org.forester.msa.Msa;
+import org.forester.msa.Msa.MSA_FORMAT;
+import org.forester.sequence.BasicSequence;
+import org.forester.sequence.MolecularSequence;
+import org.forester.util.BasicDescriptiveStatistics;
+import org.forester.util.CommandLineArguments;
+import org.forester.util.ForesterConstants;
+import org.forester.util.ForesterUtil;
+
+public class serin {
+
+    final static private String PRG_NAME               = "serin";
+    final static private String PRG_DATE               = "170830";
+    final static private String PRG_DESC               = "sequence file reformatting and identifier normalization";
+    final static private String PRG_VERSION            = "1.00";
+    final static private String WWW                    = "https://sites.google.com/site/cmzmasek/home/software/forester";
+    final static private String E_MAIL                 = "phyloxml@gmail.com";
+    final static private String OUTPUT_FORMAT_OPTION   = "o";
+    final static private String ID_NORM_OPTION         = "i";
+    final static private String HELP_OPTION_1          = "help";
+    final static private String HELP_OPTION_2          = "h";
+    private static final String OUTPUT_FORMAT_FASTA    = "f";
+    private static final String OUTPUT_FORMAT_PHYLIP   = "p";
+    private static final String OUTPUT_FORMAT_NEXUS    = "n";
+    private static final String OUTPUT_FORMAT_FASTA_L  = "fasta";
+    private static final String OUTPUT_FORMAT_PHYLIP_L = "phylip";
+    private static final String OUTPUT_FORMAT_NEXUS_L  = "nexus";
+
+    /**
+     * Command line entry point: reads a sequence file, optionally replaces the sequence
+     * identifiers with short hexadecimal ones (recording the mapping in a separate map file),
+     * and writes the sequences in Fasta, Phylip or Nexus format.
+     * <p>
+     * One to three names are accepted: the input file, an optional output file and an optional
+     * map file. With a single name, the output and map file names are derived from the input
+     * file name. Giving three names implies identifier normalization.
+     *
+     * @param args command line options and file names
+     */
+    public static void main( final String args[] ) {
+        try {
+            ForesterUtil.printProgramInformation( PRG_NAME,
+                                                  PRG_DESC,
+                                                  PRG_VERSION,
+                                                  PRG_DATE,
+                                                  E_MAIL,
+                                                  WWW,
+                                                  ForesterUtil.getForesterLibraryInformation() );
+            CommandLineArguments cla = null;
+            try {
+                cla = new CommandLineArguments( args );
+            }
+            catch ( final Exception e ) {
+                ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
+            }
+            if ( ( cla.getNumberOfNames() == 0 ) || cla.isOptionSet( HELP_OPTION_1 )
+                    || cla.isOptionSet( HELP_OPTION_2 ) ) {
+                System.out.println();
+                print_help();
+                System.exit( 0 );
+            }
+            String input_seqs_file_str = null;
+            String output_seqs_file_str = null;
+            String output_map_file_str = null;
+            String input_seqs_name_wo_suffix = null;
+            if ( ( cla.getNumberOfNames() == 2 ) || ( cla.getNumberOfNames() == 3 ) ) {
+                input_seqs_file_str = cla.getName( 0 );
+                output_seqs_file_str = cla.getName( 1 );
+                if ( cla.getNumberOfNames() == 3 ) {
+                    output_map_file_str = cla.getName( 2 );
+                }
+            }
+            else if ( cla.getNumberOfNames() == 1 ) {
+                // Only the input was named: derive the other file names from its stem.
+                input_seqs_file_str = cla.getName( 0 );
+                input_seqs_name_wo_suffix = remove_known_suffix( input_seqs_file_str );
+                output_map_file_str = input_seqs_name_wo_suffix + ForesterConstants.ID_MAP_FILE_SUFFIX;
+            }
+            else {
+                print_help();
+                System.exit( -1 );
+            }
+            final List<String> allowed_options = new ArrayList<>();
+            allowed_options.add( OUTPUT_FORMAT_OPTION );
+            allowed_options.add( ID_NORM_OPTION );
+            final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
+            if ( dissallowed_options.length() > 0 ) {
+                ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options );
+            }
+            final File input_seqs_file = new File( input_seqs_file_str );
+            final String error0 = ForesterUtil.isReadableFile( input_seqs_file );
+            if ( !ForesterUtil.isEmpty( error0 ) ) {
+                ForesterUtil.fatalError( PRG_NAME, error0 );
+            }
+            final boolean input_seqs_fasta_like = ForesterUtil.isLooksLikeFasta( input_seqs_file );
+            // Fasta is the default output format unless the format option names another one.
+            Msa.MSA_FORMAT output_format = MSA_FORMAT.FASTA;
+            if ( cla.isOptionSet( OUTPUT_FORMAT_OPTION ) ) {
+                if ( cla.isOptionValueSet( OUTPUT_FORMAT_OPTION ) ) {
+                    final String output_format_str = cla.getOptionValue( OUTPUT_FORMAT_OPTION );
+                    if ( output_format_str.equals( OUTPUT_FORMAT_FASTA )
+                            || output_format_str.equalsIgnoreCase( OUTPUT_FORMAT_FASTA_L ) ) {
+                        output_format = MSA_FORMAT.FASTA;
+                    }
+                    else if ( output_format_str.equals( OUTPUT_FORMAT_PHYLIP )
+                            || output_format_str.equalsIgnoreCase( OUTPUT_FORMAT_PHYLIP_L ) ) {
+                        output_format = MSA_FORMAT.PHYLIP;
+                    }
+                    else if ( output_format_str.equals( OUTPUT_FORMAT_NEXUS )
+                            || output_format_str.equalsIgnoreCase( OUTPUT_FORMAT_NEXUS_L ) ) {
+                        output_format = MSA_FORMAT.NEXUS;
+                    }
+                    else {
+                        ForesterUtil.fatalError( PRG_NAME, "unknown format option: " + output_format_str );
+                    }
+                }
+            }
+            // Naming a map file (third name) implies identifier normalization.
+            final boolean normalize_identifiers;
+            if ( cla.isOptionSet( ID_NORM_OPTION ) || ( cla.getNumberOfNames() == 3 ) ) {
+                normalize_identifiers = true;
+            }
+            else {
+                normalize_identifiers = false;
+            }
+            if ( normalize_identifiers && ForesterUtil.isEmpty( output_map_file_str ) ) {
+                ForesterUtil.fatalError( PRG_NAME, "need to indicate name for output map file" );
+            }
+            final File output_map_file;
+            if ( normalize_identifiers ) {
+                output_map_file = new File( output_map_file_str );
+                final String error = ForesterUtil.isWritableFile( output_map_file );
+                if ( !ForesterUtil.isEmpty( error ) ) {
+                    ForesterUtil.fatalError( PRG_NAME, error );
+                }
+            }
+            else {
+                output_map_file = null;
+            }
+            if ( cla.getNumberOfNames() == 1 ) {
+                if ( normalize_identifiers ) {
+                    if ( output_format == MSA_FORMAT.FASTA ) {
+                        output_seqs_file_str = input_seqs_name_wo_suffix
+                                + ForesterConstants.ID_NORMALIZED_FASTA_FILE_SUFFIX;
+                    }
+                    else if ( output_format == MSA_FORMAT.NEXUS ) {
+                        output_seqs_file_str = input_seqs_name_wo_suffix
+                                + ForesterConstants.ID_NORMALIZED_NEXUS_FILE_SUFFIX;
+                    }
+                    else if ( output_format == MSA_FORMAT.PHYLIP ) {
+                        output_seqs_file_str = input_seqs_name_wo_suffix
+                                + ForesterConstants.ID_NORMALIZED_PHYLIP_FILE_SUFFIX;
+                    }
+                }
+                else {
+                    // If the derived name is reported as not writable, fall back to a name
+                    // with an extra "_" before the suffix.
+                    if ( output_format == MSA_FORMAT.FASTA ) {
+                        output_seqs_file_str = input_seqs_name_wo_suffix + ForesterConstants.FASTA_FILE_SUFFIX;
+                        if ( ForesterUtil.isWritableFile( output_seqs_file_str ) != null ) {
+                            output_seqs_file_str = input_seqs_name_wo_suffix + "_"
+                                    + ForesterConstants.FASTA_FILE_SUFFIX;
+                        }
+                    }
+                    else if ( output_format == MSA_FORMAT.NEXUS ) {
+                        output_seqs_file_str = input_seqs_name_wo_suffix + ForesterConstants.NEXUS_FILE_SUFFIX;
+                        if ( ForesterUtil.isWritableFile( output_seqs_file_str ) != null ) {
+                            output_seqs_file_str = input_seqs_name_wo_suffix + "_"
+                                    + ForesterConstants.NEXUS_FILE_SUFFIX;
+                        }
+                    }
+                    else if ( output_format == MSA_FORMAT.PHYLIP ) {
+                        output_seqs_file_str = input_seqs_name_wo_suffix + ForesterConstants.PHYLIP_FILE_SUFFIX;
+                        if ( ForesterUtil.isWritableFile( output_seqs_file_str ) != null ) {
+                            output_seqs_file_str = input_seqs_name_wo_suffix + "_"
+                                    + ForesterConstants.PHYLIP_FILE_SUFFIX;
+                        }
+                    }
+                }
+            }
+            final File outfile_seqs_file = new File( output_seqs_file_str );
+            final String error1 = ForesterUtil.isWritableFile( outfile_seqs_file );
+            if ( !ForesterUtil.isEmpty( error1 ) ) {
+                ForesterUtil.fatalError( PRG_NAME, error1 );
+            }
+            System.out.println();
+            if ( input_seqs_fasta_like ) {
+                System.out.println( "Input format          : Fasta" );
+            }
+            else {
+                System.out.println( "Input format          : Phylip like" );
+            }
+            System.out.println( "Input file            : " + input_seqs_file_str );
+            if ( output_format == MSA_FORMAT.FASTA ) {
+                System.out.println( "Output format         : Fasta" );
+            }
+            else if ( output_format == MSA_FORMAT.NEXUS ) {
+                System.out.println( "Output format         : Nexus" );
+            }
+            else if ( output_format == MSA_FORMAT.PHYLIP ) {
+                System.out.println( "Output format         : Phylip" );
+            }
+            System.out.println( "Output file           : " + output_seqs_file_str );
+            System.out.println( "Shorten names         : " + normalize_identifiers );
+            if ( normalize_identifiers ) {
+                System.out.println( "Identifier map        : " + output_map_file_str );
+            }
+            final List<MolecularSequence> input_seqs;
+            // try-with-resources: the input stream is closed whether or not parsing succeeds.
+            try ( final FileInputStream is = new FileInputStream( input_seqs_file ) ) {
+                if ( FastaParser.isLikelyFasta( input_seqs_file ) ) {
+                    input_seqs = FastaParser.parse( is );
+                }
+                else {
+                    input_seqs = GeneralMsaParser.parseSeqs( is );
+                }
+            }
+            if ( input_seqs == null ) {
+                ForesterUtil.fatalError( PRG_NAME, "failed to read input sequences" );
+            }
+            if ( input_seqs.size() < 1 ) {
+                ForesterUtil.fatalError( PRG_NAME, "input seems to be devoid of sequences" );
+            }
+            final BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics();
+            for( final MolecularSequence seq : input_seqs ) {
+                stats.addValue( seq.getLength() );
+            }
+            System.out.println( "Number of sequences   : " + input_seqs.size() );
+            // Differing sequence lengths mean the input is not an alignment.
+            if ( !ForesterUtil.isEqual( stats.getMin(), stats.getMax() ) ) {
+                System.out.println( "Sequence length min   : " + ( int ) stats.getMin() );
+                System.out.println( "Sequence length max   : " + ( int ) stats.getMax() );
+                if ( input_seqs.size() > 2 ) {
+                    System.out.println( "Sequence length median: " + ( int ) stats.median() );
+                }
+                if ( ( output_format == MSA_FORMAT.NEXUS ) || ( output_format == MSA_FORMAT.PHYLIP ) ) {
+                    ForesterUtil.fatalError( PRG_NAME,
+                                             "Input is not an alignment, cannot write in Nexus or Phylip format" );
+                }
+            }
+            else {
+                System.out.println( "Alignment length      : " + ( int ) stats.getMax() );
+            }
+            final List<MolecularSequence> output_seqs = new ArrayList<>();
+            int counter = 0;
+            final BufferedWriter output_map_writer;
+            if ( normalize_identifiers ) {
+                output_map_writer = ForesterUtil.createBufferedWriter( output_map_file_str );
+            }
+            else {
+                output_map_writer = null;
+            }
+            for( final MolecularSequence seq : input_seqs ) {
+                final String new_name;
+                if ( normalize_identifiers ) {
+                    new_name = modify_name( seq.getIdentifier(), counter++, output_map_writer );
+                }
+                else {
+                    new_name = seq.getIdentifier();
+                }
+                final MolecularSequence ns = BasicSequence.createGeneralSequence( new_name,
+                                                                                  seq.getMolecularSequenceAsString() );
+                output_seqs.add( ns );
+            }
+            if ( normalize_identifiers ) {
+                output_map_writer.flush();
+                output_map_writer.close();
+                System.out.println();
+                System.out.println( "Wrote                 : " + output_map_file );
+            }
+            final BufferedWriter seq_writer = ForesterUtil.createBufferedWriter( outfile_seqs_file );
+            if ( ( output_format == MSA_FORMAT.NEXUS ) || ( output_format == MSA_FORMAT.PHYLIP ) ) {
+                final Msa m = BasicMsa.createInstance( output_seqs );
+                m.write( seq_writer, output_format );
+            }
+            else if ( output_format == MSA_FORMAT.FASTA ) {
+                SequenceWriter.writeSeqs( output_seqs, seq_writer, SEQ_FORMAT.FASTA, 60 );
+            }
+            seq_writer.flush();
+            seq_writer.close();
+            System.out.println( "Wrote                 : " + outfile_seqs_file );
+            System.out.println();
+        }
+        catch ( final IllegalArgumentException e ) {
+            ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
+        }
+        catch ( final Exception e ) {
+            e.printStackTrace();
+            ForesterUtil.fatalError( PRG_NAME, "Unexpected error!" );
+        }
+    }
+
+    /**
+     * Returns the given file name without a trailing, case-insensitively matched sequence file
+     * suffix (.fasta, .fsa, .phy, .aln, .phylip, .nex or .nexus), or the name unchanged if it
+     * ends with none of them.
+     *
+     * @param file_name the file name to strip
+     * @return the file name without a recognized suffix
+     */
+    private static String remove_known_suffix( final String file_name ) {
+        final String[] known_suffixes = { ".fasta", ".fsa", ".phy", ".aln", ".phylip", ".nex", ".nexus" };
+        for( final String suffix : known_suffixes ) {
+            // The cut position comes from the suffix itself, so no hand-counted lengths are needed.
+            final int stem_length = file_name.length() - suffix.length();
+            // regionMatches ignores case without depending on the default locale and
+            // returns false for a negative offset (name shorter than the suffix).
+            if ( file_name.regionMatches( true, stem_length, suffix, 0, suffix.length() ) ) {
+                return file_name.substring( 0, stem_length );
+            }
+        }
+        return file_name;
+    }
+
+    /**
+     * Creates a short identifier (the hexadecimal form of {@code counter}) for a sequence and
+     * writes one line, "short identifier TAB original description", to the map writer.
+     *
+     * @param desc the original sequence identifier/description
+     * @param counter the running sequence number the short identifier is derived from
+     * @param writer the writer receiving the identifier map line
+     * @return the new, short identifier
+     * @throws IOException if writing to {@code writer} fails
+     */
+    final static String modify_name( final String desc, final int counter, final Writer writer ) throws IOException {
+        // Strings are immutable, so the result of replaceAll must be kept. Collapsing every
+        // whitespace run (including tabs and line breaks) into one space keeps each map entry
+        // on a single line with exactly one tab separator.
+        final String normalized_desc = desc.replaceAll( "\\s+", " " );
+        final String new_desc = Integer.toHexString( counter );
+        // Defensive limit on the identifier length; an int yields at most 8 hex digits.
+        if ( new_desc.length() > 9 ) {
+            ForesterUtil.fatalError( PRG_NAME,
+                                     "shortened identifier [" + new_desc + "] is too long (" + new_desc.length()
+                                             + " characters)" );
+        }
+        writer.write( new_desc + "\t" + normalized_desc + "\n" );
+        return new_desc;
+    }
+
+    /**
+     * Prints the usage line, the supported options and an example invocation to standard output.
+     */
+    private final static void print_help() {
+        final String format_option_help = "  -" + OUTPUT_FORMAT_OPTION + "=<format>: output format: "
+                + OUTPUT_FORMAT_FASTA_L + " or " + OUTPUT_FORMAT_FASTA + " for Fasta (default), "
+                + OUTPUT_FORMAT_PHYLIP_L + " or " + OUTPUT_FORMAT_PHYLIP + " for Phylip, " + OUTPUT_FORMAT_NEXUS_L
+                + " or " + OUTPUT_FORMAT_NEXUS + " for Nexus";
+        final String id_norm_option_help = "  -" + ID_NORM_OPTION
+                + ": to replace sequence names with short(er) identifiers";
+        // Empty entries are printed as blank separator lines.
+        final String[] help_lines = {
+                "Usage:",
+                "",
+                PRG_NAME + " [options] <input sequences file> [output sequences file] [output map file]",
+                "",
+                " options:",
+                format_option_help,
+                id_norm_option_help,
+                "",
+                "Example:",
+                "",
+                " " + PRG_NAME + " -i -o=p my_seqs.fasta",
+                "" };
+        for( final String help_line : help_lines ) {
+            System.out.println( help_line );
+        }
+    }
+}
diff --git a/forester/java/src/org/forester/application/tap.java b/forester/java/src/org/forester/application/tap.java
deleted file mode 100644 (file)
index 2fcf2cc..0000000
+++ /dev/null
@@ -1,211 +0,0 @@
-
-package org.forester.application;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.Writer;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.forester.io.parsers.FastaParser;
-import org.forester.io.parsers.GeneralMsaParser;
-import org.forester.io.writers.SequenceWriter;
-import org.forester.io.writers.SequenceWriter.SEQ_FORMAT;
-import org.forester.msa.BasicMsa;
-import org.forester.msa.Msa;
-import org.forester.msa.Msa.MSA_FORMAT;
-import org.forester.sequence.BasicSequence;
-import org.forester.sequence.MolecularSequence;
-import org.forester.util.CommandLineArguments;
-import org.forester.util.ForesterConstants;
-import org.forester.util.ForesterUtil;
-
-public class tap {
-
-    final static private String PRG_NAME                = "tap";
-    final static private String PRG_DATE                = "170327";
-    final static private String PRG_DESC                = "Replacement of labels in multiple sequence files";
-    final static private String PRG_VERSION             = "1.00";
-    final static private String WWW                     = "https://sites.google.com/site/cmzmasek/home/software/forester";
-    final static private String E_MAIL                  = "phyloxml@gmail.com";
-    final static private String EXTRACT_TAXONOMY_OPTION = "t";
-    final static private String ANNOTATION_OPTION       = "a";
-    final static private String HELP_OPTION_1           = "help";
-    final static private String HELP_OPTION_2           = "h";
-
-    public static void main( final String args[] ) {
-        try {
-            ForesterUtil.printProgramInformation( PRG_NAME,
-                                                  PRG_DESC,
-                                                  PRG_VERSION,
-                                                  PRG_DATE,
-                                                  E_MAIL,
-                                                  WWW,
-                                                  ForesterUtil.getForesterLibraryInformation() );
-            CommandLineArguments cla = null;
-            try {
-                cla = new CommandLineArguments( args );
-            }
-            catch ( final Exception e ) {
-                ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
-            }
-            if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) ) {
-                System.out.println();
-                print_help();
-                System.exit( 0 );
-            }
-            String input = null;
-            String output = null;
-            String list_file = null;
-            String i = null;
-            if ( args.length == 3 ) {
-                input = cla.getName( 0 );
-                output = cla.getName( 1 );
-                list_file = cla.getName( 2 );
-            }
-            else if ( args.length == 1 ) {
-                input = cla.getName( 0 );
-                i = null;
-                if ( input.toLowerCase().endsWith( ".fasta" ) ) {
-                    i = input.substring( 0, input.length() - 7 );
-                }
-                else if ( input.toLowerCase().endsWith( ".fsa" ) ) {
-                    i = input.substring( 0, input.length() - 5 );
-                }
-                else {
-                    i = input;
-                }
-                output = i + ForesterConstants.ID_NORMALIZED_FASTA_FILE_SUFFIX;
-                list_file = i + ForesterConstants.ID_MAP_FILE_SUFFIX;
-            }
-            else {
-                print_help();
-                System.exit( -1 );
-            }
-            final List<String> allowed_options = new ArrayList<>();
-            allowed_options.add( ANNOTATION_OPTION );
-            final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
-            if ( dissallowed_options.length() > 0 ) {
-                ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options );
-            }
-            final File outfile_file = new File( output );
-            final File listfile = new File( list_file );
-            final File input_file = new File( input );
-            final String error1 = ForesterUtil.isWritableFile( outfile_file );
-            if ( !ForesterUtil.isEmpty( error1 ) ) {
-                ForesterUtil.fatalError( PRG_NAME, error1 );
-            }
-            final String error2 = ForesterUtil.isWritableFile( listfile );
-            if ( !ForesterUtil.isEmpty( error2 ) ) {
-                ForesterUtil.fatalError( PRG_NAME, error2 );
-            }
-            final String error3 = ForesterUtil.isReadableFile( input_file );
-            if ( !ForesterUtil.isEmpty( error3 ) ) {
-                ForesterUtil.fatalError( PRG_NAME, error3 );
-            }
-            final boolean fasta_like = ForesterUtil.isLooksLikeFasta( input_file );
-            final Msa.MSA_FORMAT output_format = MSA_FORMAT.FASTA;
-            System.out.println();
-            System.out.println( "Input alignment       : " + input );
-            System.out.println( "Output alignment      : " + output );
-            System.out.println( "Name list             : " + list_file );
-            if ( fasta_like ) {
-                System.out.println( "Input format          : Fasta" );
-            }
-            else {
-                System.out.println( "Input format          : Phylip like" );
-            }
-            if ( output_format == MSA_FORMAT.FASTA ) {
-                System.out.println( "Output format         : Fasta" );
-            }
-            else if ( output_format == MSA_FORMAT.NEXUS ) {
-                System.out.println( "Output format         : Nexus" );
-            }
-            else if ( output_format == MSA_FORMAT.PHYLIP ) {
-                System.out.println( "Output format         : Phylip" );
-            }
-            System.out.println();
-            
-            final List<MolecularSequence> seqs;
-            final FileInputStream is = new FileInputStream( input_file );
-            if ( FastaParser.isLikelyFasta( input_file ) ) {
-                seqs = FastaParser.parse( is );
-            }
-            else {
-                seqs = GeneralMsaParser.parseSeqs( is );
-            }
-            if ( seqs == null ) {
-                ForesterUtil.fatalError( PRG_NAME, "failed to read MSA" );
-            }
-            if ( seqs.size() < 1 ) {
-                ForesterUtil.fatalError( PRG_NAME, "MSA seems to be devoid of sequences" );
-            }
-           // TODO print number of seqs
-           // TODO print number min length
-           // TODO print max length
-           // TODO OR
-          //  TODO print length is aligned
-          //  TODO if no aligned no phylip or nexus outpt
-            //
-           
-            final List<MolecularSequence> seqs2 = new ArrayList<>();
-            int counter = 0;
-            final BufferedWriter writer = ForesterUtil.createBufferedWriter( list_file );
-            for( final MolecularSequence seq : seqs ) {
-                final String new_name = modify_name( seq.getIdentifier(), counter++, writer );
-                final MolecularSequence ns = BasicSequence.createSequence( new_name,
-                                                                           seq.getMolecularSequenceAsString() );
-                seqs2.add( ns );
-            }
-            writer.flush();
-            writer.close();
-            final BufferedWriter seq_writer = ForesterUtil.createBufferedWriter( outfile_file );
-            if ( ( output_format == MSA_FORMAT.NEXUS ) || ( output_format == MSA_FORMAT.PHYLIP ) ) {
-                final Msa m = BasicMsa.createInstance( seqs2 );
-                m.write( seq_writer, output_format );
-            }
-            else if ( output_format == MSA_FORMAT.FASTA ) {
-                SequenceWriter.writeSeqs( seqs2, seq_writer, SEQ_FORMAT.FASTA, 60 );
-            }
-            seq_writer.flush();
-            seq_writer.close();
-            //                    Util.print_message( PRG_NAME, "wrote: " + list_file )
-            //                    Util.print_message( PRG_NAME, "wrote: " + output )
-        }
-        catch ( final IllegalArgumentException e ) {
-            ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
-        }
-        catch ( final Exception e ) {
-            e.printStackTrace();
-            ForesterUtil.fatalError( PRG_NAME, "Unexpected errror!" );
-        }
-    }
-
-    final static String modify_name( final String desc, final int counter, final Writer writer ) throws IOException {
-        desc.replaceAll( "\\s+", " " );
-        final String new_desc = Integer.toHexString( counter );
-        if ( new_desc.length() > 9 ) {
-            ForesterUtil.fatalError( PRG_NAME,
-                                     "shortened identifier [" + new_desc + "] is too long (" + new_desc.length()
-                                             + " characters)" );
-        }
-        writer.write( new_desc + "\t" + desc + "\n" );
-        return new_desc;
-    }
-
-    private final static void print_help() {
-        System.out.println( "Usage:" );
-        System.out.println();
-        System.out.println( PRG_NAME + " [options] <gene tree file> <query>" );
-        System.out.println();
-        System.out.println( " options:" );
-        //System.out.println( "  -" + SEP_OPTION + "=<separator>: the separator to be used" );
-        System.out.println();
-        System.out.println( "Example:" );
-        System.out.println();
-        System.out.println( " " + PRG_NAME + " -s=. my_tree.xml A.1.1.1" );
-        System.out.println();
-    }
-}
index 85fcbf7..64a7b47 100644 (file)
@@ -4,7 +4,7 @@ package org.forester.clade_analysis;
 import java.math.BigDecimal;
 import java.text.DecimalFormat;
 
-final class Prefix {
+public final class Prefix {
 
     private final static DecimalFormat df = new DecimalFormat( "0.0#####" );
     private final String               _prefix;
@@ -12,7 +12,7 @@ final class Prefix {
     private final String               _separator;
     private final String               _first;
 
-    Prefix( final String prefix, final String confidence, final String separator ) {
+    public Prefix( final String prefix, final String confidence, final String separator ) {
         _prefix = prefix;
         _confidence = new BigDecimal( confidence );
         _separator = separator;
@@ -24,7 +24,7 @@ final class Prefix {
         }
     }
 
-    Prefix( final String prefix, final double confidence, final String separator ) {
+    public Prefix( final String prefix, final double confidence, final String separator ) {
         _prefix = prefix;
         _confidence = new BigDecimal( confidence );
         _separator = separator;
@@ -36,15 +36,15 @@ final class Prefix {
         }
     }
 
-    String getPrefix() {
+    public  String getPrefix() { // widened to public; returns the prefix string stored by the constructors
         return _prefix;
     }
 
-    String getPrefixFirstElement() {
+    public  String getPrefixFirstElement() { // widened to public; returns _first. NOTE(review): how _first is derived is outside this hunk - presumably the first element of the prefix, confirm
         return _first;
     }
 
-    double getConfidence() {
+    public double getConfidence() { // widened to public; converts the BigDecimal _confidence to a double
         return _confidence.doubleValue();
     }
 
index 2540cbe..de3084e 100644 (file)
@@ -171,6 +171,12 @@ public class BasicSequence implements MolecularSequence {
                                   .replaceAll( re, Character.toString( repl ) ), type );
     }
     
+    public static MolecularSequence createGeneralSequence( final String identifier, final String mol_sequence ) { // factory for sequences tagged TYPE.GENERAL
+        check( identifier, mol_sequence ); // same up-front argument check that createAaSequence performs
+        return new BasicSequence( identifier, mol_sequence.toUpperCase().replaceAll( "\\.", GAP_STR // upper-case first, then replace every literal '.' with GAP_STR
+                                  ), TYPE.GENERAL );
+    }
+    
     public static MolecularSequence createAaSequence( final String identifier, final String mol_sequence ) {
         check( identifier, mol_sequence );
         return new BasicSequence( identifier, mol_sequence.toUpperCase().replaceAll( "\\.", GAP_STR )
index ae8a91e..d4f9bf3 100644 (file)
@@ -54,6 +54,6 @@ public interface MolecularSequence {
     public abstract TYPE getType();
 
     public enum TYPE {
-        RNA, DNA, AA;
+        RNA, DNA, AA, GENERAL; // GENERAL is the type BasicSequence.createGeneralSequence passes to the BasicSequence constructor
     }
 }
\ No newline at end of file
index 727f30c..ffb6c96 100644 (file)
@@ -27,20 +27,25 @@ package org.forester.util;
 
 public final class ForesterConstants {
 
-    public final static String  FORESTER_VERSION                = "1.045";
-    public final static String  FORESTER_DATE                   = "161214";
-    public final static String  PHYLO_XML_VERSION               = "1.20";
-    public final static String  PHYLO_XML_LOCATION              = "http://www.phyloxml.org";
-    public final static String  PHYLO_XML_XSD                   = "phyloxml.xsd";
-    public final static String  XML_SCHEMA_INSTANCE             = "http://www.w3.org/2001/XMLSchema-instance";
-    public final static String  LOCAL_PHYLOXML_XSD_RESOURCE     = "resources/phyloxml.xsd";
-    public final static String  PHYLO_XML_SUFFIX                = ".xml";
-    public final static String  ID_NORMALIZED_FASTA_FILE_SUFFIX = "_ni.fasta";
-    public final static String  ID_MAP_FILE_SUFFIX              = ".nim";
-    public final static String  UTF_8                           = "UTF-8";
-    public final static String  ISO_8859_1                      = "ISO-8859-1";
-    public final static String  PHYLO_XML_REFERENCE             = "Han MV and Zmasek CM (2009): \"phyloXML: XML for evolutionary biology and comparative genomics\", BMC Bioinformatics 10:356";
-    public final static boolean RELEASE                         = false;
+    public final static String  FORESTER_VERSION                 = "1.046";
+    public final static String  FORESTER_DATE                    = "170830";
+    public final static String  PHYLO_XML_VERSION                = "1.20";
+    public final static String  PHYLO_XML_LOCATION               = "http://www.phyloxml.org";
+    public final static String  PHYLO_XML_XSD                    = "phyloxml.xsd";
+    public final static String  XML_SCHEMA_INSTANCE              = "http://www.w3.org/2001/XMLSchema-instance";
+    public final static String  LOCAL_PHYLOXML_XSD_RESOURCE      = "resources/phyloxml.xsd";
+    public final static String  PHYLO_XML_SUFFIX                 = ".xml";
+    public final static String  ID_NORMALIZED_FASTA_FILE_SUFFIX  = "_ni.fasta";
+    public final static String  ID_NORMALIZED_NEXUS_FILE_SUFFIX  = "_ni.nexus";
+    public final static String  ID_NORMALIZED_PHYLIP_FILE_SUFFIX = "_ni.phylip";
+    public final static String  FASTA_FILE_SUFFIX                = ".fasta";
+    public final static String  NEXUS_FILE_SUFFIX                = ".nexus";
+    public final static String  PHYLIP_FILE_SUFFIX               = ".phylip";
+    public final static String  ID_MAP_FILE_SUFFIX               = ".nim";
+    public final static String  UTF_8                            = "UTF-8";
+    public final static String  ISO_8859_1                       = "ISO-8859-1";
+    public final static String  PHYLO_XML_REFERENCE              = "Han MV and Zmasek CM (2009): \"phyloXML: XML for evolutionary biology and comparative genomics\", BMC Bioinformatics 10:356";
+    public final static boolean RELEASE                          = false;
 
     public enum PhylogeneticTreeFormats {
                                          NH,
index dddbf28..dc89976 100644 (file)
@@ -722,6 +722,10 @@ public final class ForesterUtil {
         }
         return null;
     }
+    
+    final public static String isWritableFile( final String s ) { // overload that accepts a path string instead of a File
+        return isWritableFile( new File( s ) ); // wraps the path in a File and delegates to the File-based isWritableFile, returning its result unchanged
+    }
 
     /**
      * Helper for method "stringToColor".