From: cmzmasek Date: Wed, 30 Aug 2017 23:39:29 +0000 (-0700) Subject: in progress... X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=917fdd32a745f77a8b70d7779e44b8b36e11185f;p=jalview.git in progress... --- diff --git a/forester/java/src/org/forester/application/cladinator.java b/forester/java/src/org/forester/application/cladinator.java index 95322e1..69affd2 100644 --- a/forester/java/src/org/forester/application/cladinator.java +++ b/forester/java/src/org/forester/application/cladinator.java @@ -30,12 +30,11 @@ import java.io.IOException; import java.text.DecimalFormat; import java.util.ArrayList; import java.util.List; +import java.util.regex.Pattern; import org.forester.clade_analysis.AnalysisMulti; -import org.forester.clade_analysis.AnalysisSingle; import org.forester.clade_analysis.Prefix; import org.forester.clade_analysis.ResultMulti; -import org.forester.clade_analysis.ResultSingle; import org.forester.io.parsers.PhylogenyParser; import org.forester.io.parsers.util.ParserUtils; import org.forester.phylogeny.Phylogeny; @@ -78,14 +77,14 @@ public final class cladinator { print_help(); System.exit( 0 ); } - else if ( ( args.length != 2 && args.length != 3 ) ) { + else if ( ( ( args.length != 2 ) && ( args.length != 3 ) ) ) { System.out.println(); System.out.println( "Wrong number of arguments." ); System.out.println(); print_help(); System.exit( -1 ); } - final List allowed_options = new ArrayList(); + final List allowed_options = new ArrayList<>(); allowed_options.add( SEP_OPTION ); final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); if ( dissallowed_options.length() > 0 ) { @@ -118,104 +117,44 @@ public final class cladinator { System.out.println( "\nCould not read \"" + intreefile + "\" [" + e.getMessage() + "]\n" ); System.exit( -1 ); } - - final ResultMulti res = AnalysisMulti.execute( p, query, separator, 0.5 ); - + final Pattern pattern = Pattern.compile( query ); + final ResultMulti res = AnalysisMulti.execute( p, pattern, separator, 0.5 ); System.out.println(); System.out.println( "Result:" ); System.out.println( "Query : " + query ); - /////////////////// - - - System.out.println( "Collapsed:" ); - - for( final Prefix prefix : res.getCollapsedMultiHitPrefixes() ) { - System.out.println( prefix ); - } - if ( _has_specifics ) { - - System.out.println( "Specifics:" ); - - for( final Prefix prefix : _cleaned_spec ) { - System.out.println( prefix ); - - } - - System.out.println( "Collapsed With Specifics:" ); - - for( final Prefix prefix : _collapsed ) { - System.out.println( prefix ); - - for( final Prefix spec : _cleaned_spec ) { - if ( spec.getPrefix().startsWith( prefix.getPrefix() ) ) { - System.out.println( " " + spec ); - - } - } - } - } - if ( !ForesterUtil.isEmpty( _all_down ) ) { - - System.out.println( "Collapsed Down:" ); - - for( final Prefix prefix : _collapsed_down ) { - System.out.println( prefix ); - - } - - } - if ( !ForesterUtil.isEmpty( _all_up ) ) { - - - System.out.println( "Collapsed Up:" ); - - for( final Prefix prefix : _collapsed_up ) { - System.out.println( prefix ); - - } - - } - - /////////////////// - - - System.out.print( "Greatest Common Prefix : " + res.getGreatestCommonPrefix() ); - if ( !ForesterUtil.isEmpty( res.getGreatestCommonPrefix() ) - && !ForesterUtil.isEmpty( res.getGreatestCommonCladeSubtreeConfidence() ) ) { - System.out.println( "\t(" + res.getGreatestCommonCladeSubtreeConfidence() + ")" ); - } - else { - System.out.println(); - } - System.out.print( "Greatest Common Prefix Up : " + res.getGreatestCommonPrefixUp() ); - if ( !ForesterUtil.isEmpty( res.getGreatestCommonPrefixUp() ) - && !ForesterUtil.isEmpty( res.getGreatestCommonCladeUpSubtreeConfidence() ) ) { - System.out.println( "\t(" + res.getGreatestCommonCladeUpSubtreeConfidence() + ")" ); - } - else { - System.out.println(); + for( final Prefix prefix : res.getCollapsedMultiHitPrefixes() ) { + System.out.println( prefix ); } - System.out.print( "Greatest Common Prefix Down : " + res.getGreatestCommonPrefixDown() ); - if ( !ForesterUtil.isEmpty( res.getGreatestCommonPrefixDown() ) - && !ForesterUtil.isEmpty( res.getGreatestCommonCladeDownSubtreeConfidence() ) ) { - System.out.println( "\t(" + res.getGreatestCommonCladeDownSubtreeConfidence() + ")" ); + if ( res.isHasSpecificMultiHitsPrefixes() ) { + System.out.println( "Specifics:" ); + for( final Prefix prefix : res.getSpecificMultiHitPrefixes() ) { + System.out.println( prefix ); + } + System.out.println( "Collapsed With Specifics:" ); + for( final Prefix prefix : res.getCollapsedMultiHitPrefixes() ) { + System.out.println( prefix ); + for( final Prefix spec : res.getSpecificMultiHitPrefixes() ) { + if ( spec.getPrefix().startsWith( prefix.getPrefix() ) ) { + System.out.println( " " + spec ); + } + } + } } - else { - System.out.println(); + if ( !ForesterUtil.isEmpty( res.getAllMultiHitPrefixesDown() ) ) { + System.out.println( "Collapsed Down:" ); + for( final Prefix prefix : res.getCollapsedMultiHitPrefixesDown() ) { + System.out.println( prefix ); + } } - System.out.println( "Least Encompassing Clade size: " + res.getLeastEncompassingCladeSize() - + " external nodes" ); - final double lec_ratio = ( 100.0 * res.getLeastEncompassingCladeSize() ) / res.getTreeSize(); - System.out.println( "Least Encompassing Clade size: " + df2.format( lec_ratio ) + "%" ); - System.out.println( "Total tree size : " + res.getTreeSize() + " external nodes" ); - if ( res.getWarnings().size() > 0 ) { - System.out.println( "Warnings:" ); - for( final String s : res.getWarnings() ) { - System.out.println( s ); + if ( !ForesterUtil.isEmpty( res.getAllMultiHitPrefixesUp() ) ) { + System.out.println( "Collapsed Up:" ); + for( final Prefix prefix : res.getAllMultiHitPrefixesUp() ) { + System.out.println( prefix ); } } + /////////////////// System.out.println(); } catch ( final IllegalArgumentException e ) { diff --git a/forester/java/src/org/forester/application/serin.java b/forester/java/src/org/forester/application/serin.java new file mode 100644 index 0000000..a9ee157 --- /dev/null +++ b/forester/java/src/org/forester/application/serin.java @@ -0,0 +1,343 @@ + +package org.forester.application; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.List; + +import org.forester.io.parsers.FastaParser; +import org.forester.io.parsers.GeneralMsaParser; +import org.forester.io.writers.SequenceWriter; +import org.forester.io.writers.SequenceWriter.SEQ_FORMAT; +import org.forester.msa.BasicMsa; +import org.forester.msa.Msa; +import org.forester.msa.Msa.MSA_FORMAT; +import org.forester.sequence.BasicSequence; +import org.forester.sequence.MolecularSequence; +import org.forester.util.BasicDescriptiveStatistics; +import org.forester.util.CommandLineArguments; +import org.forester.util.ForesterConstants; +import org.forester.util.ForesterUtil; + +public class serin { + + final static private String PRG_NAME = "serin"; + final static private String PRG_DATE = "170830"; + final static private String PRG_DESC = "sequence file reformatting and identifier normalization"; + final static private String PRG_VERSION = "1.00"; + final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"; + final static private String E_MAIL = "phyloxml@gmail.com"; + final static private String OUTPUT_FORMAT_OPTION = "o"; + final static private String ID_NORM_OPTION = "i"; + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + private static final String OUTPUT_FORMAT_FASTA = "f"; + private static final String OUTPUT_FORMAT_PHYLIP = "p"; + private static final String OUTPUT_FORMAT_NEXUS = "n"; + private static final String OUTPUT_FORMAT_FASTA_L = "fasta"; + private static final String OUTPUT_FORMAT_PHYLIP_L = "phylip"; + private static final String OUTPUT_FORMAT_NEXUS_L = "nexus"; + + public static void main( final String args[] ) { + try { + ForesterUtil.printProgramInformation( PRG_NAME, + PRG_DESC, + PRG_VERSION, + PRG_DATE, + E_MAIL, + WWW, + ForesterUtil.getForesterLibraryInformation() ); + CommandLineArguments cla = null; + try { + cla = new CommandLineArguments( args ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + } + if ( ( cla.getNumberOfNames() == 0 ) || cla.isOptionSet( HELP_OPTION_1 ) + || cla.isOptionSet( HELP_OPTION_2 ) ) { + System.out.println(); + print_help(); + System.exit( 0 ); + } + String input_seqs_file_str = null; + String output_seqs_file_str = null; + String output_map_file_str = null; + String input_seqs_name_wo_suffix = null; + if ( ( cla.getNumberOfNames() == 2 ) || ( cla.getNumberOfNames() == 3 ) ) { + input_seqs_file_str = cla.getName( 0 ); + output_seqs_file_str = cla.getName( 1 ); + if ( cla.getNumberOfNames() == 3 ) { + output_map_file_str = cla.getName( 2 ); + } + } + else if ( cla.getNumberOfNames() == 1 ) { + input_seqs_file_str = cla.getName( 0 ); + input_seqs_name_wo_suffix = null; + if ( input_seqs_file_str.toLowerCase().endsWith( ".fasta" ) ) { + input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 6 ); + } + else if ( input_seqs_file_str.toLowerCase().endsWith( ".fsa" ) ) { + input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 4 ); + } + else if ( input_seqs_file_str.toLowerCase().endsWith( ".phy" ) ) { + input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 4 ); + } + else if ( input_seqs_file_str.toLowerCase().endsWith( ".aln" ) ) { + input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 4 ); + } + else if ( input_seqs_file_str.toLowerCase().endsWith( ".phylip" ) ) { + input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 7 ); + } + else if ( input_seqs_file_str.toLowerCase().endsWith( ".nex" ) ) { + input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 4 ); + } + else if ( input_seqs_file_str.toLowerCase().endsWith( ".nexus" ) ) { + input_seqs_name_wo_suffix = input_seqs_file_str.substring( 0, input_seqs_file_str.length() - 5 ); + } + else { + input_seqs_name_wo_suffix = input_seqs_file_str; + } + output_map_file_str = input_seqs_name_wo_suffix + ForesterConstants.ID_MAP_FILE_SUFFIX; + } + else { + print_help(); + System.exit( -1 ); + } + final List allowed_options = new ArrayList<>(); + allowed_options.add( OUTPUT_FORMAT_OPTION ); + allowed_options.add( ID_NORM_OPTION ); + final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); + if ( dissallowed_options.length() > 0 ) { + ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options ); + } + final File input_seqs_file = new File( input_seqs_file_str ); + final String error0 = ForesterUtil.isReadableFile( input_seqs_file ); + if ( !ForesterUtil.isEmpty( error0 ) ) { + ForesterUtil.fatalError( PRG_NAME, error0 ); + } + final boolean input_seqs_fasta_like = ForesterUtil.isLooksLikeFasta( input_seqs_file ); + Msa.MSA_FORMAT output_format = MSA_FORMAT.FASTA; + if ( cla.isOptionSet( OUTPUT_FORMAT_OPTION ) ) { + if ( cla.isOptionValueSet( OUTPUT_FORMAT_OPTION ) ) { + final String output_format_str = cla.getOptionValue( OUTPUT_FORMAT_OPTION ); + if ( output_format_str.equals( OUTPUT_FORMAT_FASTA ) + || output_format_str.equalsIgnoreCase( OUTPUT_FORMAT_FASTA_L ) ) { + output_format = MSA_FORMAT.FASTA; + } + else if ( output_format_str.equals( OUTPUT_FORMAT_PHYLIP ) + || output_format_str.equalsIgnoreCase( OUTPUT_FORMAT_PHYLIP_L ) ) { + output_format = MSA_FORMAT.PHYLIP; + } + else if ( output_format_str.equals( OUTPUT_FORMAT_NEXUS ) + || output_format_str.equalsIgnoreCase( OUTPUT_FORMAT_NEXUS_L ) ) { + output_format = MSA_FORMAT.NEXUS; + } + else { + ForesterUtil.fatalError( PRG_NAME, "unknown format option: " + output_format_str ); + } + } + } + final boolean normalize_identifiers; + if ( cla.isOptionSet( ID_NORM_OPTION ) || ( cla.getNumberOfNames() == 3 ) ) { + normalize_identifiers = true; + } + else { + normalize_identifiers = false; + } + if ( normalize_identifiers && ForesterUtil.isEmpty( output_map_file_str ) ) { + ForesterUtil.fatalError( PRG_NAME, "need to indicate name for output map file" ); + } + final File output_map_file; + if ( normalize_identifiers ) { + output_map_file = new File( output_map_file_str ); + final String error = ForesterUtil.isWritableFile( output_map_file ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( PRG_NAME, error ); + } + } + else { + output_map_file = null; + } + if ( cla.getNumberOfNames() == 1 ) { + if ( normalize_identifiers ) { + if ( output_format == MSA_FORMAT.FASTA ) { + output_seqs_file_str = input_seqs_name_wo_suffix + + ForesterConstants.ID_NORMALIZED_FASTA_FILE_SUFFIX; + } + else if ( output_format == MSA_FORMAT.NEXUS ) { + output_seqs_file_str = input_seqs_name_wo_suffix + + ForesterConstants.ID_NORMALIZED_NEXUS_FILE_SUFFIX; + } + else if ( output_format == MSA_FORMAT.PHYLIP ) { + output_seqs_file_str = input_seqs_name_wo_suffix + + ForesterConstants.ID_NORMALIZED_PHYLIP_FILE_SUFFIX; + } + } + else { + if ( output_format == MSA_FORMAT.FASTA ) { + output_seqs_file_str = input_seqs_name_wo_suffix + ForesterConstants.FASTA_FILE_SUFFIX; + if ( ForesterUtil.isWritableFile( output_seqs_file_str ) != null ) { + output_seqs_file_str = input_seqs_name_wo_suffix + "_" + + ForesterConstants.FASTA_FILE_SUFFIX; + } + } + else if ( output_format == MSA_FORMAT.NEXUS ) { + output_seqs_file_str = input_seqs_name_wo_suffix + ForesterConstants.NEXUS_FILE_SUFFIX; + if ( ForesterUtil.isWritableFile( output_seqs_file_str ) != null ) { + output_seqs_file_str = input_seqs_name_wo_suffix + "_" + + ForesterConstants.NEXUS_FILE_SUFFIX; + } + } + else if ( output_format == MSA_FORMAT.PHYLIP ) { + output_seqs_file_str = input_seqs_name_wo_suffix + ForesterConstants.PHYLIP_FILE_SUFFIX; + if ( ForesterUtil.isWritableFile( output_seqs_file_str ) != null ) { + output_seqs_file_str = input_seqs_name_wo_suffix + "_" + + ForesterConstants.PHYLIP_FILE_SUFFIX; + } + } + } + } + final File outfile_seqs_file = new File( output_seqs_file_str ); + final String error1 = ForesterUtil.isWritableFile( outfile_seqs_file ); + if ( !ForesterUtil.isEmpty( error1 ) ) { + ForesterUtil.fatalError( PRG_NAME, error1 ); + } + System.out.println(); + if ( input_seqs_fasta_like ) { + System.out.println( "Input format : Fasta" ); + } + else { + System.out.println( "Input format : Phylip like" ); + } + System.out.println( "Input file : " + input_seqs_file_str ); + if ( output_format == MSA_FORMAT.FASTA ) { + System.out.println( "Output format : Fasta" ); + } + else if ( output_format == MSA_FORMAT.NEXUS ) { + System.out.println( "Output format : Nexus" ); + } + else if ( output_format == MSA_FORMAT.PHYLIP ) { + System.out.println( "Output format : Phylip" ); + } + System.out.println( "Output file : " + output_seqs_file_str ); + System.out.println( "Shorten names : " + normalize_identifiers ); + if ( normalize_identifiers ) { + System.out.println( "Identifier map : " + output_map_file_str ); + } + final List input_seqs; + final FileInputStream is = new FileInputStream( input_seqs_file ); + if ( FastaParser.isLikelyFasta( input_seqs_file ) ) { + input_seqs = FastaParser.parse( is ); + } + else { + input_seqs = GeneralMsaParser.parseSeqs( is ); + } + if ( input_seqs == null ) { + ForesterUtil.fatalError( PRG_NAME, "failed to read input sequences" ); + } + if ( input_seqs.size() < 1 ) { + ForesterUtil.fatalError( PRG_NAME, "input seems to be devoid of sequences" ); + } + final BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( final MolecularSequence seq : input_seqs ) { + stats.addValue( seq.getLength() ); + } + System.out.println( "Number of sequences : " + input_seqs.size() ); + if ( !ForesterUtil.isEqual( stats.getMin(), stats.getMax() ) ) { + System.out.println( "Sequence lenght min : " + ( int ) stats.getMin() ); + System.out.println( "Sequence lenght max : " + ( int ) stats.getMax() ); + if ( input_seqs.size() > 2 ) { + System.out.println( "Sequence lenght median: " + ( int ) stats.median() ); + } + if ( ( output_format == MSA_FORMAT.NEXUS ) || ( output_format == MSA_FORMAT.PHYLIP ) ) { + ForesterUtil.fatalError( PRG_NAME, + "Input is not an alignment, cannot write in Nexus or Phylip format" ); + } + } + else { + System.out.println( "Alignment length : " + ( int ) stats.getMax() ); + } + final List output_seqs = new ArrayList<>(); + int counter = 0; + final BufferedWriter output_map_writer; + if ( normalize_identifiers ) { + output_map_writer = ForesterUtil.createBufferedWriter( output_map_file_str ); + } + else { + output_map_writer = null; + } + for( final MolecularSequence seq : input_seqs ) { + final String new_name; + if ( normalize_identifiers ) { + new_name = modify_name( seq.getIdentifier(), counter++, output_map_writer ); + } + else { + new_name = seq.getIdentifier(); + } + final MolecularSequence ns = BasicSequence.createGeneralSequence( new_name, + seq.getMolecularSequenceAsString() ); + output_seqs.add( ns ); + } + if ( normalize_identifiers ) { + output_map_writer.flush(); + output_map_writer.close(); + System.out.println(); + System.out.println( "Wrote : " + output_map_file ); + } + final BufferedWriter seq_writer = ForesterUtil.createBufferedWriter( outfile_seqs_file ); + if ( ( output_format == MSA_FORMAT.NEXUS ) || ( output_format == MSA_FORMAT.PHYLIP ) ) { + final Msa m = BasicMsa.createInstance( output_seqs ); + m.write( seq_writer, output_format ); + } + else if ( output_format == MSA_FORMAT.FASTA ) { + SequenceWriter.writeSeqs( output_seqs, seq_writer, SEQ_FORMAT.FASTA, 60 ); + } + seq_writer.flush(); + seq_writer.close(); + System.out.println( "Wrote : " + outfile_seqs_file ); + System.out.println(); + } + catch ( final IllegalArgumentException e ) { + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + } + catch ( final Exception e ) { + e.printStackTrace(); + ForesterUtil.fatalError( PRG_NAME, "Unexpected errror!" ); + } + } + + final static String modify_name( final String desc, final int counter, final Writer writer ) throws IOException { + desc.replaceAll( "\\s+", " " ); + final String new_desc = Integer.toHexString( counter ); + if ( new_desc.length() > 9 ) { + ForesterUtil.fatalError( PRG_NAME, + "shortened identifier [" + new_desc + "] is too long (" + new_desc.length() + + " characters)" ); + } + writer.write( new_desc + "\t" + desc + "\n" ); + return new_desc; + } + + private final static void print_help() { + System.out.println( "Usage:" ); + System.out.println(); + System.out.println( PRG_NAME + " [options] [output sequences file] [output map file]" ); + System.out.println(); + System.out.println( " options:" ); + System.out.println( " -" + OUTPUT_FORMAT_OPTION + "=: output format: " + OUTPUT_FORMAT_FASTA_L + " or " + + OUTPUT_FORMAT_FASTA + " for Fasta (default), " + OUTPUT_FORMAT_PHYLIP_L + " or " + + OUTPUT_FORMAT_PHYLIP + " for Phylip, " + OUTPUT_FORMAT_NEXUS_L + " or " + OUTPUT_FORMAT_NEXUS + + " for Nexus" ); + System.out.println( " -" + ID_NORM_OPTION + ": to replace sequence names with short(er) identifiers" ); + System.out.println(); + System.out.println( "Example:" ); + System.out.println(); + System.out.println( " " + PRG_NAME + " -i -o=p my_seqs.fasta" ); + System.out.println(); + } +} diff --git a/forester/java/src/org/forester/application/tap.java b/forester/java/src/org/forester/application/tap.java deleted file mode 100644 index 2fcf2cc..0000000 --- a/forester/java/src/org/forester/application/tap.java +++ /dev/null @@ -1,211 +0,0 @@ - -package org.forester.application; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.Writer; -import java.util.ArrayList; -import java.util.List; - -import org.forester.io.parsers.FastaParser; -import org.forester.io.parsers.GeneralMsaParser; -import org.forester.io.writers.SequenceWriter; -import org.forester.io.writers.SequenceWriter.SEQ_FORMAT; -import org.forester.msa.BasicMsa; -import org.forester.msa.Msa; -import org.forester.msa.Msa.MSA_FORMAT; -import org.forester.sequence.BasicSequence; -import org.forester.sequence.MolecularSequence; -import org.forester.util.CommandLineArguments; -import org.forester.util.ForesterConstants; -import org.forester.util.ForesterUtil; - -public class tap { - - final static private String PRG_NAME = "tap"; - final static private String PRG_DATE = "170327"; - final static private String PRG_DESC = "Replacement of labels in multiple sequence files"; - final static private String PRG_VERSION = "1.00"; - final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"; - final static private String E_MAIL = "phyloxml@gmail.com"; - final static private String EXTRACT_TAXONOMY_OPTION = "t"; - final static private String ANNOTATION_OPTION = "a"; - final static private String HELP_OPTION_1 = "help"; - final static private String HELP_OPTION_2 = "h"; - - public static void main( final String args[] ) { - try { - ForesterUtil.printProgramInformation( PRG_NAME, - PRG_DESC, - PRG_VERSION, - PRG_DATE, - E_MAIL, - WWW, - ForesterUtil.getForesterLibraryInformation() ); - CommandLineArguments cla = null; - try { - cla = new CommandLineArguments( args ); - } - catch ( final Exception e ) { - ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); - } - if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) ) { - System.out.println(); - print_help(); - System.exit( 0 ); - } - String input = null; - String output = null; - String list_file = null; - String i = null; - if ( args.length == 3 ) { - input = cla.getName( 0 ); - output = cla.getName( 1 ); - list_file = cla.getName( 2 ); - } - else if ( args.length == 1 ) { - input = cla.getName( 0 ); - i = null; - if ( input.toLowerCase().endsWith( ".fasta" ) ) { - i = input.substring( 0, input.length() - 7 ); - } - else if ( input.toLowerCase().endsWith( ".fsa" ) ) { - i = input.substring( 0, input.length() - 5 ); - } - else { - i = input; - } - output = i + ForesterConstants.ID_NORMALIZED_FASTA_FILE_SUFFIX; - list_file = i + ForesterConstants.ID_MAP_FILE_SUFFIX; - } - else { - print_help(); - System.exit( -1 ); - } - final List allowed_options = new ArrayList<>(); - allowed_options.add( ANNOTATION_OPTION ); - final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); - if ( dissallowed_options.length() > 0 ) { - ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options ); - } - final File outfile_file = new File( output ); - final File listfile = new File( list_file ); - final File input_file = new File( input ); - final String error1 = ForesterUtil.isWritableFile( outfile_file ); - if ( !ForesterUtil.isEmpty( error1 ) ) { - ForesterUtil.fatalError( PRG_NAME, error1 ); - } - final String error2 = ForesterUtil.isWritableFile( listfile ); - if ( !ForesterUtil.isEmpty( error2 ) ) { - ForesterUtil.fatalError( PRG_NAME, error2 ); - } - final String error3 = ForesterUtil.isReadableFile( input_file ); - if ( !ForesterUtil.isEmpty( error3 ) ) { - ForesterUtil.fatalError( PRG_NAME, error3 ); - } - final boolean fasta_like = ForesterUtil.isLooksLikeFasta( input_file ); - final Msa.MSA_FORMAT output_format = MSA_FORMAT.FASTA; - System.out.println(); - System.out.println( "Input alignment : " + input ); - System.out.println( "Output alignment : " + output ); - System.out.println( "Name list : " + list_file ); - if ( fasta_like ) { - System.out.println( "Input format : Fasta" ); - } - else { - System.out.println( "Input format : Phylip like" ); - } - if ( output_format == MSA_FORMAT.FASTA ) { - System.out.println( "Output format : Fasta" ); - } - else if ( output_format == MSA_FORMAT.NEXUS ) { - System.out.println( "Output format : Nexus" ); - } - else if ( output_format == MSA_FORMAT.PHYLIP ) { - System.out.println( "Output format : Phylip" ); - } - System.out.println(); - - final List seqs; - final FileInputStream is = new FileInputStream( input_file ); - if ( FastaParser.isLikelyFasta( input_file ) ) { - seqs = FastaParser.parse( is ); - } - else { - seqs = GeneralMsaParser.parseSeqs( is ); - } - if ( seqs == null ) { - ForesterUtil.fatalError( PRG_NAME, "failed to read MSA" ); - } - if ( seqs.size() < 1 ) { - ForesterUtil.fatalError( PRG_NAME, "MSA seems to be devoid of sequences" ); - } - // TODO print number of seqs - // TODO print number min length - // TODO print max length - // TODO OR - // TODO print length is aligned - // TODO if no aligned no phylip or nexus outpt - // - - final List seqs2 = new ArrayList<>(); - int counter = 0; - final BufferedWriter writer = ForesterUtil.createBufferedWriter( list_file ); - for( final MolecularSequence seq : seqs ) { - final String new_name = modify_name( seq.getIdentifier(), counter++, writer ); - final MolecularSequence ns = BasicSequence.createSequence( new_name, - seq.getMolecularSequenceAsString() ); - seqs2.add( ns ); - } - writer.flush(); - writer.close(); - final BufferedWriter seq_writer = ForesterUtil.createBufferedWriter( outfile_file ); - if ( ( output_format == MSA_FORMAT.NEXUS ) || ( output_format == MSA_FORMAT.PHYLIP ) ) { - final Msa m = BasicMsa.createInstance( seqs2 ); - m.write( seq_writer, output_format ); - } - else if ( output_format == MSA_FORMAT.FASTA ) { - SequenceWriter.writeSeqs( seqs2, seq_writer, SEQ_FORMAT.FASTA, 60 ); - } - seq_writer.flush(); - seq_writer.close(); - // Util.print_message( PRG_NAME, "wrote: " + list_file ) - // Util.print_message( PRG_NAME, "wrote: " + output ) - } - catch ( final IllegalArgumentException e ) { - ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); - } - catch ( final Exception e ) { - e.printStackTrace(); - ForesterUtil.fatalError( PRG_NAME, "Unexpected errror!" ); - } - } - - final static String modify_name( final String desc, final int counter, final Writer writer ) throws IOException { - desc.replaceAll( "\\s+", " " ); - final String new_desc = Integer.toHexString( counter ); - if ( new_desc.length() > 9 ) { - ForesterUtil.fatalError( PRG_NAME, - "shortened identifier [" + new_desc + "] is too long (" + new_desc.length() - + " characters)" ); - } - writer.write( new_desc + "\t" + desc + "\n" ); - return new_desc; - } - - private final static void print_help() { - System.out.println( "Usage:" ); - System.out.println(); - System.out.println( PRG_NAME + " [options] " ); - System.out.println(); - System.out.println( " options:" ); - //System.out.println( " -" + SEP_OPTION + "=: the separator to be used" ); - System.out.println(); - System.out.println( "Example:" ); - System.out.println(); - System.out.println( " " + PRG_NAME + " -s=. my_tree.xml A.1.1.1" ); - System.out.println(); - } -} diff --git a/forester/java/src/org/forester/clade_analysis/Prefix.java b/forester/java/src/org/forester/clade_analysis/Prefix.java index 85fcbf7..64a7b47 100644 --- a/forester/java/src/org/forester/clade_analysis/Prefix.java +++ b/forester/java/src/org/forester/clade_analysis/Prefix.java @@ -4,7 +4,7 @@ package org.forester.clade_analysis; import java.math.BigDecimal; import java.text.DecimalFormat; -final class Prefix { +public final class Prefix { private final static DecimalFormat df = new DecimalFormat( "0.0#####" ); private final String _prefix; @@ -12,7 +12,7 @@ final class Prefix { private final String _separator; private final String _first; - Prefix( final String prefix, final String confidence, final String separator ) { + public Prefix( final String prefix, final String confidence, final String separator ) { _prefix = prefix; _confidence = new BigDecimal( confidence ); _separator = separator; @@ -24,7 +24,7 @@ final class Prefix { } } - Prefix( final String prefix, final double confidence, final String separator ) { + public Prefix( final String prefix, final double confidence, final String separator ) { _prefix = prefix; _confidence = new BigDecimal( confidence ); _separator = separator; @@ -36,15 +36,15 @@ final class Prefix { } } - String getPrefix() { + public String getPrefix() { return _prefix; } - String getPrefixFirstElement() { + public String getPrefixFirstElement() { return _first; } - double getConfidence() { + public double getConfidence() { return _confidence.doubleValue(); } diff --git a/forester/java/src/org/forester/sequence/BasicSequence.java b/forester/java/src/org/forester/sequence/BasicSequence.java index 2540cbe..de3084e 100644 --- a/forester/java/src/org/forester/sequence/BasicSequence.java +++ b/forester/java/src/org/forester/sequence/BasicSequence.java @@ -171,6 +171,12 @@ public class BasicSequence implements MolecularSequence { .replaceAll( re, Character.toString( repl ) ), type ); } + public static MolecularSequence createGeneralSequence( final String identifier, final String mol_sequence ) { + check( identifier, mol_sequence ); + return new BasicSequence( identifier, mol_sequence.toUpperCase().replaceAll( "\\.", GAP_STR + ), TYPE.GENERAL ); + } + public static MolecularSequence createAaSequence( final String identifier, final String mol_sequence ) { check( identifier, mol_sequence ); return new BasicSequence( identifier, mol_sequence.toUpperCase().replaceAll( "\\.", GAP_STR ) diff --git a/forester/java/src/org/forester/sequence/MolecularSequence.java b/forester/java/src/org/forester/sequence/MolecularSequence.java index ae8a91e..d4f9bf3 100644 --- a/forester/java/src/org/forester/sequence/MolecularSequence.java +++ b/forester/java/src/org/forester/sequence/MolecularSequence.java @@ -54,6 +54,6 @@ public interface MolecularSequence { public abstract TYPE getType(); public enum TYPE { - RNA, DNA, AA; + RNA, DNA, AA, GENERAL; } } \ No newline at end of file diff --git a/forester/java/src/org/forester/util/ForesterConstants.java b/forester/java/src/org/forester/util/ForesterConstants.java index 727f30c..ffb6c96 100644 --- a/forester/java/src/org/forester/util/ForesterConstants.java +++ b/forester/java/src/org/forester/util/ForesterConstants.java @@ -27,20 +27,25 @@ package org.forester.util; public final class ForesterConstants { - public final static String FORESTER_VERSION = "1.045"; - public final static String FORESTER_DATE = "161214"; - public final static String PHYLO_XML_VERSION = "1.20"; - public final static String PHYLO_XML_LOCATION = "http://www.phyloxml.org"; - public final static String PHYLO_XML_XSD = "phyloxml.xsd"; - public final static String XML_SCHEMA_INSTANCE = "http://www.w3.org/2001/XMLSchema-instance"; - public final static String LOCAL_PHYLOXML_XSD_RESOURCE = "resources/phyloxml.xsd"; - public final static String PHYLO_XML_SUFFIX = ".xml"; - public final static String ID_NORMALIZED_FASTA_FILE_SUFFIX = "_ni.fasta"; - public final static String ID_MAP_FILE_SUFFIX = ".nim"; - public final static String UTF_8 = "UTF-8"; - public final static String ISO_8859_1 = "ISO-8859-1"; - public final static String PHYLO_XML_REFERENCE = "Han MV and Zmasek CM (2009): \"phyloXML: XML for evolutionary biology and comparative genomics\", BMC Bioinformatics 10:356"; - public final static boolean RELEASE = false; + public final static String FORESTER_VERSION = "1.046"; + public final static String FORESTER_DATE = "170830"; + public final static String PHYLO_XML_VERSION = "1.20"; + public final static String PHYLO_XML_LOCATION = "http://www.phyloxml.org"; + public final static String PHYLO_XML_XSD = "phyloxml.xsd"; + public final static String XML_SCHEMA_INSTANCE = "http://www.w3.org/2001/XMLSchema-instance"; + public final static String LOCAL_PHYLOXML_XSD_RESOURCE = "resources/phyloxml.xsd"; + public final static String PHYLO_XML_SUFFIX = ".xml"; + public final static String ID_NORMALIZED_FASTA_FILE_SUFFIX = "_ni.fasta"; + public final static String ID_NORMALIZED_NEXUS_FILE_SUFFIX = "_ni.nexus"; + public final static String ID_NORMALIZED_PHYLIP_FILE_SUFFIX = "_ni.phylip"; + public final static String FASTA_FILE_SUFFIX = ".fasta"; + public final static String NEXUS_FILE_SUFFIX = ".nexus"; + public final static String PHYLIP_FILE_SUFFIX = ".phylip"; + public final static String ID_MAP_FILE_SUFFIX = ".nim"; + public final static String UTF_8 = "UTF-8"; + public final static String ISO_8859_1 = "ISO-8859-1"; + public final static String PHYLO_XML_REFERENCE = "Han MV and Zmasek CM (2009): \"phyloXML: XML for evolutionary biology and comparative genomics\", BMC Bioinformatics 10:356"; + public final static boolean RELEASE = false; public enum PhylogeneticTreeFormats { NH, diff --git a/forester/java/src/org/forester/util/ForesterUtil.java b/forester/java/src/org/forester/util/ForesterUtil.java index dddbf28..dc89976 100644 --- a/forester/java/src/org/forester/util/ForesterUtil.java +++ b/forester/java/src/org/forester/util/ForesterUtil.java @@ -722,6 +722,10 @@ public final class ForesterUtil { } return null; } + + final public static String isWritableFile( final String s ) { + return isWritableFile( new File( s ) ); + } /** * Helper for method "stringToColor".