From e49ac25b5d578ea326900b7b56495ee33c3cb92c Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Mon, 19 May 2014 23:59:47 +0000 Subject: [PATCH] inprogress --- .../org/forester/application/msa_compactor.java | 160 ++++++++++++-------- forester/java/src/org/forester/msa/BasicMsa.java | 34 ++++- forester/java/src/org/forester/msa/Msa.java | 2 +- .../org/forester/msa_compactor/MsaCompactor.java | 24 ++- forester/java/src/org/forester/test/Test.java | 4 +- 5 files changed, 154 insertions(+), 70 deletions(-) diff --git a/forester/java/src/org/forester/application/msa_compactor.java b/forester/java/src/org/forester/application/msa_compactor.java index 19d5caf..4e51675 100644 --- a/forester/java/src/org/forester/application/msa_compactor.java +++ b/forester/java/src/org/forester/application/msa_compactor.java @@ -48,9 +48,11 @@ import org.forester.util.ForesterUtil; public class msa_compactor { - final private static NumberFormat NF_1 = new DecimalFormat( "#.0" ); + final private static NumberFormat NF_1 = new DecimalFormat( "0.#" ); + final private static NumberFormat NF_4 = new DecimalFormat( "0.####" ); static { NF_1.setRoundingMode( RoundingMode.HALF_UP ); + NF_4.setRoundingMode( RoundingMode.HALF_UP ); } final static private String HELP_OPTION_1 = "help"; final static private String HELP_OPTION_2 = "h"; @@ -64,7 +66,7 @@ public class msa_compactor { final static private String MIN_LENGTH_OPTION = "ml"; final static private String GAP_RATIO_LENGTH_OPTION = "gr"; final static private String REPORT_ENTROPY = "e"; - final static private String OUTPUT_FORMAT_PHYLIP_OPTION = "p"; + final static private String OUTPUT_FORMAT_OPTION = "f"; final static private String OUTPUT_REMOVED_SEQS_OPTION = "ro"; final static private String MAFFT_OPTIONS = "mo"; final static private String PERFORM_PHYLOGENETIC_INFERENCE = "t"; @@ -118,7 +120,7 @@ public class msa_compactor { allowed_options.add( MIN_LENGTH_OPTION ); allowed_options.add( GAP_RATIO_LENGTH_OPTION ); allowed_options.add( REPORT_ENTROPY ); - allowed_options.add( OUTPUT_FORMAT_PHYLIP_OPTION ); + allowed_options.add( OUTPUT_FORMAT_OPTION ); allowed_options.add( OUTPUT_REMOVED_SEQS_OPTION ); allowed_options.add( MAFFT_OPTIONS ); allowed_options.add( PERFORM_PHYLOGENETIC_INFERENCE ); @@ -137,7 +139,7 @@ public class msa_compactor { final DescriptiveStatistics initial_msa_stats = MsaMethods.calculateEffectiveLengthStatistics( msa ); final boolean chart_only = ( !cla.isOptionSet( LENGTH_OPTION ) ) && ( !cla.isOptionSet( REMOVE_WORST_OFFENDERS_OPTION ) ) - && ( !cla.isOptionSet( AV_GAPINESS_OPTION ) ); + && ( !cla.isOptionSet( AV_GAPINESS_OPTION ) && ( !cla.isOptionSet( MIN_LENGTH_OPTION ) ) ); if ( !chart_only && ( out == null ) ) { ForesterUtil.fatalError( PRG_NAME, "outfile file missing" ); } @@ -175,6 +177,22 @@ public class msa_compactor { + initial_msa_stats.getMin() + ") ]: " + length ); } } + if ( cla.isOptionSet( MIN_LENGTH_OPTION ) ) { + if ( cla.isOptionSet( LENGTH_OPTION ) || cla.isOptionSet( REMOVE_WORST_OFFENDERS_OPTION ) + || cla.isOptionSet( AV_GAPINESS_OPTION ) || cla.isOptionSet( STEP_OPTION ) + || cla.isOptionSet( REALIGN_OPTION ) || cla.isOptionSet( PATH_TO_MAFFT_OPTION ) + || cla.isOptionSet( STEP_FOR_DIAGNOSTICS_OPTION ) || cla.isOptionSet( REPORT_ENTROPY ) + || cla.isOptionSet( OUTPUT_REMOVED_SEQS_OPTION ) + || cla.isOptionSet( PERFORM_PHYLOGENETIC_INFERENCE ) ) { + printHelp(); + System.exit( 0 ); + } + min_length = cla.getOptionValueAsInt( MIN_LENGTH_OPTION ); + if ( ( min_length < 2 ) || ( min_length > initial_msa_stats.getMax() ) ) { + ForesterUtil.fatalError( PRG_NAME, "value for minimal sequence length is out of range: " + + min_length ); + } + } if ( cla.isOptionSet( STEP_OPTION ) ) { step = cla.getOptionValueAsInt( STEP_OPTION ); if ( ( step < 1 ) @@ -202,13 +220,6 @@ public class msa_compactor { + step_for_diagnostics ); } } - if ( cla.isOptionSet( MIN_LENGTH_OPTION ) ) { - min_length = cla.getOptionValueAsInt( MIN_LENGTH_OPTION ); - if ( ( min_length < 2 ) || ( min_length > initial_msa_stats.getMax() ) ) { - ForesterUtil.fatalError( PRG_NAME, "value for minimal sequence length is out of range: " - + min_length ); - } - } if ( cla.isOptionSet( GAP_RATIO_LENGTH_OPTION ) ) { gap_ratio = cla.getOptionValueAsDouble( GAP_RATIO_LENGTH_OPTION ); if ( ( gap_ratio < 0 ) || ( gap_ratio > 1 ) ) { @@ -218,8 +229,20 @@ public class msa_compactor { if ( cla.isOptionSet( REPORT_ENTROPY ) ) { report_entropy = true; } - if ( cla.isOptionSet( OUTPUT_FORMAT_PHYLIP_OPTION ) ) { - output_format = MSA_FORMAT.PHYLIP; + if ( cla.isOptionSet( OUTPUT_FORMAT_OPTION ) ) { + final String fs = cla.getOptionValueAsCleanString( OUTPUT_FORMAT_OPTION ); + if ( fs.equalsIgnoreCase( "p" ) ) { + output_format = MSA_FORMAT.PHYLIP; + } + else if ( fs.equalsIgnoreCase( "f" ) ) { + output_format = MSA_FORMAT.FASTA; + } + else if ( fs.equalsIgnoreCase( "n" ) ) { + output_format = MSA_FORMAT.NEXUS; + } + else { + ForesterUtil.fatalError( PRG_NAME, "illegal or empty output format option: " + fs ); + } } if ( cla.isOptionSet( OUTPUT_REMOVED_SEQS_OPTION ) ) { final String s = cla.getOptionValueAsCleanString( OUTPUT_REMOVED_SEQS_OPTION ); @@ -276,6 +299,10 @@ public class msa_compactor { + NF_1.format( initial_msa_stats.arithmeticMean() ) ); System.out.println( " Max sequence length : " + ( ( int ) initial_msa_stats.getMax() ) ); System.out.println( " Min sequence length : " + ( ( int ) initial_msa_stats.getMin() ) ); + System.out.println( " Gap ratio : " + + NF_4.format( MsaMethods.calcGapRatio( msa ) ) ); + System.out.println( " Normalized Shannon Entropy (entn21): " + + NF_4.format( MsaMethods.calcNormalizedShannonsEntropy( 21, msa ) ) ); if ( !chart_only ) { System.out.println( "Output : " + out ); } @@ -301,69 +328,80 @@ public class msa_compactor { System.out.println( "Maximum allowed gap ratio per column : " + gap_ratio ); } if ( ( out != null ) || ( removed_seqs_out_base != null ) ) { - System.out.println( "Output format : " - + ( output_format == MSA_FORMAT.FASTA ? "fasta" : "phylip" ) ); - } - if ( chart_only && !realign ) { - System.out.println( "Step for output and re-aligning : n/a" ); + System.out.print( "Output format : " ); + if ( output_format == MSA_FORMAT.FASTA ) { + System.out.println( "fasta" ); + } + else if ( output_format == MSA_FORMAT.PHYLIP ) { + System.out.println( "phylip" ); + } + else if ( output_format == MSA_FORMAT.NEXUS ) { + System.out.println( "nexus" ); + } } - else { - if ( chart_only ) { - System.out.println( "Step for re-aligning : " + step ); + if ( min_length == -1 ) { + if ( chart_only && !realign ) { + System.out.println( "Step for output and re-aligning : n/a" ); } else { - System.out.println( "Step for output and re-aligning : " + step ); + if ( chart_only ) { + System.out.println( "Step for re-aligning : " + step ); + } + else { + System.out.println( "Step for output and re-aligning : " + step ); + } } + System.out.println( "Step for diagnostics reports : " + step_for_diagnostics ); + System.out.println( "Calculate normalized Shannon Entropy : " + report_entropy ); + if ( !norm ) { + System.out.println( "Normalize : " + norm ); + } + System.out.println( "Realign with MAFFT : " + realign ); + if ( realign ) { + System.out.println( "MAFFT options : " + mafft_options ); + } + System.out.println( "Simple tree (Kimura distances, NJ) : " + perform_phylogenetic_inference ); } - System.out.println( "Step for diagnostics reports : " + step_for_diagnostics ); - System.out.println( "Calculate normalized Shannon Entropy : " + report_entropy ); - if ( !norm ) { - System.out.println( "Normalize : " + norm ); - } - System.out.println( "Realign with MAFFT : " + realign ); - if ( realign ) { - System.out.println( "MAFFT options : " + mafft_options ); - } - System.out.println( "Simple tree (Kimura distances, NJ) : " + perform_phylogenetic_inference ); System.out.println(); final int initial_number_of_seqs = msa.getNumberOfSequences(); List msa_props = null; final MsaCompactor mc = new MsaCompactor( msa ); mc.setInfileName( in.getName() ); - mc.setNorm( norm ); - mc.setRealign( realign ); - if ( realign ) { - mc.setPathToMafft( path_to_mafft ); - mc.setMafftOptions( mafft_options ); - } - mc.setStep( step ); - mc.setStepForDiagnostics( step_for_diagnostics ); - mc.setCalculateNormalizedShannonEntropy( report_entropy ); - mc.setPeformPhylogenticInference( perform_phylogenetic_inference ); - if ( ( worst_remove > 0 ) || ( av_gap > 0 ) || ( length > 0 ) ) { + if ( ( worst_remove > 0 ) || ( av_gap > 0 ) || ( length > 0 ) || ( min_length != -1 ) ) { mc.setOutputFormat( output_format ); mc.setOutFileBase( out ); - if ( removed_seqs_out_base != null ) { - mc.setRemovedSeqsOutBase( removed_seqs_out_base ); - } } - if ( min_length > 1 ) { + if ( min_length != -1 ) { mc.removeSequencesByMinimalLength( min_length ); - mc.writeMsa( new File( "removed" ) ); - } - if ( worst_remove > 0 ) { - msa_props = mc.removeWorstOffenders( worst_remove ); - } - else if ( av_gap > 0 ) { - msa_props = mc.removeViaGapAverage( av_gap ); - } - else if ( length > 0 ) { - msa_props = mc.removeViaLength( length ); } else { - msa_props = mc.chart( step, realign, norm ); + mc.setPeformPhylogenticInference( perform_phylogenetic_inference ); + if ( removed_seqs_out_base != null ) { + mc.setRemovedSeqsOutBase( removed_seqs_out_base ); + } + mc.setNorm( norm ); + mc.setRealign( realign ); + if ( realign ) { + mc.setPathToMafft( path_to_mafft ); + mc.setMafftOptions( mafft_options ); + } + mc.setStep( step ); + mc.setStepForDiagnostics( step_for_diagnostics ); + mc.setCalculateNormalizedShannonEntropy( report_entropy ); + if ( worst_remove > 0 ) { + msa_props = mc.removeWorstOffenders( worst_remove ); + } + else if ( av_gap > 0 ) { + msa_props = mc.removeViaGapAverage( av_gap ); + } + else if ( length > 0 ) { + msa_props = mc.removeViaLength( length ); + } + else { + msa_props = mc.chart( step, realign, norm ); + } + Chart.display( msa_props, initial_number_of_seqs, report_entropy, in.getName() ); } - Chart.display( msa_props, initial_number_of_seqs, report_entropy, in.getName() ); } catch ( final IllegalArgumentException iae ) { // iae.printStackTrace(); //TODO remove me @@ -425,8 +463,8 @@ public class msa_compactor { + "= step for diagnostics reports (default: 1)" ); System.out.println( " -" + REPORT_ENTROPY + " to calculate normalized Shannon Entropy (not recommended for very large alignments)" ); - System.out.println( " -" + OUTPUT_FORMAT_PHYLIP_OPTION - + " to write output alignments in phylip format instead of fasta" ); + System.out.println( " -" + OUTPUT_FORMAT_OPTION + + "= format for output alignments: f for fasta (default), p for phylip, or n for nexus" ); System.out.println( " -" + OUTPUT_REMOVED_SEQS_OPTION + "= to output the removed sequences" ); System.out.println( " -" + MIN_LENGTH_OPTION + "= minimal effecive sequence length (for deleting of shorter sequences)" ); diff --git a/forester/java/src/org/forester/msa/BasicMsa.java b/forester/java/src/org/forester/msa/BasicMsa.java index d913531..dc383c9 100644 --- a/forester/java/src/org/forester/msa/BasicMsa.java +++ b/forester/java/src/org/forester/msa/BasicMsa.java @@ -175,6 +175,9 @@ public class BasicMsa implements Msa { case FASTA: writeToFasta( w ); break; + case NEXUS: + writeToNexus( w ); + break; default: throw new RuntimeException( "unknown format " + format ); } @@ -195,10 +198,39 @@ public class BasicMsa implements Msa { SequenceWriter.writeSeqs( asSequenceList(), w, SEQ_FORMAT.FASTA, 100 ); } + private void writeToNexus( final Writer w ) throws IOException { + final int max = determineMaxIdLength() + 1; + w.write( "Begin Data;" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( " Dimensions NTax=" + getNumberOfSequences() ); + w.write( " NChar=" + getLength() ); + w.write( ";" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( " Format DataType=Protein Interleave=No gap=-;" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( " Matrix" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + for( int row = 0; row < getNumberOfSequences(); ++row ) { + final Sequence seq = getSequence( row ); + final String s = seq.getMolecularSequenceAsString(); + w.write( " " ); + w.write( ForesterUtil.pad( getIdentifier( row ).replace( ' ', '_' ), max, ' ', false ).toString() ); + w.write( " " ); + w.write( s ); + w.write( ForesterUtil.LINE_SEPARATOR ); + } + w.write( " ;" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( "End;" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + } + private void writeToPhylip( final Writer w ) throws IOException { final int max = determineMaxIdLength() + 1; + w.write( getNumberOfSequences() + " " + getLength() ); + w.write( ForesterUtil.LINE_SEPARATOR ); for( int row = 0; row < getNumberOfSequences(); ++row ) { - w.write( ForesterUtil.pad( getIdentifier( row ), max, ' ', false ).toString() ); + w.write( ForesterUtil.pad( getIdentifier( row ).replace( ' ', '_' ), max, ' ', false ).toString() ); for( int col = 0; col < getLength(); ++col ) { w.write( getResidueAt( row, col ) ); } diff --git a/forester/java/src/org/forester/msa/Msa.java b/forester/java/src/org/forester/msa/Msa.java index 648006a..ed73cd4 100644 --- a/forester/java/src/org/forester/msa/Msa.java +++ b/forester/java/src/org/forester/msa/Msa.java @@ -35,7 +35,7 @@ import org.forester.sequence.Sequence.TYPE; public interface Msa { public static enum MSA_FORMAT { - FASTA, PHYLIP; + FASTA, PHYLIP, NEXUS; } public String getIdentifier( int row ); diff --git a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java index a1c61c1..81f22da 100644 --- a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java +++ b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java @@ -67,12 +67,14 @@ import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.sequence.Sequence; import org.forester.tools.ConfidenceAssessor; import org.forester.util.BasicDescriptiveStatistics; +import org.forester.util.DescriptiveStatistics; import org.forester.util.ForesterUtil; public class MsaCompactor { - final private static NumberFormat NF_3 = new DecimalFormat( "#.###" ); - final private static NumberFormat NF_4 = new DecimalFormat( "#.####" ); + final private static NumberFormat NF_1 = new DecimalFormat( "0.#" ); + final private static NumberFormat NF_3 = new DecimalFormat( "0.###" ); + final private static NumberFormat NF_4 = new DecimalFormat( "0.####" ); private boolean _calculate_shannon_entropy = false; // private String _infile_name = null; @@ -93,6 +95,7 @@ public class MsaCompactor { private int _step = -1; private int _step_for_diagnostics = -1; static { + NF_1.setRoundingMode( RoundingMode.HALF_UP ); NF_4.setRoundingMode( RoundingMode.HALF_UP ); NF_3.setRoundingMode( RoundingMode.HALF_UP ); } @@ -260,12 +263,21 @@ public class MsaCompactor { return _msa; } - public final void removeSequencesByMinimalLength( final int min_effective_length ) { - printMsaProperties( new MsaProperties( _msa, "", _calculate_shannon_entropy ) ); - System.out.println(); + public final void removeSequencesByMinimalLength( final int min_effective_length ) throws IOException { _msa = DeleteableMsa.createInstance( MsaMethods.removeSequencesByMinimalLength( _msa, min_effective_length ) ); removeGapColumns(); - printMsaProperties( new MsaProperties( _msa, "", _calculate_shannon_entropy ) ); + final String s = writeOutfile(); + final DescriptiveStatistics msa_stats = MsaMethods.calculateEffectiveLengthStatistics( _msa ); + System.out.println( "Output MSA : " + s ); + System.out.println( " MSA length : " + _msa.getLength() ); + System.out.println( " Number of sequences : " + _msa.getNumberOfSequences() ); + System.out.println( " Median sequence length : " + NF_1.format( msa_stats.median() ) ); + System.out.println( " Mean sequence length : " + NF_1.format( msa_stats.arithmeticMean() ) ); + System.out.println( " Max sequence length : " + ( ( int ) msa_stats.getMax() ) ); + System.out.println( " Min sequence length : " + ( ( int ) msa_stats.getMin() ) ); + System.out.println( " Gap ratio : " + NF_4.format( MsaMethods.calcGapRatio( _msa ) ) ); + System.out.println( " Normalized Shannon Entropy (entn21): " + + NF_4.format( MsaMethods.calcNormalizedShannonsEntropy( 21, _msa ) ) ); System.out.println(); } diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index 59150d5..22253df 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -6376,13 +6376,15 @@ public final class Test { final Writer w = new StringWriter(); dmsa2.write( w, MSA_FORMAT.PHYLIP ); final String phylip = w.toString(); - if ( !phylip.equals( "new_c x" + ForesterUtil.LINE_SEPARATOR ) ) { + if ( !phylip.equals( "1 1" + ForesterUtil.LINE_SEPARATOR + "new_c x" + ForesterUtil.LINE_SEPARATOR ) ) { + System.out.println( phylip ); return false; } final Writer w2 = new StringWriter(); dmsa2.write( w2, MSA_FORMAT.FASTA ); final String fasta = w2.toString(); if ( !fasta.equals( ">new_c" + ForesterUtil.LINE_SEPARATOR + "x" + ForesterUtil.LINE_SEPARATOR ) ) { + System.out.println( fasta ); return false; } } -- 1.7.10.2