From 7c0f14dccfa95a3f709da6eff6ebe635390d1841 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Tue, 29 Apr 2014 00:47:03 +0000 Subject: [PATCH] inprogress --- .../org/forester/application/msa_compactor.java | 81 ++++--- .../java/src/org/forester/msa_compactor/Chart.java | 10 +- .../org/forester/msa_compactor/MsaCompactor.java | 252 ++++++++++---------- 3 files changed, 180 insertions(+), 163 deletions(-) diff --git a/forester/java/src/org/forester/application/msa_compactor.java b/forester/java/src/org/forester/application/msa_compactor.java index b6cc057..996ccf3 100644 --- a/forester/java/src/org/forester/application/msa_compactor.java +++ b/forester/java/src/org/forester/application/msa_compactor.java @@ -27,6 +27,9 @@ package org.forester.application; import java.io.File; import java.io.FileInputStream; import java.io.IOException; +import java.math.RoundingMode; +import java.text.DecimalFormat; +import java.text.NumberFormat; import java.util.ArrayList; import java.util.List; @@ -45,30 +48,34 @@ import org.forester.util.ForesterUtil; public class msa_compactor { - final static private String HELP_OPTION_1 = "help"; - final static private String HELP_OPTION_2 = "h"; - final static private String REMOVE_WORST_OFFENDERS_OPTION = "r"; - final static private String AV_GAPINESS_OPTION = "g"; - final static private String STEP_OPTION = "s"; - final static private String LENGTH_OPTION = "l"; - final static private String REALIGN_OPTION = "a"; + final private static NumberFormat NF_1 = new DecimalFormat( "#.0" ); + static { + NF_1.setRoundingMode( RoundingMode.HALF_UP ); + } + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String REMOVE_WORST_OFFENDERS_OPTION = "r"; + final static private String AV_GAPINESS_OPTION = "g"; + final static private String STEP_OPTION = "s"; + final static private String LENGTH_OPTION = "l"; + final static private String REALIGN_OPTION = "a"; // - final static private String STEP_FOR_DIAGNOSTICS_OPTION = "sd"; - final static private String MIN_LENGTH_OPTION = "ml"; - final static private String GAP_RATIO_LENGTH_OPTION = "gr"; - final static private String REPORT_ALN_MEAN_IDENTITY = "q"; - final static private String OUTPUT_FORMAT_PHYLIP_OPTION = "p"; - final static private String OUTPUT_REMOVED_SEQS_OPTION = "ro"; - final static private String MAFFT_OPTIONS = "mo"; + final static private String STEP_FOR_DIAGNOSTICS_OPTION = "sd"; + final static private String MIN_LENGTH_OPTION = "ml"; + final static private String GAP_RATIO_LENGTH_OPTION = "gr"; + final static private String REPORT_ALN_MEAN_IDENTITY = "q"; + final static private String OUTPUT_FORMAT_PHYLIP_OPTION = "p"; + final static private String OUTPUT_REMOVED_SEQS_OPTION = "ro"; + final static private String MAFFT_OPTIONS = "mo"; // - final static private String PATH_TO_MAFFT_OPTION = "mafft"; - final static private String DO_NOT_NORMALIZE_FOR_EFF_LENGTH_OPTION = "nn"; - final static private String PRG_NAME = "msa_compactor"; - final static private String PRG_DESC = "multiple sequence aligment compactor"; - final static private String PRG_VERSION = "0.2"; - final static private String PRG_DATE = "140428"; - final static private String E_MAIL = "czmasek@sanfordburham.org"; - final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"; + final static private String PATH_TO_MAFFT_OPTION = "mafft"; + final static private String DO_NOT_NORMALIZE_FOR_EFF_LENGTH_OPTION = "nn"; + final static private String PRG_NAME = "msa_compactor"; + final static private String PRG_DESC = "multiple sequence aligment compactor"; + final static private String PRG_VERSION = "0.2"; + final static private String PRG_DATE = "140428"; + final static private String E_MAIL = "czmasek@sanfordburham.org"; + final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"; public static void main( final String args[] ) { try { @@ -153,8 +160,15 @@ public class msa_compactor { System.exit( 0 ); } length = cla.getOptionValueAsInt( LENGTH_OPTION ); - if ( ( length < 2 ) || ( length >= msa.getLength() ) ) { - ForesterUtil.fatalError( PRG_NAME, "target length is out of range: " + length ); + if ( length >= msa.getLength() ) { + ForesterUtil.fatalError( PRG_NAME, + "target length is out of range [longer than MSA (" + msa.getLength() + + ")]: " + length ); + } + else if ( length < initial_msa_stats.getMin() ) { + ForesterUtil.fatalError( PRG_NAME, + "target length is out of range [shorter than the shortest sequence (" + + initial_msa_stats.getMin() + ") ]: " + length ); } } if ( cla.isOptionSet( STEP_OPTION ) ) { @@ -176,7 +190,6 @@ public class msa_compactor { if ( cla.isOptionSet( DO_NOT_NORMALIZE_FOR_EFF_LENGTH_OPTION ) ) { norm = false; } - // if ( cla.isOptionSet( STEP_FOR_DIAGNOSTICS_OPTION ) ) { step_for_diagnostics = cla.getOptionValueAsInt( STEP_FOR_DIAGNOSTICS_OPTION ); if ( ( step_for_diagnostics < 1 ) @@ -192,7 +205,7 @@ public class msa_compactor { + min_length ); } } - if ( cla.isOptionSet( MIN_LENGTH_OPTION ) ) { + if ( cla.isOptionSet( GAP_RATIO_LENGTH_OPTION ) ) { gap_ratio = cla.getOptionValueAsDouble( GAP_RATIO_LENGTH_OPTION ); if ( ( gap_ratio < 0 ) || ( gap_ratio > 1 ) ) { ForesterUtil.fatalError( PRG_NAME, "gap ratio is out of range: " + gap_ratio ); @@ -232,8 +245,14 @@ public class msa_compactor { E_MAIL, WWW, ForesterUtil.getForesterLibraryInformation() ); - // System.out.println( "Input MSA : " + in ); + System.out.println( " MSA length : " + msa.getLength() ); + System.out.println( " Number of sequences : " + msa.getNumberOfSequences() ); + System.out.println( " Median sequence length : " + NF_1.format( initial_msa_stats.median() ) ); + System.out.println( " Mean sequence length : " + + NF_1.format( initial_msa_stats.arithmeticMean() ) ); + System.out.println( " Max sequence length : " + ( ( int ) initial_msa_stats.getMax() ) ); + System.out.println( " Min sequence length : " + ( ( int ) initial_msa_stats.getMin() ) ); if ( out != null ) { System.out.println( "Output : " + out ); } @@ -265,7 +284,7 @@ public class msa_compactor { if ( !norm ) { System.out.println( "Normalize : " + norm ); } - System.out.println( "Realign : " + realign ); + System.out.println( "Realign with MAFFT : " + realign ); if ( realign ) { System.out.println( "MAFFT options : " + mafft_options ); } @@ -317,7 +336,6 @@ public class msa_compactor { msa_props = mc.removeViaGapAverage( av_gap ); } else if ( length > 0 ) { - // TODO if < shortest seq -> error final MsaCompactor mc = new MsaCompactor( msa ); mc.setRealign( realign ); mc.setOutputFormat( output_format ); @@ -353,11 +371,11 @@ public class msa_compactor { Chart.display( msa_props, initial_number_of_seqs, report_aln_mean_identity, in.toString() ); } catch ( final IllegalArgumentException iae ) { - iae.printStackTrace(); //TODO remove me + // iae.printStackTrace(); //TODO remove me ForesterUtil.fatalError( PRG_NAME, iae.getMessage() ); } catch ( final IOException ioe ) { - ioe.printStackTrace(); //TODO remove me + // ioe.printStackTrace(); //TODO remove me ForesterUtil.fatalError( PRG_NAME, ioe.getMessage() ); } catch ( final Exception e ) { @@ -367,7 +385,6 @@ public class msa_compactor { private static void checkPathToMafft( final String path_to_mafft ) { if ( !ForesterUtil.isEmpty( path_to_mafft ) && MsaInferrer.isInstalled( path_to_mafft ) ) { - ForesterUtil.programMessage( PRG_NAME, "using MAFFT at \"" + path_to_mafft + "\"" ); } else { if ( ForesterUtil.isEmpty( path_to_mafft ) ) { diff --git a/forester/java/src/org/forester/msa_compactor/Chart.java b/forester/java/src/org/forester/msa_compactor/Chart.java index 5563d7c..ccb6a0b 100644 --- a/forester/java/src/org/forester/msa_compactor/Chart.java +++ b/forester/java/src/org/forester/msa_compactor/Chart.java @@ -64,7 +64,7 @@ public final class Chart extends JDialog implements ActionListener { _initial_number_of_seqs = initial_number_of_seqs; _show_msa_qual = show_msa_qual; setTitle( "msa compactor" ); - setSize( 500, 400 ); + setSize( 600, 500 ); setResizable( true ); final JPanel content_pane = new JPanel(); content_pane.setLayout( new BorderLayout() ); @@ -113,7 +113,7 @@ public final class Chart extends JDialog implements ActionListener { final double[][] seqs_gaps = new double[ _msa_props.size() ][ 2 ]; for( int i = 0; i < _msa_props.size(); ++i ) { seqs_gaps[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences(); - seqs_gaps[ i ][ 1 ] = ForesterUtil.roundToInt( _msa_props.get( i ).getGapRatio() * 100 ); + seqs_gaps[ i ][ 1 ] = ForesterUtil.roundToInt( _msa_props.get( i ).getGapRatio() * 200 ); } model.addData( seqs_gaps, "Gap ratio" ); model.setSeriesLine( "Series " + "Gap ratio", true ); @@ -123,15 +123,15 @@ public final class Chart extends JDialog implements ActionListener { for( int i = 0; i < _msa_props.size(); ++i ) { seqs_identity[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences(); seqs_identity[ i ][ 1 ] = ForesterUtil - .roundToInt( _msa_props.get( i ).getAverageIdentityRatio() * 100 ); + .roundToInt( _msa_props.get( i ).getAverageIdentityRatio() * 200 ); } model.addData( seqs_identity, "mean MSA column identity" ); model.setSeriesLine( "Series " + "mean MSA column identity", true ); model.setSeriesMarker( "Series " + "mean MSA column identity", false ); } final BoxCoordSystem coord = new BoxCoordSystem( model ); - coord.setUnitFont( coord.getUnitFont().deriveFont( 20.0f ) ); - coord.setXAxisUnit( "Number of Sequences" ); + coord.setUnitFont( coord.getUnitFont().deriveFont( 16.0f ) ); + coord.setXAxisUnit( "Number of Removed Sequences" ); coord.setPaintGrid( true ); coord.setYAxisUnit( "MSA Length" ); _chart_panel = new ChartPanel( model, _title ); diff --git a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java index 938273d..0673cf3 100644 --- a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java +++ b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java @@ -58,31 +58,27 @@ import org.forester.util.ForesterUtil; public class MsaCompactor { - final private static NumberFormat NF_3 = new DecimalFormat( "#.###" ); - final private static NumberFormat NF_4 = new DecimalFormat( "#.####" ); - private double _gap_ratio = -1; + final private static NumberFormat NF_3 = new DecimalFormat( "#.###" ); + final private static NumberFormat NF_4 = new DecimalFormat( "#.####" ); + private double _gap_ratio = -1; + private final short _longest_id_length; // - private String _maffts_opts = "--auto"; - private int _min_length = -1; + private String _maffts_opts = "--auto"; + private int _min_length = -1; // - private DeleteableMsa _msa = null; - private boolean _norm = true; - private File _out_file_base = null; - private MSA_FORMAT _output_format = MSA_FORMAT.FASTA; - private String _path_to_mafft = null; + private DeleteableMsa _msa = null; + private boolean _norm = true; + private File _out_file_base = null; + private MSA_FORMAT _output_format = MSA_FORMAT.FASTA; + private String _path_to_mafft = null; // - private boolean _realign = false; + private boolean _realign = false; private final SortedSet _removed_seq_ids; - private File _removed_seqs_out_base = null; - - public final void setRemovedSeqsOutBase( final File removed_seqs_out_base ) { - _removed_seqs_out_base = removed_seqs_out_base; - } + private final ArrayList _removed_seqs; + private File _removed_seqs_out_base = null; private boolean _report_aln_mean_identity = false; private int _step = -1; private int _step_for_diagnostics = -1; - private final short _longest_id_length; - private final ArrayList _removed_seqs; static { NF_4.setRoundingMode( RoundingMode.HALF_UP ); NF_3.setRoundingMode( RoundingMode.HALF_UP ); @@ -95,6 +91,45 @@ public class MsaCompactor { _removed_seqs = new ArrayList(); } + public final List chart( final int step, final boolean realign, final boolean norm ) + throws IOException, InterruptedException { + final GapContribution stats[] = calcGapContribtionsStats( norm ); + final List to_remove_ids = new ArrayList(); + final List msa_props = new ArrayList(); + for( final GapContribution gap_gontribution : stats ) { + to_remove_ids.add( gap_gontribution.getId() ); + } + printTableHeader(); + final int x = ForesterUtil.roundToInt( _msa.getNumberOfSequences() / 20.0 ); + MsaProperties msa_prop = new MsaProperties( _msa, _report_aln_mean_identity ); + msa_props.add( msa_prop ); + printMsaProperties( "", msa_prop ); + System.out.println(); + int i = 0; + while ( _msa.getNumberOfSequences() > x ) { + final String id = to_remove_ids.get( i ); + _msa.deleteRow( id, false ); + if ( realign && isPrintMsaStatsWriteOutfileAndRealign( i ) ) { + removeGapColumns(); + realignWithMafft(); + msa_prop = new MsaProperties( _msa, _report_aln_mean_identity ); + msa_props.add( msa_prop ); + printMsaProperties( id, msa_prop ); + System.out.print( "(realigned)" ); + System.out.println(); + } + else if ( isPrintMsaStats( i ) ) { + removeGapColumns(); + msa_prop = new MsaProperties( _msa, _report_aln_mean_identity ); + msa_props.add( msa_prop ); + printMsaProperties( id, msa_prop ); + System.out.println(); + } + ++i; + } + return msa_props; + } + final public Msa getMsa() { return _msa; } @@ -224,57 +259,14 @@ public class MsaCompactor { return msa_props; } - public final List chart( final int step, final boolean realign, final boolean norm ) - throws IOException, InterruptedException { - final GapContribution stats[] = calcGapContribtionsStats( norm ); - final List to_remove_ids = new ArrayList(); - final List msa_props = new ArrayList(); - for( final GapContribution gap_gontribution : stats ) { - to_remove_ids.add( gap_gontribution.getId() ); - } - printTableHeader(); - final int x = ForesterUtil.roundToInt( _msa.getNumberOfSequences() / 20.0 ); - MsaProperties msa_prop = new MsaProperties( _msa, _report_aln_mean_identity ); - msa_props.add( msa_prop ); - printMsaProperties( "", msa_prop ); - System.out.println(); - int i = 0; - while ( _msa.getNumberOfSequences() > x ) { - final String id = to_remove_ids.get( i ); - _msa.deleteRow( id, false ); - if ( realign && isPrintMsaStatsWriteOutfileAndRealign( i ) ) { - removeGapColumns(); - realignWithMafft(); - msa_prop = new MsaProperties( _msa, _report_aln_mean_identity ); - msa_props.add( msa_prop ); - printMsaProperties( id, msa_prop ); - System.out.print( "(realigned)" ); - System.out.println(); - } - else if ( isPrintMsaStats( i ) ) { - removeGapColumns(); - msa_prop = new MsaProperties( _msa, _report_aln_mean_identity ); - msa_props.add( msa_prop ); - printMsaProperties( id, msa_prop ); - System.out.println(); - } - ++i; - } - return msa_props; - } - - private final boolean isPrintMsaStats( final int i ) { - return ( ( ( _step < 2 ) && ( _step_for_diagnostics < 2 ) ) || ( ( _step_for_diagnostics > 0 ) && ( ( ( i + 1 ) % _step_for_diagnostics ) == 0 ) ) ); - } - - private final boolean isPrintMsaStatsWriteOutfileAndRealign( final int i ) { - return ( ( ( _step < 2 ) && ( _step_for_diagnostics < 2 ) ) || ( ( _step > 0 ) && ( ( ( i + 1 ) % _step ) == 0 ) ) ); - } - public final void setGapRatio( final double gap_ratio ) { _gap_ratio = gap_ratio; } + public final void setMafftOptions( final String maffts_opts ) { + _maffts_opts = maffts_opts; + } + public final void setMinLength( final int min_length ) { _min_length = min_length; } @@ -299,24 +291,20 @@ public class MsaCompactor { _realign = realign; } - public final void setStep( final int step ) { - _step = step; - } - - public final void setStepForDiagnostics( final int step_for_diagnostics ) { - _step_for_diagnostics = step_for_diagnostics; + public final void setRemovedSeqsOutBase( final File removed_seqs_out_base ) { + _removed_seqs_out_base = removed_seqs_out_base; } public final void setReportAlnMeanIdentity( final boolean report_aln_mean_identity ) { _report_aln_mean_identity = report_aln_mean_identity; } - final public String writeMsa( final File outfile ) throws IOException { - final Double gr = MsaMethods.calcGapRatio( _msa ); - final String s = outfile + "_" + _msa.getNumberOfSequences() + "_" + _msa.getLength() + "_" - + ForesterUtil.roundToInt( gr * 100 ); - writeMsa( _msa, s + obtainSuffix(), _output_format ); - return s; + public final void setStep( final int step ) { + _step = step; + } + + public final void setStepForDiagnostics( final int step_for_diagnostics ) { + _step_for_diagnostics = step_for_diagnostics; } final public String writeAndAlignRemovedSeqs() throws IOException, InterruptedException { @@ -342,14 +330,12 @@ public class MsaCompactor { return msg.toString(); } - private String obtainSuffix() { - if ( _output_format == MSA_FORMAT.FASTA ) { - return ".fasta"; - } - else if ( _output_format == MSA_FORMAT.PHYLIP ) { - return ".aln"; - } - return ""; + final public String writeMsa( final File outfile ) throws IOException { + final Double gr = MsaMethods.calcGapRatio( _msa ); + final String s = outfile + "_" + _msa.getNumberOfSequences() + "_" + _msa.getLength() + "_" + + ForesterUtil.roundToInt( gr * 100 ); + writeMsa( _msa, s + obtainSuffix(), _output_format ); + return s; } final int calcNonGapResidues( final Sequence seq ) { @@ -429,6 +415,38 @@ public class MsaCompactor { return phy; } + private final boolean isPrintMsaStats( final int i ) { + return ( ( ( _step < 2 ) && ( _step_for_diagnostics < 2 ) ) || ( ( _step_for_diagnostics > 0 ) && ( ( ( i + 1 ) % _step_for_diagnostics ) == 0 ) ) ); + } + + private final boolean isPrintMsaStatsWriteOutfileAndRealign( final int i ) { + return ( ( ( _step < 2 ) && ( _step_for_diagnostics < 2 ) ) || ( ( _step > 0 ) && ( ( ( i + 1 ) % _step ) == 0 ) ) ); + } + + private final StringBuilder msaPropertiesAsSB( final MsaProperties msa_properties ) { + final StringBuilder sb = new StringBuilder(); + sb.append( msa_properties.getNumberOfSequences() ); + sb.append( "\t" ); + sb.append( msa_properties.getLength() ); + sb.append( "\t" ); + sb.append( NF_4.format( msa_properties.getGapRatio() ) ); + if ( _report_aln_mean_identity /*msa_properties.getAverageIdentityRatio() >= 0*/) { + sb.append( "\t" ); + sb.append( NF_4.format( msa_properties.getAverageIdentityRatio() ) ); + } + return sb; + } + + private String obtainSuffix() { + if ( _output_format == MSA_FORMAT.FASTA ) { + return ".fasta"; + } + else if ( _output_format == MSA_FORMAT.PHYLIP ) { + return ".aln"; + } + return ""; + } + private final Phylogeny pi( final String matrix ) { final Phylogeny master_phy = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, _msa, true, matrix ); final int seed = 15; @@ -456,20 +474,6 @@ public class MsaCompactor { System.out.print( "\t" ); } - private final StringBuilder msaPropertiesAsSB( final MsaProperties msa_properties ) { - final StringBuilder sb = new StringBuilder(); - sb.append( msa_properties.getNumberOfSequences() ); - sb.append( "\t" ); - sb.append( msa_properties.getLength() ); - sb.append( "\t" ); - sb.append( NF_4.format( msa_properties.getGapRatio() ) ); - if ( _report_aln_mean_identity /*msa_properties.getAverageIdentityRatio() >= 0*/) { - sb.append( "\t" ); - sb.append( NF_4.format( msa_properties.getAverageIdentityRatio() ) ); - } - return sb; - } - final private MsaProperties printMsaStatsWriteOutfileAndRealign( final boolean realign, final String id ) throws IOException, InterruptedException { if ( realign ) { @@ -482,6 +486,24 @@ public class MsaCompactor { return msa_prop; } + private final void printTableHeader() { + if ( ( _step < 2 ) || ( _step_for_diagnostics < 2 ) ) { + System.out.print( ForesterUtil.pad( "Id", _longest_id_length, ' ', false ) ); + System.out.print( "\t" ); + } + System.out.print( "Seqs" ); + System.out.print( "\t" ); + System.out.print( "Length" ); + System.out.print( "\t" ); + System.out.print( "Gaps" ); + System.out.print( "\t" ); + if ( _report_aln_mean_identity ) { + System.out.print( "MSA qual" ); + System.out.print( "\t" ); + } + System.out.println(); + } + final private void realignWithMafft() throws IOException, InterruptedException { final MsaInferrer mafft = Mafft.createInstance( _path_to_mafft ); final List opts = new ArrayList(); @@ -491,21 +513,10 @@ public class MsaCompactor { _msa = DeleteableMsa.createInstance( mafft.infer( _msa.asSequenceList(), opts ) ); } - public final void setMafftOptions( final String maffts_opts ) { - _maffts_opts = maffts_opts; - } - final private void removeGapColumns() { _msa.deleteGapOnlyColumns(); } - final private static void writeMsa( final Msa msa, final String outfile, final MSA_FORMAT format ) - throws IOException { - final Writer w = ForesterUtil.createBufferedWriter( outfile ); - msa.write( w, format ); - w.close(); - } - private final String writeOutfile() throws IOException { final String s = writeMsa( _out_file_base ); return s; @@ -543,21 +554,10 @@ public class MsaCompactor { return null; } - private final void printTableHeader() { - if ( ( _step < 2 ) || ( _step_for_diagnostics < 2 ) ) { - System.out.print( ForesterUtil.pad( "Id", _longest_id_length, ' ', false ) ); - System.out.print( "\t" ); - } - System.out.print( "Seqs" ); - System.out.print( "\t" ); - System.out.print( "Length" ); - System.out.print( "\t" ); - System.out.print( "Gaps" ); - System.out.print( "\t" ); - if ( _report_aln_mean_identity ) { - System.out.print( "MSA qual" ); - System.out.print( "\t" ); - } - System.out.println(); + final private static void writeMsa( final Msa msa, final String outfile, final MSA_FORMAT format ) + throws IOException { + final Writer w = ForesterUtil.createBufferedWriter( outfile ); + msa.write( w, format ); + w.close(); } } -- 1.7.10.2