import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
+import java.math.RoundingMode;
+import java.text.DecimalFormat;
+import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.List;
public class msa_compactor {
- final static private String HELP_OPTION_1 = "help";
- final static private String HELP_OPTION_2 = "h";
- final static private String REMOVE_WORST_OFFENDERS_OPTION = "r";
- final static private String AV_GAPINESS_OPTION = "g";
- final static private String STEP_OPTION = "s";
- final static private String LENGTH_OPTION = "l";
- final static private String REALIGN_OPTION = "a";
+ final private static NumberFormat NF_1 = new DecimalFormat( "#.0" ); // always one fraction digit; used below for the median and mean sequence lengths
+ static {
+ NF_1.setRoundingMode( RoundingMode.HALF_UP ); // DecimalFormat defaults to HALF_EVEN; use conventional half-up rounding instead
+ }
+ final static private String HELP_OPTION_1 = "help";
+ final static private String HELP_OPTION_2 = "h";
+ final static private String REMOVE_WORST_OFFENDERS_OPTION = "r";
+ final static private String AV_GAPINESS_OPTION = "g";
+ final static private String STEP_OPTION = "s";
+ final static private String LENGTH_OPTION = "l";
+ final static private String REALIGN_OPTION = "a";
//
- final static private String STEP_FOR_DIAGNOSTICS_OPTION = "sd";
- final static private String MIN_LENGTH_OPTION = "ml";
- final static private String GAP_RATIO_LENGTH_OPTION = "gr";
- final static private String REPORT_ALN_MEAN_IDENTITY = "q";
- final static private String OUTPUT_FORMAT_PHYLIP_OPTION = "p";
- final static private String OUTPUT_REMOVED_SEQS_OPTION = "ro";
- final static private String MAFFT_OPTIONS = "mo";
+ final static private String STEP_FOR_DIAGNOSTICS_OPTION = "sd";
+ final static private String MIN_LENGTH_OPTION = "ml";
+ final static private String GAP_RATIO_LENGTH_OPTION = "gr";
+ final static private String REPORT_ALN_MEAN_IDENTITY = "q";
+ final static private String OUTPUT_FORMAT_PHYLIP_OPTION = "p";
+ final static private String OUTPUT_REMOVED_SEQS_OPTION = "ro";
+ final static private String MAFFT_OPTIONS = "mo";
//
- final static private String PATH_TO_MAFFT_OPTION = "mafft";
- final static private String DO_NOT_NORMALIZE_FOR_EFF_LENGTH_OPTION = "nn";
- final static private String PRG_NAME = "msa_compactor";
- final static private String PRG_DESC = "multiple sequence aligment compactor";
- final static private String PRG_VERSION = "0.2";
- final static private String PRG_DATE = "140428";
- final static private String E_MAIL = "czmasek@sanfordburham.org";
- final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester";
+ final static private String PATH_TO_MAFFT_OPTION = "mafft";
+ final static private String DO_NOT_NORMALIZE_FOR_EFF_LENGTH_OPTION = "nn";
+ // Program metadata; the description previously read "aligment" (typo fixed).
+ final static private String PRG_NAME = "msa_compactor";
+ final static private String PRG_DESC = "multiple sequence alignment compactor";
+ final static private String PRG_VERSION = "0.2";
+ final static private String PRG_DATE = "140428";
+ final static private String E_MAIL = "czmasek@sanfordburham.org";
+ final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester";
public static void main( final String args[] ) {
try {
System.exit( 0 );
}
length = cla.getOptionValueAsInt( LENGTH_OPTION );
- if ( ( length < 2 ) || ( length >= msa.getLength() ) ) {
- ForesterUtil.fatalError( PRG_NAME, "target length is out of range: " + length );
+ if ( length >= msa.getLength() ) {
+ ForesterUtil.fatalError( PRG_NAME,
+ "target length is out of range [longer than MSA (" + msa.getLength()
+ + ")]: " + length );
+ }
+ else if ( length < initial_msa_stats.getMin() ) {
+ ForesterUtil.fatalError( PRG_NAME,
+ "target length is out of range [shorter than the shortest sequence ("
+ + initial_msa_stats.getMin() + ") ]: " + length );
}
}
if ( cla.isOptionSet( STEP_OPTION ) ) {
if ( cla.isOptionSet( DO_NOT_NORMALIZE_FOR_EFF_LENGTH_OPTION ) ) {
norm = false;
}
- //
if ( cla.isOptionSet( STEP_FOR_DIAGNOSTICS_OPTION ) ) {
step_for_diagnostics = cla.getOptionValueAsInt( STEP_FOR_DIAGNOSTICS_OPTION );
if ( ( step_for_diagnostics < 1 )
+ min_length );
}
}
- if ( cla.isOptionSet( MIN_LENGTH_OPTION ) ) {
+ if ( cla.isOptionSet( GAP_RATIO_LENGTH_OPTION ) ) {
gap_ratio = cla.getOptionValueAsDouble( GAP_RATIO_LENGTH_OPTION );
if ( ( gap_ratio < 0 ) || ( gap_ratio > 1 ) ) {
ForesterUtil.fatalError( PRG_NAME, "gap ratio is out of range: " + gap_ratio );
E_MAIL,
WWW,
ForesterUtil.getForesterLibraryInformation() );
- //
System.out.println( "Input MSA : " + in );
+ System.out.println( " MSA length : " + msa.getLength() );
+ System.out.println( " Number of sequences : " + msa.getNumberOfSequences() );
+ System.out.println( " Median sequence length : " + NF_1.format( initial_msa_stats.median() ) );
+ System.out.println( " Mean sequence length : "
+ + NF_1.format( initial_msa_stats.arithmeticMean() ) );
+ System.out.println( " Max sequence length : " + ( ( int ) initial_msa_stats.getMax() ) );
+ System.out.println( " Min sequence length : " + ( ( int ) initial_msa_stats.getMin() ) );
if ( out != null ) {
System.out.println( "Output : " + out );
}
if ( !norm ) {
System.out.println( "Normalize : " + norm );
}
- System.out.println( "Realign : " + realign );
+ System.out.println( "Realign with MAFFT : " + realign );
if ( realign ) {
System.out.println( "MAFFT options : " + mafft_options );
}
msa_props = mc.removeViaGapAverage( av_gap );
}
else if ( length > 0 ) {
- // TODO if < shortest seq -> error
final MsaCompactor mc = new MsaCompactor( msa );
mc.setRealign( realign );
mc.setOutputFormat( output_format );
Chart.display( msa_props, initial_number_of_seqs, report_aln_mean_identity, in.toString() );
}
catch ( final IllegalArgumentException iae ) {
- iae.printStackTrace(); //TODO remove me
+ // iae.printStackTrace(); //TODO remove me
ForesterUtil.fatalError( PRG_NAME, iae.getMessage() );
}
catch ( final IOException ioe ) {
- ioe.printStackTrace(); //TODO remove me
+ // ioe.printStackTrace(); //TODO remove me
ForesterUtil.fatalError( PRG_NAME, ioe.getMessage() );
}
catch ( final Exception e ) {
private static void checkPathToMafft( final String path_to_mafft ) {
if ( !ForesterUtil.isEmpty( path_to_mafft ) && MsaInferrer.isInstalled( path_to_mafft ) ) {
- ForesterUtil.programMessage( PRG_NAME, "using MAFFT at \"" + path_to_mafft + "\"" );
}
else {
if ( ForesterUtil.isEmpty( path_to_mafft ) ) {
public class MsaCompactor {
- final private static NumberFormat NF_3 = new DecimalFormat( "#.###" );
- final private static NumberFormat NF_4 = new DecimalFormat( "#.####" );
- private double _gap_ratio = -1;
+ final private static NumberFormat NF_3 = new DecimalFormat( "#.###" );
+ final private static NumberFormat NF_4 = new DecimalFormat( "#.####" );
+ private double _gap_ratio = -1;
+ private final short _longest_id_length;
//
- private String _maffts_opts = "--auto";
- private int _min_length = -1;
+ private String _maffts_opts = "--auto";
+ private int _min_length = -1;
//
- private DeleteableMsa _msa = null;
- private boolean _norm = true;
- private File _out_file_base = null;
- private MSA_FORMAT _output_format = MSA_FORMAT.FASTA;
- private String _path_to_mafft = null;
+ private DeleteableMsa _msa = null;
+ private boolean _norm = true;
+ private File _out_file_base = null;
+ private MSA_FORMAT _output_format = MSA_FORMAT.FASTA;
+ private String _path_to_mafft = null;
//
- private boolean _realign = false;
+ private boolean _realign = false;
private final SortedSet<String> _removed_seq_ids;
- private File _removed_seqs_out_base = null;
-
- public final void setRemovedSeqsOutBase( final File removed_seqs_out_base ) {
- _removed_seqs_out_base = removed_seqs_out_base;
- }
+ private final ArrayList<Sequence> _removed_seqs;
+ private File _removed_seqs_out_base = null;
private boolean _report_aln_mean_identity = false;
private int _step = -1;
private int _step_for_diagnostics = -1;
- private final short _longest_id_length;
- private final ArrayList<Sequence> _removed_seqs;
static {
NF_4.setRoundingMode( RoundingMode.HALF_UP );
NF_3.setRoundingMode( RoundingMode.HALF_UP );
_removed_seqs = new ArrayList<Sequence>();
}
+ public final List<MsaProperties> chart( final int step, final boolean realign, final boolean norm )
+ throws IOException, InterruptedException {
+ final GapContribution stats[] = calcGapContribtionsStats( norm );
+ final List<String> to_remove_ids = new ArrayList<String>();
+ final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
+ for( final GapContribution gap_gontribution : stats ) {
+ to_remove_ids.add( gap_gontribution.getId() );
+ }
+ printTableHeader();
+ final int x = ForesterUtil.roundToInt( _msa.getNumberOfSequences() / 20.0 );
+ MsaProperties msa_prop = new MsaProperties( _msa, _report_aln_mean_identity );
+ msa_props.add( msa_prop );
+ printMsaProperties( "", msa_prop );
+ System.out.println();
+ int i = 0;
+ while ( _msa.getNumberOfSequences() > x ) {
+ final String id = to_remove_ids.get( i );
+ _msa.deleteRow( id, false );
+ if ( realign && isPrintMsaStatsWriteOutfileAndRealign( i ) ) {
+ removeGapColumns();
+ realignWithMafft();
+ msa_prop = new MsaProperties( _msa, _report_aln_mean_identity );
+ msa_props.add( msa_prop );
+ printMsaProperties( id, msa_prop );
+ System.out.print( "(realigned)" );
+ System.out.println();
+ }
+ else if ( isPrintMsaStats( i ) ) {
+ removeGapColumns();
+ msa_prop = new MsaProperties( _msa, _report_aln_mean_identity );
+ msa_props.add( msa_prop );
+ printMsaProperties( id, msa_prop );
+ System.out.println();
+ }
+ ++i;
+ }
+ return msa_props;
+ }
+
/**
 * Returns the alignment currently held in {@code _msa}.
 *
 * @return the current value of {@code _msa}
 */
final public Msa getMsa() {
return _msa;
}
return msa_props;
}
- public final List<MsaProperties> chart( final int step, final boolean realign, final boolean norm )
- throws IOException, InterruptedException {
- final GapContribution stats[] = calcGapContribtionsStats( norm );
- final List<String> to_remove_ids = new ArrayList<String>();
- final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
- for( final GapContribution gap_gontribution : stats ) {
- to_remove_ids.add( gap_gontribution.getId() );
- }
- printTableHeader();
- final int x = ForesterUtil.roundToInt( _msa.getNumberOfSequences() / 20.0 );
- MsaProperties msa_prop = new MsaProperties( _msa, _report_aln_mean_identity );
- msa_props.add( msa_prop );
- printMsaProperties( "", msa_prop );
- System.out.println();
- int i = 0;
- while ( _msa.getNumberOfSequences() > x ) {
- final String id = to_remove_ids.get( i );
- _msa.deleteRow( id, false );
- if ( realign && isPrintMsaStatsWriteOutfileAndRealign( i ) ) {
- removeGapColumns();
- realignWithMafft();
- msa_prop = new MsaProperties( _msa, _report_aln_mean_identity );
- msa_props.add( msa_prop );
- printMsaProperties( id, msa_prop );
- System.out.print( "(realigned)" );
- System.out.println();
- }
- else if ( isPrintMsaStats( i ) ) {
- removeGapColumns();
- msa_prop = new MsaProperties( _msa, _report_aln_mean_identity );
- msa_props.add( msa_prop );
- printMsaProperties( id, msa_prop );
- System.out.println();
- }
- ++i;
- }
- return msa_props;
- }
-
- private final boolean isPrintMsaStats( final int i ) {
- return ( ( ( _step < 2 ) && ( _step_for_diagnostics < 2 ) ) || ( ( _step_for_diagnostics > 0 ) && ( ( ( i + 1 ) % _step_for_diagnostics ) == 0 ) ) );
- }
-
- private final boolean isPrintMsaStatsWriteOutfileAndRealign( final int i ) {
- return ( ( ( _step < 2 ) && ( _step_for_diagnostics < 2 ) ) || ( ( _step > 0 ) && ( ( ( i + 1 ) % _step ) == 0 ) ) );
- }
-
/**
 * Stores the given value in {@code _gap_ratio} (the field is initialized to -1).
 *
 * @param gap_ratio the new value; not validated here
 */
public final void setGapRatio( final double gap_ratio ) {
_gap_ratio = gap_ratio;
}
+ public final void setMafftOptions( final String maffts_opts ) { // replaces the stored MAFFT options (the field is initialized to "--auto")
+ _maffts_opts = maffts_opts;
+ }
+
/**
 * Stores the given value in {@code _min_length} (the field is initialized to -1).
 *
 * @param min_length the new value; not validated here
 */
public final void setMinLength( final int min_length ) {
_min_length = min_length;
}
_realign = realign;
}
- public final void setStep( final int step ) {
- _step = step;
- }
-
- public final void setStepForDiagnostics( final int step_for_diagnostics ) {
- _step_for_diagnostics = step_for_diagnostics;
+ public final void setRemovedSeqsOutBase( final File removed_seqs_out_base ) { // stores the value in _removed_seqs_out_base (null by default)
+ _removed_seqs_out_base = removed_seqs_out_base;
}
/**
 * Sets the flag {@code _report_aln_mean_identity}. When it is true,
 * printTableHeader adds an "MSA qual" column and msaPropertiesAsSB appends
 * the average identity ratio to each row.
 *
 * @param report_aln_mean_identity the new flag value
 */
public final void setReportAlnMeanIdentity( final boolean report_aln_mean_identity ) {
_report_aln_mean_identity = report_aln_mean_identity;
}
- final public String writeMsa( final File outfile ) throws IOException {
- final Double gr = MsaMethods.calcGapRatio( _msa );
- final String s = outfile + "_" + _msa.getNumberOfSequences() + "_" + _msa.getLength() + "_"
- + ForesterUtil.roundToInt( gr * 100 );
- writeMsa( _msa, s + obtainSuffix(), _output_format );
- return s;
+ public final void setStep( final int step ) { // _step (default -1) is read by isPrintMsaStatsWriteOutfileAndRealign and printTableHeader
+ _step = step;
+ }
+
+ public final void setStepForDiagnostics( final int step_for_diagnostics ) { // _step_for_diagnostics (default -1) is read by isPrintMsaStats and printTableHeader
+ _step_for_diagnostics = step_for_diagnostics;
}
final public String writeAndAlignRemovedSeqs() throws IOException, InterruptedException {
return msg.toString();
}
- private String obtainSuffix() {
- if ( _output_format == MSA_FORMAT.FASTA ) {
- return ".fasta";
- }
- else if ( _output_format == MSA_FORMAT.PHYLIP ) {
- return ".aln";
- }
- return "";
+ /**
+  * Writes the current alignment to a file named after outfile, with the
+  * number of sequences, the alignment length and the gap ratio in percent
+  * (via ForesterUtil.roundToInt) appended, separated by underscores, plus
+  * the suffix from obtainSuffix().
+  *
+  * @param outfile base of the output file name
+  * @return the generated name without the format suffix
+  * @throws IOException if writing fails
+  */
+ final public String writeMsa( final File outfile ) throws IOException {
+ final int gap_percent = ForesterUtil.roundToInt( MsaMethods.calcGapRatio( _msa ) * 100 );
+ final StringBuilder base = new StringBuilder();
+ base.append( outfile );
+ base.append( "_" ).append( _msa.getNumberOfSequences() );
+ base.append( "_" ).append( _msa.getLength() );
+ base.append( "_" ).append( gap_percent );
+ final String name_base = base.toString();
+ writeMsa( _msa, name_base + obtainSuffix(), _output_format );
+ return name_base;
}
final int calcNonGapResidues( final Sequence seq ) {
return phy;
}
+ /**
+  * Tells whether alignment statistics are to be printed after removal number i.
+  * True when both _step and _step_for_diagnostics are below 2, or when
+  * _step_for_diagnostics is positive and ( i + 1 ) is a multiple of it.
+  *
+  * @param i zero-based index of the removal just performed
+  */
+ private final boolean isPrintMsaStats( final int i ) {
+ if ( ( _step < 2 ) && ( _step_for_diagnostics < 2 ) ) {
+ return true;
+ }
+ // The positivity check comes first so the modulo never divides by zero.
+ return ( _step_for_diagnostics > 0 ) && ( ( ( i + 1 ) % _step_for_diagnostics ) == 0 );
+ }
+
+ /**
+  * Tells whether removal number i is a full step.
+  * True when both _step and _step_for_diagnostics are below 2, or when
+  * _step is positive and ( i + 1 ) is a multiple of it.
+  *
+  * @param i zero-based index of the removal just performed
+  */
+ private final boolean isPrintMsaStatsWriteOutfileAndRealign( final int i ) {
+ if ( ( _step < 2 ) && ( _step_for_diagnostics < 2 ) ) {
+ return true;
+ }
+ // The positivity check comes first so the modulo never divides by zero.
+ return ( _step > 0 ) && ( ( ( i + 1 ) % _step ) == 0 );
+ }
+
+ /**
+  * Formats one tab-separated table row: number of sequences, length, gap
+  * ratio (NF_4) and, only when _report_aln_mean_identity is set, the average
+  * identity ratio (NF_4).
+  *
+  * @param msa_properties the properties to format
+  * @return a new StringBuilder holding the row, without a trailing tab or newline
+  */
+ private final StringBuilder msaPropertiesAsSB( final MsaProperties msa_properties ) {
+ final StringBuilder row = new StringBuilder();
+ row.append( msa_properties.getNumberOfSequences() ).append( "\t" );
+ row.append( msa_properties.getLength() ).append( "\t" );
+ row.append( NF_4.format( msa_properties.getGapRatio() ) );
+ if ( _report_aln_mean_identity ) {
+ row.append( "\t" ).append( NF_4.format( msa_properties.getAverageIdentityRatio() ) );
+ }
+ return row;
+ }
+
+ /**
+  * Maps the configured output format to a file suffix: ".fasta" for FASTA,
+  * ".aln" for PHYLIP, and the empty string for anything else.
+  */
+ private String obtainSuffix() {
+ if ( _output_format == MSA_FORMAT.FASTA ) {
+ return ".fasta";
+ }
+ return ( _output_format == MSA_FORMAT.PHYLIP ) ? ".aln" : "";
+ }
+
private final Phylogeny pi( final String matrix ) {
final Phylogeny master_phy = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, _msa, true, matrix );
final int seed = 15;
System.out.print( "\t" );
}
- private final StringBuilder msaPropertiesAsSB( final MsaProperties msa_properties ) {
- final StringBuilder sb = new StringBuilder();
- sb.append( msa_properties.getNumberOfSequences() );
- sb.append( "\t" );
- sb.append( msa_properties.getLength() );
- sb.append( "\t" );
- sb.append( NF_4.format( msa_properties.getGapRatio() ) );
- if ( _report_aln_mean_identity /*msa_properties.getAverageIdentityRatio() >= 0*/) {
- sb.append( "\t" );
- sb.append( NF_4.format( msa_properties.getAverageIdentityRatio() ) );
- }
- return sb;
- }
-
final private MsaProperties printMsaStatsWriteOutfileAndRealign( final boolean realign, final String id )
throws IOException, InterruptedException {
if ( realign ) {
return msa_prop;
}
+ /**
+  * Prints the header line of the statistics table to standard output.
+  * The padded "Id" column is included only when _step or _step_for_diagnostics
+  * is below 2; the "MSA qual" column only when _report_aln_mean_identity is set.
+  * Every column title is followed by a tab.
+  * NOTE(review): the Id condition uses "||" whereas isPrintMsaStats uses "&&"
+  * on the same two fields; confirm this difference is intended.
+  */
+ private final void printTableHeader() {
+ final StringBuilder header = new StringBuilder();
+ if ( ( _step < 2 ) || ( _step_for_diagnostics < 2 ) ) {
+ header.append( ForesterUtil.pad( "Id", _longest_id_length, ' ', false ) ).append( "\t" );
+ }
+ header.append( "Seqs" ).append( "\t" );
+ header.append( "Length" ).append( "\t" );
+ header.append( "Gaps" ).append( "\t" );
+ if ( _report_aln_mean_identity ) {
+ header.append( "MSA qual" ).append( "\t" );
+ }
+ System.out.println( header );
+ }
+
final private void realignWithMafft() throws IOException, InterruptedException {
final MsaInferrer mafft = Mafft.createInstance( _path_to_mafft );
final List<String> opts = new ArrayList<String>();
_msa = DeleteableMsa.createInstance( mafft.infer( _msa.asSequenceList(), opts ) );
}
- public final void setMafftOptions( final String maffts_opts ) {
- _maffts_opts = maffts_opts;
- }
-
/**
 * Delegates to {@code _msa.deleteGapOnlyColumns()}; called by chart after
 * rows have been deleted.
 */
final private void removeGapColumns() {
_msa.deleteGapOnlyColumns();
}
- final private static void writeMsa( final Msa msa, final String outfile, final MSA_FORMAT format )
- throws IOException {
- final Writer w = ForesterUtil.createBufferedWriter( outfile );
- msa.write( w, format );
- w.close();
- }
-
private final String writeOutfile() throws IOException {
final String s = writeMsa( _out_file_base );
return s;
return null;
}
- private final void printTableHeader() {
- if ( ( _step < 2 ) || ( _step_for_diagnostics < 2 ) ) {
- System.out.print( ForesterUtil.pad( "Id", _longest_id_length, ' ', false ) );
- System.out.print( "\t" );
- }
- System.out.print( "Seqs" );
- System.out.print( "\t" );
- System.out.print( "Length" );
- System.out.print( "\t" );
- System.out.print( "Gaps" );
- System.out.print( "\t" );
- if ( _report_aln_mean_identity ) {
- System.out.print( "MSA qual" );
- System.out.print( "\t" );
- }
- System.out.println();
+ final private static void writeMsa( final Msa msa, final String outfile, final MSA_FORMAT format )
+ throws IOException {
+ final Writer w = ForesterUtil.createBufferedWriter( outfile );
+ msa.write( w, format );
+ w.close();
}
}