final static private String HELP_OPTION_2 = "h";
final static private String REMOVE_WORST_OFFENDERS_OPTION = "w";
final static private String AV_GAPINESS_OPTION = "a";
+ final static private String STEP_OPTION = "s";
final static private String LENGTH_OPTION = "l";
final static private String REALIGN_OPTION = "r";
final static private String PRG_NAME = "msa_compactor";
public static void main( final String args[] ) {
try {
final CommandLineArguments cla = new CommandLineArguments( args );
- if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) ) {
+ if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( cla.getNumberOfNames() != 2 ) ) {
printHelp();
System.exit( 0 );
}
final File in = cla.getFile( 0 );
+ final File out = cla.getFile( 1 );
int worst_remove = -1;
double av = -1;
int length = -1;
- final int step = 5;
+ int step = 1;
boolean realign = false;
- // int to = 0;
- // int window = 0;
- // int step = 0;
final List<String> allowed_options = new ArrayList<String>();
allowed_options.add( REMOVE_WORST_OFFENDERS_OPTION );
allowed_options.add( AV_GAPINESS_OPTION );
allowed_options.add( LENGTH_OPTION );
allowed_options.add( REALIGN_OPTION );
+ allowed_options.add( STEP_OPTION );
final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
if ( dissallowed_options.length() > 0 ) {
ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options );
if ( cla.isOptionSet( LENGTH_OPTION ) ) {
length = cla.getOptionValueAsInt( LENGTH_OPTION );
}
+ if ( cla.isOptionSet( STEP_OPTION ) ) {
+ step = cla.getOptionValueAsInt( STEP_OPTION );
+ }
if ( cla.isOptionSet( REALIGN_OPTION ) ) {
realign = true;
}
else {
msa = GeneralMsaParser.parse( is );
}
- System.out.println( msa.toString() );
- System.out.println( MsaMethods.calcBasicGapinessStatistics( msa ).arithmeticMean() );
MsaCompactor mc = null;
if ( worst_remove > 0 ) {
mc = MsaCompactor.removeWorstOffenders( msa, worst_remove, realign );
}
else if ( av > 0 ) {
- mc = MsaCompactor.reduceGapAverage( msa, av, step, realign );
+ mc = MsaCompactor.reduceGapAverage( msa, av, step, realign, out, 50 );
}
else if ( length > 0 ) {
mc = MsaCompactor.reduceLength( msa, length, step, realign );
}
- System.out.println( mc.getMsa().toString() );
- System.out.println( MsaMethods.calcBasicGapinessStatistics( mc.getMsa() ).arithmeticMean() );
+ System.out.println( MsaMethods.calcGapRatio( mc.getMsa() ) );
for( final String id : mc.getRemovedSeqIds() ) {
System.out.println( id );
}
import org.forester.msa.ClustalOmega;
import org.forester.msa.Mafft;
import org.forester.msa.Msa;
+import org.forester.msa.Msa.MSA_FORMAT;
import org.forester.msa.MsaInferrer;
import org.forester.msa.MsaMethods;
import org.forester.msa.ResampleableMsa;
}
if ( DEBUG ) {
System.out.println( msa.toString() );
- System.out.println( MsaMethods.calcBasicGapinessStatistics( msa ).toString() );
+ System.out.println( MsaMethods.calcGapRatio( msa ) );
}
final MsaMethods msa_tools = MsaMethods.createInstance();
if ( _options.isExecuteMsaProcessing() ) {
if ( DEBUG ) {
System.out.println( msa_tools.getIgnoredSequenceIds() );
System.out.println( msa.toString() );
- System.out.println( MsaMethods.calcBasicGapinessStatistics( msa ).toString() );
+ System.out.println( MsaMethods.calcGapRatio( msa ) );
}
_msa = msa;
}
try {
final BufferedWriter msa_writer = new BufferedWriter( new FileWriter( _options.getIntermediateFilesBase()
+ MSA_FILE_SUFFIX ) );
- _msa.write( msa_writer );
+ _msa.write( msa_writer, MSA_FORMAT.PHYLIP );
msa_writer.close();
final BufferedWriter pwd_writer = new BufferedWriter( new FileWriter( _options.getIntermediateFilesBase()
+ PWD_FILE_SUFFIX ) );
import java.util.List;
import java.util.Set;
+import org.forester.io.writers.SequenceWriter;
+import org.forester.io.writers.SequenceWriter.SEQ_FORMAT;
import org.forester.sequence.BasicSequence;
import org.forester.sequence.Sequence;
import org.forester.sequence.Sequence.TYPE;
}
@Override
- public void write( final Writer w ) throws IOException {
+ public void write( final Writer w, final MSA_FORMAT format ) throws IOException {
+ switch ( format ) {
+ case PHYLIP:
+ writeToPhylip( w );
+ break;
+ case FASTA:
+ writeToFasta( w );
+ break;
+ default:
+ throw new RuntimeException( "unknown format " + format );
+ }
+ }
+
+ private void writeToFasta( final Writer w ) throws IOException {
+ SequenceWriter.writeSeqs( asSequenceList(), w, SEQ_FORMAT.FASTA, 100 );
+ }
+
+ private void writeToPhylip( final Writer w ) throws IOException {
final int max = determineMaxIdLength() + 1;
for( int row = 0; row < _data.length; ++row ) {
w.write( ForesterUtil.pad( _identifiers[ row ].toString(), max, ' ', false ).toString() );
public interface Msa {
+ public static enum MSA_FORMAT {
+ FASTA, PHYLIP;
+ }
+
public String getIdentifier( int row );
public void setIdentifier( int row, String identifier );
public void setResidueAt( final int row, final int col, final char residue );
- public void write( Writer w ) throws IOException;
+ public void write( Writer w, MSA_FORMAT format ) throws IOException;
}
package org.forester.msa;
+import java.io.File;
import java.io.IOException;
+import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.SortedSet;
import java.util.TreeSet;
+import org.forester.msa.Msa.MSA_FORMAT;
import org.forester.sequence.Sequence;
import org.forester.util.BasicDescriptiveStatistics;
import org.forester.util.DescriptiveStatistics;
public class MsaCompactor {
- private static final boolean VERBOSE = true;
+ private static final boolean VERBOSE = true;
+
+ public static enum SORT_BY {
+ MAX, MEAN, MEDIAN;
+ }
private Msa _msa;
private final SortedSet<String> _removed_seq_ids;
public final static MsaCompactor reduceGapAverage( final Msa msa,
final double max_gap_average,
final int step,
- final boolean realign ) throws IOException, InterruptedException {
+ final boolean realign,
+ final File out,
+ final int minimal_effective_length ) throws IOException,
+ InterruptedException {
final MsaCompactor mc = new MsaCompactor( msa );
- mc.removeViaGapAverage( max_gap_average, step, realign );
+ mc.removeViaGapAverage( max_gap_average, step, realign, out, minimal_effective_length );
return mc;
}
_msa = mafft.infer( _msa.asSequenceList(), opts );
}
- final private void removeViaGapAverage( final double mean_gapiness, final int step, final boolean realign )
- throws IOException, InterruptedException {
+ final private void removeViaGapAverage( final double mean_gapiness,
+ final int step,
+ final boolean realign,
+ final File outfile,
+ final int minimal_effective_length ) throws IOException,
+ InterruptedException {
if ( step < 1 ) {
throw new IllegalArgumentException( "step cannot be less than 1" );
}
throw new IllegalArgumentException( "target average gap ratio cannot be less than 0" );
}
if ( VERBOSE ) {
- System.out.println( "start: " + _msa.getLength() + " "
- + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) );
+ System.out.println( "orig: " + msaStatsAsSB() );
+ }
+ if ( minimal_effective_length > 1 ) {
+ _msa = MsaMethods.removeSequencesByMinimalLength( _msa, minimal_effective_length );
+ if ( VERBOSE ) {
+ System.out.println( "short seq removal: " + msaStatsAsSB() );
+ }
}
int counter = step;
- while ( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean() > mean_gapiness ) {
+ double gr;
+ do {
removeWorstOffenders( step, 1, false );
if ( realign ) {
mafft();
}
+ gr = MsaMethods.calcGapRatio( _msa );
if ( VERBOSE ) {
- System.out.println( counter + ": " + _msa.getLength() + " "
- + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) );
+ System.out.println( counter + ": " + msaStatsAsSB() );
}
+ write( outfile, gr );
counter += step;
+ } while ( gr > mean_gapiness );
+ if ( VERBOSE ) {
+ System.out.println( "final: " + msaStatsAsSB() );
}
}
+ final private void write( final File outfile, final double gr ) throws IOException {
+ writeMsa( outfile + "_" + _msa.getNumberOfSequences() + "_" + _msa.getLength() + "_" + gr + ".aln" );
+ }
+
+ final private void writeMsa( final String outfile ) throws IOException {
+ final Writer w = ForesterUtil.createBufferedWriter( outfile );
+ _msa.write( w, MSA_FORMAT.PHYLIP );
+ w.close();
+ }
+
+ final private StringBuilder msaStatsAsSB() {
+ final StringBuilder sb = new StringBuilder();
+ sb.append( _msa.getLength() );
+ sb.append( "\t" );
+ sb.append( _msa.getNumberOfSequences() );
+ sb.append( "\t" );
+ sb.append( ForesterUtil.round( MsaMethods.calcGapRatio( _msa ), 4 ) );
+ sb.append( "\t" );
+ return sb;
+ }
+
final private void removeViaLength( final int length, final int step, final boolean realign ) throws IOException,
InterruptedException {
if ( step < 1 ) {
throw new IllegalArgumentException( "target length cannot be less than 1" );
}
if ( VERBOSE ) {
- System.out.println( "start: " + _msa.getLength() + " "
- + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) );
+ System.out.println( "orig: " + msaStatsAsSB() );
}
int counter = step;
while ( _msa.getLength() > length ) {
mafft();
}
if ( VERBOSE ) {
- System.out.println( counter + ": " + _msa.getLength() + " "
- + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) );
+ System.out.println( counter + ": " + msaStatsAsSB() );
}
counter += step;
}
}
private final static void sort( final DescriptiveStatistics stats[] ) {
- Arrays.sort( stats, new DescriptiveStatisticsComparator( false ) );
+ Arrays.sort( stats, new DescriptiveStatisticsComparator( false, SORT_BY.MAX ) );
}
private final DescriptiveStatistics[] calc() {
final static class DescriptiveStatisticsComparator implements Comparator<DescriptiveStatistics> {
- final boolean _ascending;
+ final private boolean _ascending;
+ final private SORT_BY _sort_by;
- public DescriptiveStatisticsComparator( final boolean ascending ) {
+ public DescriptiveStatisticsComparator( final boolean ascending, final SORT_BY sort_by ) {
_ascending = ascending;
+ _sort_by = sort_by;
}
@Override
public final int compare( final DescriptiveStatistics s0, final DescriptiveStatistics s1 ) {
- if ( s0.arithmeticMean() < s1.arithmeticMean() ) {
- return _ascending ? -1 : 1;
- }
- else if ( s0.arithmeticMean() > s1.arithmeticMean() ) {
- return _ascending ? 1 : -1;
+ switch ( _sort_by ) {
+ case MAX:
+ if ( s0.getMax() < s1.getMax() ) {
+ return _ascending ? -1 : 1;
+ }
+ else if ( s0.getMax() > s1.getMax() ) {
+ return _ascending ? 1 : -1;
+ }
+ return 0;
+ case MEAN:
+ if ( s0.arithmeticMean() < s1.arithmeticMean() ) {
+ return _ascending ? -1 : 1;
+ }
+ else if ( s0.arithmeticMean() > s1.arithmeticMean() ) {
+ return _ascending ? 1 : -1;
+ }
+ return 0;
+ case MEDIAN:
+ if ( s0.median() < s1.median() ) {
+ return _ascending ? -1 : 1;
+ }
+ else if ( s0.median() > s1.median() ) {
+ return _ascending ? 1 : -1;
+ }
+ return 0;
+ default:
+ return 0;
}
- return 0;
}
}
}
return BasicMsa.createInstance( seqs );
}
+ final public static Msa removeSequencesByRow( final Msa msa, final List<Integer> to_remove_rows ) {
+ final List<Sequence> seqs = new ArrayList<Sequence>();
+ for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+ if ( !to_remove_rows.contains( row ) ) {
+ seqs.add( BasicSequence.copySequence( msa.getSequence( row ) ) );
+ }
+ }
+ if ( seqs.size() < 1 ) {
+ return null;
+ }
+ return BasicMsa.createInstance( seqs );
+ }
+
synchronized final public Msa removeGapColumns( final double max_allowed_gap_ratio,
final int min_allowed_length,
final Msa msa ) {
}
return stats;
}
+
+ public static Msa removeSequencesByMinimalLength( final Msa msa, final int min_effective_length ) {
+ final List<Integer> to_remove_rows = new ArrayList<Integer>();
+ for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) {
+ int eff_length = 0;
+ for( int i = 0; i < msa.getLength(); ++i ) {
+ if ( msa.getResidueAt( seq, i ) != Sequence.GAP ) {
+ eff_length++;
+ }
+ }
+ if ( eff_length < min_effective_length ) {
+ to_remove_rows.add( seq );
+ }
+ }
+ return removeSequencesByRow( msa, to_remove_rows );
+ }
+
+ public static double calcGapRatio( final Msa msa ) {
+ int gaps = 0;
+ for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) {
+ for( int i = 0; i < msa.getLength(); ++i ) {
+ if ( msa.getResidueAt( seq, i ) == Sequence.GAP ) {
+ gaps++;
+ }
+ }
+ }
+ return ( double ) gaps / ( msa.getLength() * msa.getNumberOfSequences() );
+ }
}