package org.forester.msa;
+import java.io.File;
import java.io.IOException;
+import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.SortedSet;
import java.util.TreeSet;
+import org.forester.msa.Msa.MSA_FORMAT;
import org.forester.sequence.Sequence;
import org.forester.util.BasicDescriptiveStatistics;
import org.forester.util.DescriptiveStatistics;
public class MsaCompactor {
- private static final boolean VERBOSE = true;
+ private static final boolean VERBOSE = true;
+
+ public static enum SORT_BY {
+ MAX, MEAN, MEDIAN;
+ }
private Msa _msa;
private final SortedSet<String> _removed_seq_ids;
public final static MsaCompactor reduceGapAverage( final Msa msa,
final double max_gap_average,
final int step,
- final boolean realign ) throws IOException, InterruptedException {
+ final boolean realign,
+ final File out,
+ final int minimal_effective_length ) throws IOException,
+ InterruptedException {
final MsaCompactor mc = new MsaCompactor( msa );
- mc.removeViaGapAverage( max_gap_average, step, realign );
+ mc.removeViaGapAverage( max_gap_average, step, realign, out, minimal_effective_length );
return mc;
}
_msa = mafft.infer( _msa.asSequenceList(), opts );
}
- final private void removeViaGapAverage( final double mean_gapiness, final int step, final boolean realign )
- throws IOException, InterruptedException {
+ final private void removeViaGapAverage( final double mean_gapiness,
+ final int step,
+ final boolean realign,
+ final File outfile,
+ final int minimal_effective_length ) throws IOException,
+ InterruptedException {
if ( step < 1 ) {
throw new IllegalArgumentException( "step cannot be less than 1" );
}
throw new IllegalArgumentException( "target average gap ratio cannot be less than 0" );
}
if ( VERBOSE ) {
- System.out.println( "start: " + _msa.getLength() + " "
- + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) );
+ System.out.println( "orig: " + msaStatsAsSB() );
+ }
+ if ( minimal_effective_length > 1 ) {
+ _msa = MsaMethods.removeSequencesByMinimalLength( _msa, minimal_effective_length );
+ if ( VERBOSE ) {
+ System.out.println( "short seq removal: " + msaStatsAsSB() );
+ }
}
int counter = step;
- while ( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean() > mean_gapiness ) {
+ double gr;
+ do {
removeWorstOffenders( step, 1, false );
if ( realign ) {
mafft();
}
+ gr = MsaMethods.calcGapRatio( _msa );
if ( VERBOSE ) {
- System.out.println( counter + ": " + _msa.getLength() + " "
- + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) );
+ System.out.println( counter + ": " + msaStatsAsSB() );
}
+ write( outfile, gr );
counter += step;
+ } while ( gr > mean_gapiness );
+ if ( VERBOSE ) {
+ System.out.println( "final: " + msaStatsAsSB() );
}
}
+ final private void write( final File outfile, final double gr ) throws IOException {
+ writeMsa( outfile + "_" + _msa.getNumberOfSequences() + "_" + _msa.getLength() + "_"
+ + ForesterUtil.roundToInt( gr * 100 ) + ".fasta" );
+ }
+
+ final private void writeMsa( final String outfile ) throws IOException {
+ final Writer w = ForesterUtil.createBufferedWriter( outfile );
+ _msa.write( w, MSA_FORMAT.FASTA );
+ w.close();
+ }
+
+ final private StringBuilder msaStatsAsSB() {
+ final StringBuilder sb = new StringBuilder();
+ sb.append( _msa.getLength() );
+ sb.append( "\t" );
+ sb.append( _msa.getNumberOfSequences() );
+ sb.append( "\t" );
+ sb.append( ForesterUtil.round( MsaMethods.calcGapRatio( _msa ), 4 ) );
+ sb.append( "\t" );
+ return sb;
+ }
+
final private void removeViaLength( final int length, final int step, final boolean realign ) throws IOException,
InterruptedException {
if ( step < 1 ) {
throw new IllegalArgumentException( "target length cannot be less than 1" );
}
if ( VERBOSE ) {
- System.out.println( "start: " + _msa.getLength() + " "
- + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) );
+ System.out.println( "orig: " + msaStatsAsSB() );
}
int counter = step;
while ( _msa.getLength() > length ) {
mafft();
}
if ( VERBOSE ) {
- System.out.println( counter + ": " + _msa.getLength() + " "
- + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) );
+ System.out.println( counter + ": " + msaStatsAsSB() );
}
counter += step;
}
final private DescriptiveStatistics[] calcStats() {
final DescriptiveStatistics stats[] = calc();
sort( stats );
- for( int i = 0; i < stats.length; ++i ) {
- final DescriptiveStatistics s = stats[ i ];
- // System.out.print( s.getDescription() );
- // System.out.print( "\t" );
- // System.out.print( s.arithmeticMean() );
- // System.out.print( "\t(" );
- // System.out.print( s.arithmeticMean() );
- // System.out.print( ")" );
- // System.out.print( "\t" );
- // System.out.print( s.getMin() );
- // System.out.print( "\t" );
- // System.out.print( s.getMax() );
- // System.out.println();
+ for( final DescriptiveStatistics s : stats ) {
}
return stats;
}
private final static void sort( final DescriptiveStatistics stats[] ) {
- Arrays.sort( stats, new DescriptiveStatisticsComparator( false ) );
+ Arrays.sort( stats, new DescriptiveStatisticsComparator( false, SORT_BY.MAX ) );
}
private final DescriptiveStatistics[] calc() {
final static class DescriptiveStatisticsComparator implements Comparator<DescriptiveStatistics> {
- final boolean _ascending;
+ final private boolean _ascending;
+ final private SORT_BY _sort_by;
- public DescriptiveStatisticsComparator( final boolean ascending ) {
+ public DescriptiveStatisticsComparator( final boolean ascending, final SORT_BY sort_by ) {
_ascending = ascending;
+ _sort_by = sort_by;
}
@Override
public final int compare( final DescriptiveStatistics s0, final DescriptiveStatistics s1 ) {
- if ( s0.arithmeticMean() < s1.arithmeticMean() ) {
- return _ascending ? -1 : 1;
- }
- else if ( s0.arithmeticMean() > s1.arithmeticMean() ) {
- return _ascending ? 1 : -1;
+ switch ( _sort_by ) {
+ case MAX:
+ if ( s0.getMax() < s1.getMax() ) {
+ return _ascending ? -1 : 1;
+ }
+ else if ( s0.getMax() > s1.getMax() ) {
+ return _ascending ? 1 : -1;
+ }
+ return 0;
+ case MEAN:
+ if ( s0.arithmeticMean() < s1.arithmeticMean() ) {
+ return _ascending ? -1 : 1;
+ }
+ else if ( s0.arithmeticMean() > s1.arithmeticMean() ) {
+ return _ascending ? 1 : -1;
+ }
+ return 0;
+ case MEDIAN:
+ if ( s0.median() < s1.median() ) {
+ return _ascending ? -1 : 1;
+ }
+ else if ( s0.median() > s1.median() ) {
+ return _ascending ? 1 : -1;
+ }
+ return 0;
+ default:
+ return 0;
}
- return 0;
}
}
}