X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fmsa%2FMsaCompactor.java;h=73f0c6300e06da30934abd258a78d3820d25dbda;hb=3d6a64e0371ffedf725bf8467211ed860f298550;hp=9172696eaf9acf4fee3ab7973a56c9197918d28f;hpb=b1a74bd076c917d428002284a53a2c6f390226fb;p=jalview.git diff --git a/forester/java/src/org/forester/msa/MsaCompactor.java b/forester/java/src/org/forester/msa/MsaCompactor.java index 9172696..73f0c63 100644 --- a/forester/java/src/org/forester/msa/MsaCompactor.java +++ b/forester/java/src/org/forester/msa/MsaCompactor.java @@ -1,7 +1,9 @@ package org.forester.msa; +import java.io.File; import java.io.IOException; +import java.io.Writer; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; @@ -9,6 +11,7 @@ import java.util.List; import java.util.SortedSet; import java.util.TreeSet; +import org.forester.msa.Msa.MSA_FORMAT; import org.forester.sequence.Sequence; import org.forester.util.BasicDescriptiveStatistics; import org.forester.util.DescriptiveStatistics; @@ -16,7 +19,11 @@ import org.forester.util.ForesterUtil; public class MsaCompactor { - private static final boolean VERBOSE = true; + private static final boolean VERBOSE = true; + + public static enum SORT_BY { + MAX, MEAN, MEDIAN; + } private Msa _msa; private final SortedSet _removed_seq_ids; @@ -45,9 +52,12 @@ public class MsaCompactor { public final static MsaCompactor reduceGapAverage( final Msa msa, final double max_gap_average, final int step, - final boolean realign ) throws IOException, InterruptedException { + final boolean realign, + final File out, + final int minimal_effective_length ) throws IOException, + InterruptedException { final MsaCompactor mc = new MsaCompactor( msa ); - mc.removeViaGapAverage( max_gap_average, step, realign ); + mc.removeViaGapAverage( max_gap_average, step, realign, out, minimal_effective_length ); return mc; } @@ -82,48 +92,94 @@ public class MsaCompactor { final private void mafft() throws IOException, InterruptedException { final MsaInferrer mafft = Mafft.createInstance( "/home/czmasek/bin/mafft" ); final List opts = new ArrayList(); - opts.add( "--maxiterate" ); - opts.add( "1000" ); - opts.add( "--localpair" ); + // opts.add( "--maxiterate" ); + // opts.add( "1000" ); + // opts.add( "--localpair" ); opts.add( "--quiet" ); _msa = mafft.infer( _msa.asSequenceList(), opts ); } - final private void removeViaGapAverage( final double mean_gapiness, final int step, final boolean realign ) - throws IOException, InterruptedException { + final private void removeViaGapAverage( final double mean_gapiness, + final int step, + final boolean realign, + final File outfile, + final int minimal_effective_length ) throws IOException, + InterruptedException { + if ( step < 1 ) { + throw new IllegalArgumentException( "step cannot be less than 1" ); + } + if ( mean_gapiness < 0 ) { + throw new IllegalArgumentException( "target average gap ratio cannot be less than 0" ); + } if ( VERBOSE ) { - System.out.println( "start: " + _msa.getLength() + " " - + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) ); + System.out.println( "orig: " + msaStatsAsSB() ); + } + if ( minimal_effective_length > 1 ) { + _msa = MsaMethods.removeSequencesByMinimalLength( _msa, minimal_effective_length ); + if ( VERBOSE ) { + System.out.println( "short seq removal: " + msaStatsAsSB() ); + } } - int counter = 0; - while ( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean() > mean_gapiness ) { + int counter = step; + double gr; + do { removeWorstOffenders( step, 1, false ); if ( realign ) { mafft(); } + gr = MsaMethods.calcGapRatio( _msa ); if ( VERBOSE ) { - System.out.println( counter + ": " + _msa.getLength() + " " - + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) ); + System.out.println( counter + ": " + msaStatsAsSB() ); } + write( outfile, gr ); counter += step; + } while ( gr > mean_gapiness ); + if ( VERBOSE ) { + System.out.println( "final: " + msaStatsAsSB() ); } } + final private void write( final File outfile, final double gr ) throws IOException { + writeMsa( outfile + "_" + _msa.getNumberOfSequences() + "_" + _msa.getLength() + "_" + + ForesterUtil.roundToInt( gr * 100 ) + ".fasta" ); + } + + final private void writeMsa( final String outfile ) throws IOException { + final Writer w = ForesterUtil.createBufferedWriter( outfile ); + _msa.write( w, MSA_FORMAT.FASTA ); + w.close(); + } + + final private StringBuilder msaStatsAsSB() { + final StringBuilder sb = new StringBuilder(); + sb.append( _msa.getLength() ); + sb.append( "\t" ); + sb.append( _msa.getNumberOfSequences() ); + sb.append( "\t" ); + sb.append( ForesterUtil.round( MsaMethods.calcGapRatio( _msa ), 4 ) ); + sb.append( "\t" ); + return sb; + } + final private void removeViaLength( final int length, final int step, final boolean realign ) throws IOException, InterruptedException { + if ( step < 1 ) { + throw new IllegalArgumentException( "step cannot be less than 1" ); + } + if ( length < 11 ) { + throw new IllegalArgumentException( "target length cannot be less than 1" ); + } if ( VERBOSE ) { - System.out.println( "start: " + _msa.getLength() + " " - + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) ); + System.out.println( "orig: " + msaStatsAsSB() ); } - int counter = 0; + int counter = step; while ( _msa.getLength() > length ) { removeWorstOffenders( step, 1, false ); if ( realign ) { mafft(); } if ( VERBOSE ) { - System.out.println( counter + ": " + _msa.getLength() + " " - + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) ); + System.out.println( counter + ": " + msaStatsAsSB() ); } counter += step; } @@ -132,25 +188,13 @@ public class MsaCompactor { final private DescriptiveStatistics[] calcStats() { final DescriptiveStatistics stats[] = calc(); sort( stats ); - for( int i = 0; i < stats.length; ++i ) { - final DescriptiveStatistics s = stats[ i ]; - // System.out.print( s.getDescription() ); - // System.out.print( "\t" ); - // System.out.print( s.arithmeticMean() ); - // System.out.print( "\t(" ); - // System.out.print( s.arithmeticMean() ); - // System.out.print( ")" ); - // System.out.print( "\t" ); - // System.out.print( s.getMin() ); - // System.out.print( "\t" ); - // System.out.print( s.getMax() ); - // System.out.println(); + for( final DescriptiveStatistics s : stats ) { } return stats; } private final static void sort( final DescriptiveStatistics stats[] ) { - Arrays.sort( stats, new DescriptiveStatisticsComparator( false ) ); + Arrays.sort( stats, new DescriptiveStatisticsComparator( false, SORT_BY.MAX ) ); } private final DescriptiveStatistics[] calc() { @@ -178,21 +222,44 @@ public class MsaCompactor { final static class DescriptiveStatisticsComparator implements Comparator { - final boolean _ascending; + final private boolean _ascending; + final private SORT_BY _sort_by; - public DescriptiveStatisticsComparator( final boolean ascending ) { + public DescriptiveStatisticsComparator( final boolean ascending, final SORT_BY sort_by ) { _ascending = ascending; + _sort_by = sort_by; } @Override public final int compare( final DescriptiveStatistics s0, final DescriptiveStatistics s1 ) { - if ( s0.arithmeticMean() < s1.arithmeticMean() ) { - return _ascending ? -1 : 1; - } - else if ( s0.arithmeticMean() > s1.arithmeticMean() ) { - return _ascending ? 1 : -1; + switch ( _sort_by ) { + case MAX: + if ( s0.getMax() < s1.getMax() ) { + return _ascending ? -1 : 1; + } + else if ( s0.getMax() > s1.getMax() ) { + return _ascending ? 1 : -1; + } + return 0; + case MEAN: + if ( s0.arithmeticMean() < s1.arithmeticMean() ) { + return _ascending ? -1 : 1; + } + else if ( s0.arithmeticMean() > s1.arithmeticMean() ) { + return _ascending ? 1 : -1; + } + return 0; + case MEDIAN: + if ( s0.median() < s1.median() ) { + return _ascending ? -1 : 1; + } + else if ( s0.median() > s1.median() ) { + return _ascending ? 1 : -1; + } + return 0; + default: + return 0; } - return 0; } } }