X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fmsa%2FMsaCompactor.java;h=a66c09b2dc6068e5c16b8e5639c40ca4ef47ea2d;hb=ea402c168ecf73c1824d023a91331494939f2641;hp=f6db9a05e770d2e63c13ac1939a44dd25415583a;hpb=10932f9386c7e444b2dffbac779282d5d1cc16a7;p=jalview.git diff --git a/forester/java/src/org/forester/msa/MsaCompactor.java b/forester/java/src/org/forester/msa/MsaCompactor.java index f6db9a0..a66c09b 100644 --- a/forester/java/src/org/forester/msa/MsaCompactor.java +++ b/forester/java/src/org/forester/msa/MsaCompactor.java @@ -1,28 +1,210 @@ + package org.forester.msa; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.forester.sequence.Sequence; +import org.forester.util.BasicDescriptiveStatistics; +import org.forester.util.DescriptiveStatistics; +import org.forester.util.ForesterUtil; public class MsaCompactor { - - final private Msa _msa; - - public MsaCompactor( Msa msa ) { - + + private static final boolean VERBOSE = true; + private Msa _msa; + private final SortedSet _removed_seq_ids; + + private MsaCompactor( final Msa msa ) { _msa = msa; + _removed_seq_ids = new TreeSet(); + } + + final public SortedSet getRemovedSeqIds() { + return _removed_seq_ids; + } + + final public Msa getMsa() { + return _msa; + } + + public final static MsaCompactor removeWorstOffenders( final Msa msa, + final int worst_offenders_to_remove, + final boolean realign ) throws IOException, + InterruptedException { + final MsaCompactor mc = new MsaCompactor( msa ); + mc.removeWorstOffenders( worst_offenders_to_remove, 1, realign ); + return mc; + } + + public final static MsaCompactor reduceGapAverage( final Msa msa, + final double max_gap_average, + final int step, + final boolean realign ) throws IOException, InterruptedException { + final MsaCompactor mc = new MsaCompactor( msa ); + mc.removeViaGapAverage( max_gap_average, step, realign ); + return mc; + } + + public final static MsaCompactor reduceLength( final Msa msa, + final int length, + final int step, + final boolean realign ) throws IOException, InterruptedException { + final MsaCompactor mc = new MsaCompactor( msa ); + mc.removeViaLength( length, step, realign ); + return mc; + } + + final private void removeGapColumns() { + _msa = MsaMethods.createInstance().removeGapColumns( 1, 0, _msa ); + } + + final private void removeWorstOffenders( final int to_remove, final int step, final boolean realign ) + throws IOException, InterruptedException { + final DescriptiveStatistics stats[] = calcStats(); + final List to_remove_ids = new ArrayList(); + for( int j = 0; j < to_remove; ++j ) { + to_remove_ids.add( stats[ j ].getDescription() ); + _removed_seq_ids.add( stats[ j ].getDescription() ); + } + _msa = MsaMethods.removeSequences( _msa, to_remove_ids ); + removeGapColumns(); + if ( realign ) { + mafft(); + } + } + + final private void mafft() throws IOException, InterruptedException { + final MsaInferrer mafft = Mafft.createInstance( "/home/czmasek/bin/mafft" ); + final List opts = new ArrayList(); + // opts.add( "--maxiterate" ); + // opts.add( "1000" ); + // opts.add( "--localpair" ); + opts.add( "--quiet" ); + _msa = mafft.infer( _msa.asSequenceList(), opts ); + } + + final private void removeViaGapAverage( final double mean_gapiness, final int step, final boolean realign ) + throws IOException, InterruptedException { + if ( step < 1 ) { + throw new IllegalArgumentException( "step cannot be less than 1" ); + } + if ( mean_gapiness < 0 ) { + throw new IllegalArgumentException( "target average gap ratio cannot be less than 0" ); + } + if ( VERBOSE ) { + System.out.println( "start: " + _msa.getLength() + " " + + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) ); + } + int counter = step; + while ( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean() > mean_gapiness ) { + removeWorstOffenders( step, 1, false ); + if ( realign ) { + mafft(); + } + if ( VERBOSE ) { + System.out.println( counter + ": " + _msa.getLength() + " " + + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) ); + } + counter += step; + } } - - - - private calc - - private double[] calcGappiness() { + + final private void removeViaLength( final int length, final int step, final boolean realign ) throws IOException, + InterruptedException { + if ( step < 1 ) { + throw new IllegalArgumentException( "step cannot be less than 1" ); + } + if ( length < 11 ) { + throw new IllegalArgumentException( "target length cannot be less than 1" ); + } + if ( VERBOSE ) { + System.out.println( "start: " + _msa.getLength() + " " + + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) ); + } + int counter = step; + while ( _msa.getLength() > length ) { + removeWorstOffenders( step, 1, false ); + if ( realign ) { + mafft(); + } + if ( VERBOSE ) { + System.out.println( counter + ": " + _msa.getLength() + " " + + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) ); + } + counter += step; + } + } + + final private DescriptiveStatistics[] calcStats() { + final DescriptiveStatistics stats[] = calc(); + sort( stats ); + for( int i = 0; i < stats.length; ++i ) { + final DescriptiveStatistics s = stats[ i ]; + // System.out.print( s.getDescription() ); + // System.out.print( "\t" ); + // System.out.print( s.arithmeticMean() ); + // System.out.print( "\t(" ); + // System.out.print( s.arithmeticMean() ); + // System.out.print( ")" ); + // System.out.print( "\t" ); + // System.out.print( s.getMin() ); + // System.out.print( "\t" ); + // System.out.print( s.getMax() ); + // System.out.println(); + } + return stats; + } + + private final static void sort( final DescriptiveStatistics stats[] ) { + Arrays.sort( stats, new DescriptiveStatisticsComparator( false ) ); + } + + private final DescriptiveStatistics[] calc() { + final double gappiness[] = calcGappiness(); + final DescriptiveStatistics stats[] = new DescriptiveStatistics[ _msa.getNumberOfSequences() ]; + for( int row = 0; row < _msa.getNumberOfSequences(); ++row ) { + stats[ row ] = new BasicDescriptiveStatistics( _msa.getIdentifier( row ) ); + for( int col = 0; col < _msa.getLength(); ++col ) { + if ( _msa.getResidueAt( row, col ) != Sequence.GAP ) { + stats[ row ].addValue( gappiness[ col ] ); + } + } + } + return stats; + } + + private final double[] calcGappiness() { final double gappiness[] = new double[ _msa.getLength() ]; - final int seqs = _msa.getNumberOfSequences(); + final int seqs = _msa.getNumberOfSequences(); for( int i = 0; i < gappiness.length; ++i ) { - gappiness[ i ] = ( double ) MsaMethods.calcGapSumPerColumn( _msa, i ) / _msa.getNumberOfSequences(); - + gappiness[ i ] = ( double ) MsaMethods.calcGapSumPerColumn( _msa, i ) / seqs; } return gappiness; - } - + + final static class DescriptiveStatisticsComparator implements Comparator { + + final boolean _ascending; + + public DescriptiveStatisticsComparator( final boolean ascending ) { + _ascending = ascending; + } + + @Override + public final int compare( final DescriptiveStatistics s0, final DescriptiveStatistics s1 ) { + if ( s0.arithmeticMean() < s1.arithmeticMean() ) { + return _ascending ? -1 : 1; + } + else if ( s0.arithmeticMean() > s1.arithmeticMean() ) { + return _ascending ? 1 : -1; + } + return 0; + } + } }