X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fmsa%2FMsaCompactor.java;h=64c69577ba386ce56606f325e5521851ab26800f;hb=862ca59d36af9ccbed3ef284b497f9c04263ba97;hp=f6db9a05e770d2e63c13ac1939a44dd25415583a;hpb=10932f9386c7e444b2dffbac779282d5d1cc16a7;p=jalview.git diff --git a/forester/java/src/org/forester/msa/MsaCompactor.java b/forester/java/src/org/forester/msa/MsaCompactor.java index f6db9a0..64c6957 100644 --- a/forester/java/src/org/forester/msa/MsaCompactor.java +++ b/forester/java/src/org/forester/msa/MsaCompactor.java @@ -1,28 +1,277 @@ + package org.forester.msa; +import java.io.File; +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.forester.msa.Msa.MSA_FORMAT; +import org.forester.sequence.Sequence; +import org.forester.util.BasicDescriptiveStatistics; +import org.forester.util.DescriptiveStatistics; +import org.forester.util.ForesterUtil; public class MsaCompactor { - - final private Msa _msa; - - public MsaCompactor( Msa msa ) { - + + private static final boolean VERBOSE = true; + + public static enum SORT_BY { + MAX, MEAN, MEDIAN; + } + private Msa _msa; + private final SortedSet _removed_seq_ids; + + private MsaCompactor( final Msa msa ) { _msa = msa; + _removed_seq_ids = new TreeSet(); + } + + final public SortedSet getRemovedSeqIds() { + return _removed_seq_ids; + } + + final public Msa getMsa() { + return _msa; + } + + public final static MsaCompactor removeWorstOffenders( final Msa msa, + final int worst_offenders_to_remove, + final boolean realign ) throws IOException, + InterruptedException { + final MsaCompactor mc = new MsaCompactor( msa ); + mc.removeWorstOffenders( worst_offenders_to_remove, 1, realign ); + return mc; + } + + public final static MsaCompactor reduceGapAverage( final Msa msa, + final double max_gap_average, + final int step, + final boolean realign, + final File out, + final int minimal_effective_length ) throws IOException, + InterruptedException { + final MsaCompactor mc = new MsaCompactor( msa ); + mc.removeViaGapAverage( max_gap_average, step, realign, out, minimal_effective_length ); + return mc; + } + + public final static MsaCompactor reduceLength( final Msa msa, + final int length, + final int step, + final boolean realign ) throws IOException, InterruptedException { + final MsaCompactor mc = new MsaCompactor( msa ); + mc.removeViaLength( length, step, realign ); + return mc; + } + + final private void removeGapColumns() { + _msa = MsaMethods.createInstance().removeGapColumns( 1, 0, _msa ); + } + + final private void removeWorstOffenders( final int to_remove, final int step, final boolean realign ) + throws IOException, InterruptedException { + final DescriptiveStatistics stats[] = calcStats(); + final List to_remove_ids = new ArrayList(); + for( int j = 0; j < to_remove; ++j ) { + to_remove_ids.add( stats[ j ].getDescription() ); + _removed_seq_ids.add( stats[ j ].getDescription() ); + } + _msa = MsaMethods.removeSequences( _msa, to_remove_ids ); + removeGapColumns(); + if ( realign ) { + mafft(); + } } - - - - private calc - - private double[] calcGappiness() { + + final private void mafft() throws IOException, InterruptedException { + final MsaInferrer mafft = Mafft.createInstance( "/home/czmasek/bin/mafft" ); + final List opts = new ArrayList(); + // opts.add( "--maxiterate" ); + // opts.add( "1000" ); + // opts.add( "--localpair" ); + opts.add( "--quiet" ); + _msa = mafft.infer( _msa.asSequenceList(), opts ); + } + + final private void removeViaGapAverage( final double mean_gapiness, + final int step, + final boolean realign, + final File outfile, + final int minimal_effective_length ) throws IOException, + InterruptedException { + if ( step < 1 ) { + throw new IllegalArgumentException( "step cannot be less than 1" ); + } + if ( mean_gapiness < 0 ) { + throw new IllegalArgumentException( "target average gap ratio cannot be less than 0" ); + } + if ( VERBOSE ) { + System.out.println( "orig: " + msaStatsAsSB() ); + } + if ( minimal_effective_length > 1 ) { + _msa = MsaMethods.removeSequencesByMinimalLength( _msa, minimal_effective_length ); + if ( VERBOSE ) { + System.out.println( "short seq removal: " + msaStatsAsSB() ); + } + } + int counter = step; + double gr; + do { + removeWorstOffenders( step, 1, false ); + if ( realign ) { + mafft(); + } + gr = MsaMethods.calcGapRatio( _msa ); + if ( VERBOSE ) { + System.out.println( counter + ": " + msaStatsAsSB() ); + } + write( outfile, gr ); + counter += step; + } while ( gr > mean_gapiness ); + if ( VERBOSE ) { + System.out.println( "final: " + msaStatsAsSB() ); + } + } + + final private void write( final File outfile, final double gr ) throws IOException { + writeMsa( outfile + "_" + _msa.getNumberOfSequences() + "_" + _msa.getLength() + "_" + + ForesterUtil.roundToInt( gr * 100 ) + ".fasta" ); + } + + final private void writeMsa( final String outfile ) throws IOException { + final Writer w = ForesterUtil.createBufferedWriter( outfile ); + _msa.write( w, MSA_FORMAT.FASTA ); + w.close(); + } + + final private StringBuilder msaStatsAsSB() { + final StringBuilder sb = new StringBuilder(); + sb.append( _msa.getLength() ); + sb.append( "\t" ); + sb.append( _msa.getNumberOfSequences() ); + sb.append( "\t" ); + sb.append( ForesterUtil.round( MsaMethods.calcGapRatio( _msa ), 4 ) ); + sb.append( "\t" ); + return sb; + } + + final private void removeViaLength( final int length, final int step, final boolean realign ) throws IOException, + InterruptedException { + if ( step < 1 ) { + throw new IllegalArgumentException( "step cannot be less than 1" ); + } + if ( length < 11 ) { + throw new IllegalArgumentException( "target length cannot be less than 1" ); + } + if ( VERBOSE ) { + System.out.println( "orig: " + msaStatsAsSB() ); + } + int counter = step; + while ( _msa.getLength() > length ) { + removeWorstOffenders( step, 1, false ); + if ( realign ) { + mafft(); + } + if ( VERBOSE ) { + System.out.println( counter + ": " + msaStatsAsSB() ); + } + counter += step; + } + } + + final private DescriptiveStatistics[] calcStats() { + final DescriptiveStatistics stats[] = calc(); + sort( stats ); + for( int i = 0; i < stats.length; ++i ) { + final DescriptiveStatistics s = stats[ i ]; + // System.out.print( s.getDescription() ); + // System.out.print( "\t" ); + // System.out.print( s.arithmeticMean() ); + // System.out.print( "\t(" ); + // System.out.print( s.arithmeticMean() ); + // System.out.print( ")" ); + // System.out.print( "\t" ); + // System.out.print( s.getMin() ); + // System.out.print( "\t" ); + // System.out.print( s.getMax() ); + // System.out.println(); + } + return stats; + } + + private final static void sort( final DescriptiveStatistics stats[] ) { + Arrays.sort( stats, new DescriptiveStatisticsComparator( false, SORT_BY.MAX ) ); + } + + private final DescriptiveStatistics[] calc() { + final double gappiness[] = calcGappiness(); + final DescriptiveStatistics stats[] = new DescriptiveStatistics[ _msa.getNumberOfSequences() ]; + for( int row = 0; row < _msa.getNumberOfSequences(); ++row ) { + stats[ row ] = new BasicDescriptiveStatistics( _msa.getIdentifier( row ) ); + for( int col = 0; col < _msa.getLength(); ++col ) { + if ( _msa.getResidueAt( row, col ) != Sequence.GAP ) { + stats[ row ].addValue( gappiness[ col ] ); + } + } + } + return stats; + } + + private final double[] calcGappiness() { final double gappiness[] = new double[ _msa.getLength() ]; - final int seqs = _msa.getNumberOfSequences(); + final int seqs = _msa.getNumberOfSequences(); for( int i = 0; i < gappiness.length; ++i ) { - gappiness[ i ] = ( double ) MsaMethods.calcGapSumPerColumn( _msa, i ) / _msa.getNumberOfSequences(); - + gappiness[ i ] = ( double ) MsaMethods.calcGapSumPerColumn( _msa, i ) / seqs; } return gappiness; - } - + + final static class DescriptiveStatisticsComparator implements Comparator { + + final private boolean _ascending; + final private SORT_BY _sort_by; + + public DescriptiveStatisticsComparator( final boolean ascending, final SORT_BY sort_by ) { + _ascending = ascending; + _sort_by = sort_by; + } + + @Override + public final int compare( final DescriptiveStatistics s0, final DescriptiveStatistics s1 ) { + switch ( _sort_by ) { + case MAX: + if ( s0.getMax() < s1.getMax() ) { + return _ascending ? -1 : 1; + } + else if ( s0.getMax() > s1.getMax() ) { + return _ascending ? 1 : -1; + } + return 0; + case MEAN: + if ( s0.arithmeticMean() < s1.arithmeticMean() ) { + return _ascending ? -1 : 1; + } + else if ( s0.arithmeticMean() > s1.arithmeticMean() ) { + return _ascending ? 1 : -1; + } + return 0; + case MEDIAN: + if ( s0.median() < s1.median() ) { + return _ascending ? -1 : 1; + } + else if ( s0.median() > s1.median() ) { + return _ascending ? 1 : -1; + } + return 0; + default: + return 0; + } + } + } }