2 package org.forester.msa;
5 import java.io.IOException;
7 import java.util.ArrayList;
8 import java.util.Arrays;
9 import java.util.Comparator;
10 import java.util.List;
11 import java.util.SortedSet;
12 import java.util.TreeSet;
14 import org.forester.msa.Msa.MSA_FORMAT;
15 import org.forester.sequence.Sequence;
16 import org.forester.util.BasicDescriptiveStatistics;
17 import org.forester.util.DescriptiveStatistics;
18 import org.forester.util.ForesterUtil;
20 public class MsaCompactor {
22 private static final boolean VERBOSE = true;
24 public static enum SORT_BY {
28 private final SortedSet<String> _removed_seq_ids;
30 private MsaCompactor( final Msa msa ) {
32 _removed_seq_ids = new TreeSet<String>();
35 final public SortedSet<String> getRemovedSeqIds() {
36 return _removed_seq_ids;
39 final public Msa getMsa() {
43 public final static MsaCompactor removeWorstOffenders( final Msa msa,
44 final int worst_offenders_to_remove,
45 final boolean realign ) throws IOException,
46 InterruptedException {
47 final MsaCompactor mc = new MsaCompactor( msa );
48 mc.removeWorstOffenders( worst_offenders_to_remove, 1, realign );
52 public final static MsaCompactor reduceGapAverage( final Msa msa,
53 final double max_gap_average,
55 final boolean realign,
57 final int minimal_effective_length ) throws IOException,
58 InterruptedException {
59 final MsaCompactor mc = new MsaCompactor( msa );
60 mc.removeViaGapAverage( max_gap_average, step, realign, out, minimal_effective_length );
64 public final static MsaCompactor reduceLength( final Msa msa,
67 final boolean realign ) throws IOException, InterruptedException {
68 final MsaCompactor mc = new MsaCompactor( msa );
69 mc.removeViaLength( length, step, realign );
73 final private void removeGapColumns() {
74 _msa = MsaMethods.createInstance().removeGapColumns( 1, 0, _msa );
77 final private void removeWorstOffenders( final int to_remove, final int step, final boolean realign )
78 throws IOException, InterruptedException {
79 final DescriptiveStatistics stats[] = calcStats();
80 final List<String> to_remove_ids = new ArrayList<String>();
81 for( int j = 0; j < to_remove; ++j ) {
82 to_remove_ids.add( stats[ j ].getDescription() );
83 _removed_seq_ids.add( stats[ j ].getDescription() );
85 _msa = MsaMethods.removeSequences( _msa, to_remove_ids );
92 final private void mafft() throws IOException, InterruptedException {
93 final MsaInferrer mafft = Mafft.createInstance( "/home/czmasek/bin/mafft" );
94 final List<String> opts = new ArrayList<String>();
95 // opts.add( "--maxiterate" );
96 // opts.add( "1000" );
97 // opts.add( "--localpair" );
98 opts.add( "--quiet" );
99 _msa = mafft.infer( _msa.asSequenceList(), opts );
102 final private void removeViaGapAverage( final double mean_gapiness,
104 final boolean realign,
106 final int minimal_effective_length ) throws IOException,
107 InterruptedException {
109 throw new IllegalArgumentException( "step cannot be less than 1" );
111 if ( mean_gapiness < 0 ) {
112 throw new IllegalArgumentException( "target average gap ratio cannot be less than 0" );
115 System.out.println( "orig: " + msaStatsAsSB() );
117 if ( minimal_effective_length > 1 ) {
118 _msa = MsaMethods.removeSequencesByMinimalLength( _msa, minimal_effective_length );
120 System.out.println( "short seq removal: " + msaStatsAsSB() );
126 removeWorstOffenders( step, 1, false );
130 gr = MsaMethods.calcGapRatio( _msa );
132 System.out.println( counter + ": " + msaStatsAsSB() );
134 write( outfile, gr );
136 } while ( gr > mean_gapiness );
138 System.out.println( "final: " + msaStatsAsSB() );
142 final private void write( final File outfile, final double gr ) throws IOException {
143 writeMsa( outfile + "_" + _msa.getNumberOfSequences() + "_" + _msa.getLength() + "_" + gr + ".aln" );
146 final private void writeMsa( final String outfile ) throws IOException {
147 final Writer w = ForesterUtil.createBufferedWriter( outfile );
148 _msa.write( w, MSA_FORMAT.PHYLIP );
152 final private StringBuilder msaStatsAsSB() {
153 final StringBuilder sb = new StringBuilder();
154 sb.append( _msa.getLength() );
156 sb.append( _msa.getNumberOfSequences() );
158 sb.append( ForesterUtil.round( MsaMethods.calcGapRatio( _msa ), 4 ) );
163 final private void removeViaLength( final int length, final int step, final boolean realign ) throws IOException,
164 InterruptedException {
166 throw new IllegalArgumentException( "step cannot be less than 1" );
169 throw new IllegalArgumentException( "target length cannot be less than 1" );
172 System.out.println( "orig: " + msaStatsAsSB() );
175 while ( _msa.getLength() > length ) {
176 removeWorstOffenders( step, 1, false );
181 System.out.println( counter + ": " + msaStatsAsSB() );
187 final private DescriptiveStatistics[] calcStats() {
188 final DescriptiveStatistics stats[] = calc();
190 for( int i = 0; i < stats.length; ++i ) {
191 final DescriptiveStatistics s = stats[ i ];
192 // System.out.print( s.getDescription() );
193 // System.out.print( "\t" );
194 // System.out.print( s.arithmeticMean() );
195 // System.out.print( "\t(" );
196 // System.out.print( s.arithmeticMean() );
197 // System.out.print( ")" );
198 // System.out.print( "\t" );
199 // System.out.print( s.getMin() );
200 // System.out.print( "\t" );
201 // System.out.print( s.getMax() );
202 // System.out.println();
207 private final static void sort( final DescriptiveStatistics stats[] ) {
208 Arrays.sort( stats, new DescriptiveStatisticsComparator( false, SORT_BY.MAX ) );
211 private final DescriptiveStatistics[] calc() {
212 final double gappiness[] = calcGappiness();
213 final DescriptiveStatistics stats[] = new DescriptiveStatistics[ _msa.getNumberOfSequences() ];
214 for( int row = 0; row < _msa.getNumberOfSequences(); ++row ) {
215 stats[ row ] = new BasicDescriptiveStatistics( _msa.getIdentifier( row ) );
216 for( int col = 0; col < _msa.getLength(); ++col ) {
217 if ( _msa.getResidueAt( row, col ) != Sequence.GAP ) {
218 stats[ row ].addValue( gappiness[ col ] );
225 private final double[] calcGappiness() {
226 final double gappiness[] = new double[ _msa.getLength() ];
227 final int seqs = _msa.getNumberOfSequences();
228 for( int i = 0; i < gappiness.length; ++i ) {
229 gappiness[ i ] = ( double ) MsaMethods.calcGapSumPerColumn( _msa, i ) / seqs;
234 final static class DescriptiveStatisticsComparator implements Comparator<DescriptiveStatistics> {
236 final private boolean _ascending;
237 final private SORT_BY _sort_by;
239 public DescriptiveStatisticsComparator( final boolean ascending, final SORT_BY sort_by ) {
240 _ascending = ascending;
245 public final int compare( final DescriptiveStatistics s0, final DescriptiveStatistics s1 ) {
246 switch ( _sort_by ) {
248 if ( s0.getMax() < s1.getMax() ) {
249 return _ascending ? -1 : 1;
251 else if ( s0.getMax() > s1.getMax() ) {
252 return _ascending ? 1 : -1;
256 if ( s0.arithmeticMean() < s1.arithmeticMean() ) {
257 return _ascending ? -1 : 1;
259 else if ( s0.arithmeticMean() > s1.arithmeticMean() ) {
260 return _ascending ? 1 : -1;
264 if ( s0.median() < s1.median() ) {
265 return _ascending ? -1 : 1;
267 else if ( s0.median() > s1.median() ) {
268 return _ascending ? 1 : -1;