2 package org.forester.msa;
5 import java.io.IOException;
7 import java.math.RoundingMode;
8 import java.text.DecimalFormat;
9 import java.text.DecimalFormatSymbols;
10 import java.text.NumberFormat;
11 import java.util.ArrayList;
12 import java.util.Arrays;
13 import java.util.Comparator;
14 import java.util.List;
15 import java.util.SortedSet;
16 import java.util.TreeSet;
18 import org.forester.msa.Msa.MSA_FORMAT;
19 import org.forester.sequence.Sequence;
20 import org.forester.util.BasicDescriptiveStatistics;
21 import org.forester.util.DescriptiveStatistics;
22 import org.forester.util.ForesterUtil;
24 public class MsaCompactor {
26 private static final boolean VERBOSE = true;
28 public static enum SORT_BY {
32 private final SortedSet<String> _removed_seq_ids;
34 private MsaCompactor( final Msa msa ) {
36 _removed_seq_ids = new TreeSet<String>();
39 final public SortedSet<String> getRemovedSeqIds() {
40 return _removed_seq_ids;
43 final public Msa getMsa() {
47 public final static MsaCompactor removeWorstOffenders( final Msa msa,
48 final int worst_offenders_to_remove,
49 final boolean realign ) throws IOException,
50 InterruptedException {
51 final MsaCompactor mc = new MsaCompactor( msa );
52 mc.removeWorstOffenders( worst_offenders_to_remove, 1, realign );
56 public final static MsaCompactor reduceGapAverage( final Msa msa,
57 final double max_gap_average,
59 final boolean realign,
61 final int minimal_effective_length ) throws IOException,
62 InterruptedException {
63 final MsaCompactor mc = new MsaCompactor( msa );
64 mc.removeViaGapAverage( max_gap_average, step, realign, out, minimal_effective_length );
68 public final static MsaCompactor reduceLength( final Msa msa,
71 final boolean realign ) throws IOException, InterruptedException {
72 final MsaCompactor mc = new MsaCompactor( msa );
73 mc.removeViaLength( length, step, realign );
77 final private void removeGapColumns() {
78 _msa = MsaMethods.createInstance().removeGapColumns( 1, 0, _msa );
81 final private void removeWorstOffenders( final int to_remove, final int step, final boolean realign )
82 throws IOException, InterruptedException {
83 final DescriptiveStatistics stats[] = calcStats();
84 final List<String> to_remove_ids = new ArrayList<String>();
85 for( int j = 0; j < to_remove; ++j ) {
86 to_remove_ids.add( stats[ j ].getDescription() );
87 _removed_seq_ids.add( stats[ j ].getDescription() );
89 _msa = MsaMethods.removeSequences( _msa, to_remove_ids );
96 final private void mafft() throws IOException, InterruptedException {
97 final MsaInferrer mafft = Mafft.createInstance( "mafft" );
98 final List<String> opts = new ArrayList<String>();
99 // opts.add( "--maxiterate" );
100 // opts.add( "1000" );
101 // opts.add( "--localpair" );
102 opts.add( "--quiet" );
103 _msa = mafft.infer( _msa.asSequenceList(), opts );
106 final private void removeViaGapAverage( final double mean_gapiness,
108 final boolean realign,
110 final int minimal_effective_length ) throws IOException,
111 InterruptedException {
113 throw new IllegalArgumentException( "step cannot be less than 1" );
115 if ( mean_gapiness < 0 ) {
116 throw new IllegalArgumentException( "target average gap ratio cannot be less than 0" );
119 System.out.println( "orig: " + msaStatsAsSB() );
121 if ( minimal_effective_length > 1 ) {
122 _msa = MsaMethods.removeSequencesByMinimalLength( _msa, minimal_effective_length );
124 System.out.println( "short seq removal: " + msaStatsAsSB() );
130 removeWorstOffenders( step, 1, false );
134 gr = MsaMethods.calcGapRatio( _msa );
136 System.out.println( counter + ": " + msaStatsAsSB() );
138 write( outfile, gr );
140 } while ( gr > mean_gapiness );
142 System.out.println( "final: " + msaStatsAsSB() );
146 final private void write( final File outfile, final double gr ) throws IOException {
147 writeMsa( outfile + "_" + _msa.getNumberOfSequences() + "_" + _msa.getLength() + "_"
148 + ForesterUtil.roundToInt( gr * 100 ) + ".fasta" );
151 final private void writeMsa( final String outfile ) throws IOException {
152 final Writer w = ForesterUtil.createBufferedWriter( outfile );
153 _msa.write( w, MSA_FORMAT.FASTA );
157 final private StringBuilder msaStatsAsSB() {
158 final StringBuilder sb = new StringBuilder();
159 sb.append( _msa.getLength() );
161 sb.append( _msa.getNumberOfSequences() );
163 sb.append( ForesterUtil.round( MsaMethods.calcGapRatio( _msa ), 4 ) );
168 final private void removeViaLength( final int length, final int step, final boolean realign ) throws IOException,
169 InterruptedException {
171 throw new IllegalArgumentException( "step cannot be less than 1" );
174 throw new IllegalArgumentException( "target length cannot be less than 1" );
177 System.out.println( "orig: " + msaStatsAsSB() );
180 while ( _msa.getLength() > length ) {
181 removeWorstOffenders( step, 1, false );
186 System.out.println( counter + ": " + msaStatsAsSB() );
192 final private DescriptiveStatistics[] calcStats() {
193 final DecimalFormatSymbols dfs = new DecimalFormatSymbols();
194 dfs.setDecimalSeparator( '.' );
195 final NumberFormat f = new DecimalFormat( "#.####", dfs );
196 f.setRoundingMode( RoundingMode.HALF_UP );
197 final DescriptiveStatistics stats[] = calcGapContribtions();
198 Arrays.sort( stats, new DescriptiveStatisticsComparator( false, SORT_BY.MEAN ) );
199 for( final DescriptiveStatistics stat : stats ) {
200 final StringBuilder sb = new StringBuilder();
201 sb.append( stat.getDescription() );
203 sb.append( f.format( stat.arithmeticMean() ) );
205 sb.append( f.format( stat.median() ) );
207 sb.append( f.format( stat.getMin() ) );
209 sb.append( f.format( stat.getMax() ) );
211 System.out.println( sb );
216 private final DescriptiveStatistics[] calcGapContribtions() {
217 final double gappiness[] = calcGappiness();
218 final DescriptiveStatistics stats[] = new DescriptiveStatistics[ _msa.getNumberOfSequences() ];
219 for( int row = 0; row < _msa.getNumberOfSequences(); ++row ) {
220 stats[ row ] = new BasicDescriptiveStatistics( _msa.getIdentifier( row ) );
221 for( int col = 0; col < _msa.getLength(); ++col ) {
222 if ( _msa.getResidueAt( row, col ) != Sequence.GAP ) {
223 stats[ row ].addValue( gappiness[ col ] );
230 private final double[] calcGappiness() {
231 final int l = _msa.getLength();
232 final double gappiness[] = new double[ l ];
233 final int seqs = _msa.getNumberOfSequences();
234 for( int i = 0; i < l; ++i ) {
235 gappiness[ i ] = ( double ) MsaMethods.calcGapSumPerColumn( _msa, i ) / seqs;
240 final static class DescriptiveStatisticsComparator implements Comparator<DescriptiveStatistics> {
242 final private boolean _ascending;
243 final private SORT_BY _sort_by;
245 public DescriptiveStatisticsComparator( final boolean ascending, final SORT_BY sort_by ) {
246 _ascending = ascending;
251 public final int compare( final DescriptiveStatistics s0, final DescriptiveStatistics s1 ) {
252 switch ( _sort_by ) {
254 if ( s0.getMax() < s1.getMax() ) {
255 return _ascending ? -1 : 1;
257 else if ( s0.getMax() > s1.getMax() ) {
258 return _ascending ? 1 : -1;
262 if ( s0.arithmeticMean() < s1.arithmeticMean() ) {
263 return _ascending ? -1 : 1;
265 else if ( s0.arithmeticMean() > s1.arithmeticMean() ) {
266 return _ascending ? 1 : -1;
270 if ( s0.median() < s1.median() ) {
271 return _ascending ? -1 : 1;
273 else if ( s0.median() > s1.median() ) {
274 return _ascending ? 1 : -1;