2 package org.forester.msa_compactor;
5 import java.io.IOException;
7 import java.math.RoundingMode;
8 import java.text.DecimalFormat;
9 import java.text.NumberFormat;
10 import java.util.ArrayList;
11 import java.util.Arrays;
12 import java.util.Comparator;
13 import java.util.List;
14 import java.util.SortedSet;
15 import java.util.TreeSet;
17 import org.forester.msa.Mafft;
18 import org.forester.msa.Msa;
19 import org.forester.msa.Msa.MSA_FORMAT;
20 import org.forester.msa.MsaInferrer;
21 import org.forester.msa.MsaMethods;
22 import org.forester.sequence.Sequence;
23 import org.forester.util.BasicDescriptiveStatistics;
24 import org.forester.util.DescriptiveStatistics;
25 import org.forester.util.ForesterUtil;
27 public class MsaCompactor {
29 final private static NumberFormat NF_3 = new DecimalFormat( "#.###" );
30 final private static NumberFormat NF_4 = new DecimalFormat( "#.####" );
31 private static final boolean VERBOSE = true;
33 private final SortedSet<String> _removed_seq_ids;
35 NF_4.setRoundingMode( RoundingMode.HALF_UP );
36 NF_3.setRoundingMode( RoundingMode.HALF_UP );
39 private MsaCompactor( final Msa msa ) {
41 _removed_seq_ids = new TreeSet<String>();
44 final public Msa getMsa() {
48 final public SortedSet<String> getRemovedSeqIds() {
49 return _removed_seq_ids;
52 final public void writeMsa( final File outfile, final MSA_FORMAT format, final String suffix ) throws IOException {
53 final Double gr = MsaMethods.calcGapRatio( _msa );
54 writeMsa( outfile + "_" + _msa.getNumberOfSequences() + "_" + _msa.getLength() + "_"
55 + ForesterUtil.roundToInt( gr * 100 ) + suffix,
59 final int calcNonGapResidues( final Sequence seq ) {
61 for( int i = 0; i < seq.getLength(); ++i ) {
62 if ( !seq.isGapAt( i ) ) {
69 private final DescriptiveStatistics[] calcGapContribtionsX( final boolean normalize_for_effective_seq_length ) {
70 final double gappiness[] = calcGappiness();
71 final DescriptiveStatistics stats[] = new DescriptiveStatistics[ _msa.getNumberOfSequences() ];
72 for( int row = 0; row < _msa.getNumberOfSequences(); ++row ) {
73 stats[ row ] = new BasicDescriptiveStatistics( _msa.getIdentifier( row ) );
74 final double l = calculateEffectiveLengthRatio( row );
75 for( int col = 0; col < _msa.getLength(); ++col ) {
76 if ( !_msa.isGapAt( row, col ) ) {
77 if ( normalize_for_effective_seq_length ) {
78 stats[ row ].addValue( gappiness[ col ] / l );
81 stats[ row ].addValue( gappiness[ col ] );
89 private final GapContribution[] calcGapContribtions( final boolean normalize_for_effective_seq_length ) {
90 final double gappiness[] = calcGappiness();
91 final GapContribution stats[] = new GapContribution[ _msa.getNumberOfSequences() ];
92 for( int row = 0; row < _msa.getNumberOfSequences(); ++row ) {
93 stats[ row ] = new GapContribution( _msa.getIdentifier( row ) );
94 for( int col = 0; col < _msa.getLength(); ++col ) {
95 if ( !_msa.isGapAt( row, col ) ) {
96 stats[ row ].addToValue( gappiness[ col ] );
99 if ( normalize_for_effective_seq_length ) {
100 stats[ row ].divideValue( calculateEffectiveLengthRatio( row ) );
109 final private GapContribution[] calcGapContribtionsStats( final boolean norm ) {
110 final GapContribution stats[] = calcGapContribtions( norm );
111 Arrays.sort( stats );
112 for( final GapContribution stat : stats ) {
113 final StringBuilder sb = new StringBuilder();
114 sb.append( stat.getId() );
116 sb.append( NF_4.format( stat.getValue() ) );
118 // sb.append( NF_4.format( stat.median() ) );
119 // sb.append( "\t" );
120 // sb.append( NF_4.format( stat.getMin() ) );
121 // sb.append( "\t" );
122 // sb.append( NF_4.format( stat.getMax() ) );
124 System.out.println( sb );
129 private final double[] calcGappiness() {
130 final int l = _msa.getLength();
131 final double gappiness[] = new double[ l ];
132 final int seqs = _msa.getNumberOfSequences();
133 for( int i = 0; i < l; ++i ) {
134 gappiness[ i ] = ( double ) MsaMethods.calcGapSumPerColumn( _msa, i ) / seqs;
139 private double calculateEffectiveLengthRatio( final int row ) {
140 return ( double ) calcNonGapResidues( _msa.getSequence( row ) ) / _msa.getLength();
143 final private void mafft() throws IOException, InterruptedException {
144 final MsaInferrer mafft = Mafft
145 .createInstance( "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft" );
146 final List<String> opts = new ArrayList<String>();
147 // opts.add( "--maxiterate" );
148 // opts.add( "1000" );
149 // opts.add( "--localpair" );
150 opts.add( "--quiet" );
151 _msa = mafft.infer( _msa.asSequenceList(), opts );
154 private StringBuilder msaStatsAsSB() {
155 final StringBuilder sb = new StringBuilder();
156 sb.append( _msa.getNumberOfSequences() );
158 sb.append( _msa.getLength() );
160 sb.append( NF_3.format( MsaMethods.calcGapRatio( _msa ) ) );
164 final private void removeGapColumns() {
165 _msa = MsaMethods.createInstance().removeGapColumns( 1, 0, _msa );
168 final private void removeViaGapAverage( final double mean_gapiness,
170 final boolean realign,
172 final int minimal_effective_length ) throws IOException,
173 InterruptedException {
175 throw new IllegalArgumentException( "step cannot be less than 1" );
177 if ( mean_gapiness < 0 ) {
178 throw new IllegalArgumentException( "target average gap ratio cannot be less than 0" );
181 System.out.println( "orig: " + msaStatsAsSB() );
183 if ( minimal_effective_length > 1 ) {
184 _msa = MsaMethods.removeSequencesByMinimalLength( _msa, minimal_effective_length );
186 System.out.println( "short seq removal: " + msaStatsAsSB() );
192 removeWorstOffenders( step, 1, false, false );
196 gr = MsaMethods.calcGapRatio( _msa );
198 System.out.println( counter + ": " + msaStatsAsSB() );
200 // write( outfile, gr );
202 } while ( gr > mean_gapiness );
204 System.out.println( "final: " + msaStatsAsSB() );
208 final private void removeViaLength( final int length, final int step, final boolean realign ) throws IOException,
209 InterruptedException {
211 throw new IllegalArgumentException( "step cannot be less than 1" );
214 throw new IllegalArgumentException( "target length cannot be less than 1" );
217 System.out.println( "orig: " + msaStatsAsSB() );
220 while ( _msa.getLength() > length ) {
221 removeWorstOffenders( step, 1, false, false );
226 System.out.println( counter + ": " + msaStatsAsSB() );
232 final private void removeWorstOffenders( final int to_remove,
234 final boolean realign,
235 final boolean norm ) throws IOException, InterruptedException {
236 final GapContribution stats[] = calcGapContribtionsStats( norm );
237 final List<String> to_remove_ids = new ArrayList<String>();
238 for( int j = 0; j < to_remove; ++j ) {
239 to_remove_ids.add( stats[ j ].getId() );
240 _removed_seq_ids.add( stats[ j ].getId() );
242 //TODO if verbose/interactve
243 for( final String id : to_remove_ids ) {
244 _msa = MsaMethods.removeSequence( _msa, id );
246 System.out.print( id );
247 System.out.print( "\t" );
248 final StringBuilder sb = msaStatsAsSB();
249 System.out.println( sb );
252 //_msa = MsaMethods.removeSequences( _msa, to_remove_ids );
253 //removeGapColumns();
259 final private void writeMsa( final String outfile, final MSA_FORMAT format ) throws IOException {
260 final Writer w = ForesterUtil.createBufferedWriter( outfile );
261 _msa.write( w, format );
265 public final static MsaCompactor reduceGapAverage( final Msa msa,
266 final double max_gap_average,
268 final boolean realign,
270 final int minimal_effective_length ) throws IOException,
271 InterruptedException {
272 final MsaCompactor mc = new MsaCompactor( msa );
273 mc.removeViaGapAverage( max_gap_average, step, realign, out, minimal_effective_length );
277 public final static MsaCompactor reduceLength( final Msa msa,
280 final boolean realign ) throws IOException, InterruptedException {
281 final MsaCompactor mc = new MsaCompactor( msa );
282 mc.removeViaLength( length, step, realign );
286 public final static MsaCompactor removeWorstOffenders( final Msa msa,
287 final int worst_offenders_to_remove,
288 final boolean realign,
289 final boolean norm ) throws IOException,
290 InterruptedException {
291 final MsaCompactor mc = new MsaCompactor( msa );
292 mc.removeWorstOffenders( worst_offenders_to_remove, 1, realign, norm );
296 public static enum SORT_BY {
300 final static class DescriptiveStatisticsComparator implements Comparator<DescriptiveStatistics> {
302 final private boolean _ascending;
303 final private SORT_BY _sort_by;
305 public DescriptiveStatisticsComparator( final boolean ascending, final SORT_BY sort_by ) {
306 _ascending = ascending;
311 public final int compare( final DescriptiveStatistics s0, final DescriptiveStatistics s1 ) {
312 switch ( _sort_by ) {
314 if ( s0.getMax() < s1.getMax() ) {
315 return _ascending ? -1 : 1;
317 else if ( s0.getMax() > s1.getMax() ) {
318 return _ascending ? 1 : -1;
322 if ( s0.arithmeticMean() < s1.arithmeticMean() ) {
323 return _ascending ? -1 : 1;
325 else if ( s0.arithmeticMean() > s1.arithmeticMean() ) {
326 return _ascending ? 1 : -1;
330 if ( s0.median() < s1.median() ) {
331 return _ascending ? -1 : 1;
333 else if ( s0.median() > s1.median() ) {
334 return _ascending ? 1 : -1;