2 package org.forester.msa_compactor;
5 import java.io.IOException;
7 import java.math.RoundingMode;
8 import java.text.DecimalFormat;
9 import java.text.NumberFormat;
10 import java.util.ArrayList;
11 import java.util.Arrays;
12 import java.util.List;
13 import java.util.SortedSet;
14 import java.util.TreeSet;
16 import org.forester.msa.Mafft;
17 import org.forester.msa.Msa;
18 import org.forester.msa.Msa.MSA_FORMAT;
19 import org.forester.msa.MsaInferrer;
20 import org.forester.msa.MsaMethods;
21 import org.forester.sequence.Sequence;
22 import org.forester.util.ForesterUtil;
24 public class MsaCompactor {
26 final private static NumberFormat NF_3 = new DecimalFormat( "#.###" );
27 final private static NumberFormat NF_4 = new DecimalFormat( "#.####" );
28 private static final boolean VERBOSE = true;
30 private final SortedSet<String> _removed_seq_ids;
32 NF_4.setRoundingMode( RoundingMode.HALF_UP );
33 NF_3.setRoundingMode( RoundingMode.HALF_UP );
36 private MsaCompactor( final Msa msa ) {
38 _removed_seq_ids = new TreeSet<String>();
41 final public Msa getMsa() {
45 final public SortedSet<String> getRemovedSeqIds() {
46 return _removed_seq_ids;
49 final public void writeMsa( final File outfile, final MSA_FORMAT format, final String suffix ) throws IOException {
50 final Double gr = MsaMethods.calcGapRatio( _msa );
51 writeMsa( outfile + "_" + _msa.getNumberOfSequences() + "_" + _msa.getLength() + "_"
52 + ForesterUtil.roundToInt( gr * 100 ) + suffix,
56 final int calcNonGapResidues( final Sequence seq ) {
58 for( int i = 0; i < seq.getLength(); ++i ) {
59 if ( !seq.isGapAt( i ) ) {
66 private final GapContribution[] calcGapContribtions( final boolean normalize_for_effective_seq_length ) {
67 final double gappiness[] = calcGappiness();
68 final GapContribution stats[] = new GapContribution[ _msa.getNumberOfSequences() ];
69 for( int row = 0; row < _msa.getNumberOfSequences(); ++row ) {
70 stats[ row ] = new GapContribution( _msa.getIdentifier( row ) );
71 for( int col = 0; col < _msa.getLength(); ++col ) {
72 if ( !_msa.isGapAt( row, col ) ) {
73 stats[ row ].addToValue( gappiness[ col ] );
76 if ( normalize_for_effective_seq_length ) {
77 stats[ row ].divideValue( calcNonGapResidues( _msa.getSequence( row ) ) );
80 stats[ row ].divideValue( _msa.getLength() );
86 final private GapContribution[] calcGapContribtionsStats( final boolean norm ) {
87 final GapContribution stats[] = calcGapContribtions( norm );
89 for( final GapContribution stat : stats ) {
90 final StringBuilder sb = new StringBuilder();
91 sb.append( stat.getId() );
93 sb.append( NF_4.format( stat.getValue() ) );
95 // sb.append( NF_4.format( stat.median() ) );
97 // sb.append( NF_4.format( stat.getMin() ) );
99 // sb.append( NF_4.format( stat.getMax() ) );
101 System.out.println( sb );
106 private final double[] calcGappiness() {
107 final int l = _msa.getLength();
108 final double gappiness[] = new double[ l ];
109 final int seqs = _msa.getNumberOfSequences();
110 for( int i = 0; i < l; ++i ) {
111 gappiness[ i ] = ( double ) MsaMethods.calcGapSumPerColumn( _msa, i ) / seqs;
116 final private void mafft() throws IOException, InterruptedException {
117 final MsaInferrer mafft = Mafft
118 .createInstance( "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft" );
119 final List<String> opts = new ArrayList<String>();
120 // opts.add( "--maxiterate" );
121 // opts.add( "1000" );
122 // opts.add( "--localpair" );
123 opts.add( "--quiet" );
124 _msa = mafft.infer( _msa.asSequenceList(), opts );
127 private StringBuilder msaStatsAsSB() {
128 final StringBuilder sb = new StringBuilder();
129 sb.append( _msa.getNumberOfSequences() );
131 sb.append( _msa.getLength() );
133 sb.append( NF_3.format( MsaMethods.calcGapRatio( _msa ) ) );
137 final private void removeGapColumns() {
138 _msa = MsaMethods.createInstance().removeGapColumns( 1, 0, _msa );
141 final private void removeViaGapAverage( final double mean_gapiness,
143 final boolean realign,
145 final int minimal_effective_length ) throws IOException,
146 InterruptedException {
148 throw new IllegalArgumentException( "step cannot be less than 1" );
150 if ( mean_gapiness < 0 ) {
151 throw new IllegalArgumentException( "target average gap ratio cannot be less than 0" );
154 System.out.println( "orig: " + msaStatsAsSB() );
156 if ( minimal_effective_length > 1 ) {
157 _msa = MsaMethods.removeSequencesByMinimalLength( _msa, minimal_effective_length );
159 System.out.println( "short seq removal: " + msaStatsAsSB() );
165 removeWorstOffenders( step, 1, false, false );
169 gr = MsaMethods.calcGapRatio( _msa );
171 System.out.println( counter + ": " + msaStatsAsSB() );
173 // write( outfile, gr );
175 } while ( gr > mean_gapiness );
177 System.out.println( "final: " + msaStatsAsSB() );
181 final private void removeViaLength( final int length, final int step, final boolean realign ) throws IOException,
182 InterruptedException {
184 throw new IllegalArgumentException( "step cannot be less than 1" );
187 throw new IllegalArgumentException( "target length cannot be less than 1" );
190 System.out.println( "orig: " + msaStatsAsSB() );
193 while ( _msa.getLength() > length ) {
194 removeWorstOffenders( step, 1, false, false );
199 System.out.println( counter + ": " + msaStatsAsSB() );
205 final private void removeWorstOffenders( final int to_remove,
207 final boolean realign,
208 final boolean norm ) throws IOException, InterruptedException {
209 final GapContribution stats[] = calcGapContribtionsStats( norm );
210 final List<String> to_remove_ids = new ArrayList<String>();
211 for( int j = 0; j < to_remove; ++j ) {
212 to_remove_ids.add( stats[ j ].getId() );
213 _removed_seq_ids.add( stats[ j ].getId() );
215 //TODO if verbose/interactve
216 for( final String id : to_remove_ids ) {
217 _msa = MsaMethods.removeSequence( _msa, id );
219 System.out.print( id );
220 System.out.print( "\t" );
221 final StringBuilder sb = msaStatsAsSB();
222 System.out.println( sb );
225 //_msa = MsaMethods.removeSequences( _msa, to_remove_ids );
226 //removeGapColumns();
232 final private void writeMsa( final String outfile, final MSA_FORMAT format ) throws IOException {
233 final Writer w = ForesterUtil.createBufferedWriter( outfile );
234 _msa.write( w, format );
238 public final static MsaCompactor reduceGapAverage( final Msa msa,
239 final double max_gap_average,
241 final boolean realign,
243 final int minimal_effective_length ) throws IOException,
244 InterruptedException {
245 final MsaCompactor mc = new MsaCompactor( msa );
246 mc.removeViaGapAverage( max_gap_average, step, realign, out, minimal_effective_length );
250 public final static MsaCompactor reduceLength( final Msa msa,
253 final boolean realign ) throws IOException, InterruptedException {
254 final MsaCompactor mc = new MsaCompactor( msa );
255 mc.removeViaLength( length, step, realign );
259 public final static MsaCompactor removeWorstOffenders( final Msa msa,
260 final int worst_offenders_to_remove,
261 final boolean realign,
262 final boolean norm ) throws IOException,
263 InterruptedException {
264 final MsaCompactor mc = new MsaCompactor( msa );
265 mc.removeWorstOffenders( worst_offenders_to_remove, 1, realign, norm );