2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2014 Christian M. Zmasek
6 // Copyright (C) 2014 Sanford-Burnham Medical Research Institute
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
25 package org.forester.msa_compactor;
28 import java.io.IOException;
29 import java.io.Writer;
30 import java.math.RoundingMode;
31 import java.text.DecimalFormat;
32 import java.text.NumberFormat;
33 import java.util.ArrayList;
34 import java.util.Arrays;
35 import java.util.List;
36 import java.util.SortedSet;
37 import java.util.TreeSet;
39 import org.forester.evoinference.distance.NeighborJoiningF;
40 import org.forester.evoinference.distance.PairwiseDistanceCalculator;
41 import org.forester.evoinference.distance.PairwiseDistanceCalculator.PWD_DISTANCE_METHOD;
42 import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
43 import org.forester.evoinference.tools.BootstrapResampler;
44 import org.forester.msa.DeleteableMsa;
45 import org.forester.msa.Mafft;
46 import org.forester.msa.Msa;
47 import org.forester.msa.Msa.MSA_FORMAT;
48 import org.forester.msa.MsaInferrer;
49 import org.forester.msa.MsaMethods;
50 import org.forester.msa.ResampleableMsa;
51 import org.forester.phylogeny.Phylogeny;
52 import org.forester.phylogeny.PhylogenyMethods;
53 import org.forester.sequence.Sequence;
54 import org.forester.tools.ConfidenceAssessor;
55 import org.forester.util.ForesterUtil;
57 public class MsaCompactor {
59 final private static NumberFormat NF_3 = new DecimalFormat( "#.###" );
60 final private static NumberFormat NF_4 = new DecimalFormat( "#.####" );
61 private double _gap_ratio = -1;
63 private final String _maffts_opts = "--auto";
64 private int _min_length = -1;
66 private DeleteableMsa _msa = null;
67 private boolean _norm = true;
68 private File _out_file_base = null;
69 private MSA_FORMAT _output_format = MSA_FORMAT.FASTA;
70 private String _path_to_mafft = null;
72 private boolean _realign = false;
73 private final SortedSet<String> _removed_seq_ids;
74 private final File _removed_seqs_out_base = null;
75 private boolean _report_aln_mean_identity = false;
76 private int _step = -1;
77 private int _step_for_diagnostics = -1;
79 NF_4.setRoundingMode( RoundingMode.HALF_UP );
80 NF_3.setRoundingMode( RoundingMode.HALF_UP );
83 public MsaCompactor( final DeleteableMsa msa ) {
85 _removed_seq_ids = new TreeSet<String>();
88 final public Msa getMsa() {
92 final public SortedSet<String> getRemovedSeqIds() {
93 return _removed_seq_ids;
96 public final void removeViaGapAverage( final double mean_gapiness, final boolean verbose ) throws IOException,
97 InterruptedException {
98 final GapContribution stats[] = calcGapContribtionsStats( _norm );
99 final List<String> to_remove_ids = new ArrayList<String>();
100 for( final GapContribution gap_gontribution : stats ) {
101 to_remove_ids.add( gap_gontribution.getId() );
107 while ( MsaMethods.calcGapRatio( _msa ) > mean_gapiness ) {
108 final String id = to_remove_ids.get( i );
109 _msa.deleteRow( id );
111 if ( ( ( _step > 0 ) && ( ( ( i + 1 ) % _step ) == 0 ) )
112 || ( MsaMethods.calcGapRatio( _msa ) <= mean_gapiness ) ) {
113 printMsaStatsWriteOutfileAndRealign( _realign, id );
115 else if ( verbose ) {
116 final MsaProperties msa_prop = new MsaProperties( _msa, _report_aln_mean_identity );
117 printMsaProperties( id, msa_prop );
120 System.out.println();
126 public void removeViaLength( final int length, final boolean verbose ) throws IOException, InterruptedException {
127 final GapContribution stats[] = calcGapContribtionsStats( _norm );
128 final List<String> to_remove_ids = new ArrayList<String>();
129 for( final GapContribution gap_gontribution : stats ) {
130 to_remove_ids.add( gap_gontribution.getId() );
136 while ( _msa.getLength() > length ) {
137 final String id = to_remove_ids.get( i );
138 _msa.deleteRow( id );
140 if ( ( ( _step > 0 ) && ( ( ( i + 1 ) % _step ) == 0 ) ) || ( _msa.getLength() <= length ) ) {
141 printMsaStatsWriteOutfileAndRealign( _realign, id );
143 else if ( verbose ) {
144 final MsaProperties msa_prop = new MsaProperties( _msa, _report_aln_mean_identity );
145 printMsaProperties( id, msa_prop );
148 System.out.println();
154 public final void removeWorstOffenders( final int to_remove ) throws IOException, InterruptedException {
155 final GapContribution stats[] = calcGapContribtionsStats( _norm );
156 final List<String> to_remove_ids = new ArrayList<String>();
157 for( int j = 0; j < to_remove; ++j ) {
158 to_remove_ids.add( stats[ j ].getId() );
159 _removed_seq_ids.add( stats[ j ].getId() );
162 for( int i = 0; i < to_remove_ids.size(); ++i ) {
163 final String id = to_remove_ids.get( i );
164 _msa.deleteRow( id );
166 if ( isPrintMsaStatsWriteOutfileAndRealign( i ) || ( i == ( to_remove_ids.size() - 1 ) ) ) {
167 printMsaStatsWriteOutfileAndRealign( _realign, id );
168 System.out.println();
170 else if ( isPrintMsaStats( i ) ) {
171 final MsaProperties msa_prop = new MsaProperties( _msa, _report_aln_mean_identity );
172 printMsaProperties( id, msa_prop );
173 System.out.println();
178 public final List<MsaProperties> chart( final int step, final boolean realign, final boolean norm )
179 throws IOException, InterruptedException {
180 final GapContribution stats[] = calcGapContribtionsStats( norm );
181 final List<String> to_remove_ids = new ArrayList<String>();
182 final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
183 for( final GapContribution gap_gontribution : stats ) {
184 to_remove_ids.add( gap_gontribution.getId() );
188 final int x = ForesterUtil.roundToInt( _msa.getNumberOfSequences() / 20.0 );
189 MsaProperties msa_prop = new MsaProperties( _msa, _report_aln_mean_identity );
190 msa_props.add( msa_prop );
191 printMsaProperties( "", msa_prop );
192 System.out.println();
193 while ( _msa.getNumberOfSequences() > x ) {
194 final String id = to_remove_ids.get( i );
195 _msa.deleteRow( id );
196 if ( realign && isPrintMsaStatsWriteOutfileAndRealign( i ) ) {
199 msa_prop = new MsaProperties( _msa, _report_aln_mean_identity );
200 msa_props.add( msa_prop );
201 printMsaProperties( id, msa_prop );
202 System.out.print( "(realigned)" );
203 System.out.println();
205 else if ( isPrintMsaStats( i ) ) {
207 msa_prop = new MsaProperties( _msa, _report_aln_mean_identity );
208 msa_props.add( msa_prop );
209 printMsaProperties( id, msa_prop );
210 System.out.println();
217 private final boolean isPrintMsaStats( final int i ) {
218 return ( ( _step_for_diagnostics < 2 ) || ( ( ( i + 1 ) % _step_for_diagnostics ) == 0 ) );
221 private final boolean isPrintMsaStatsWriteOutfileAndRealign( final int i ) {
222 return ( ( _step < 2 ) || ( ( ( i + 1 ) % _step ) == 0 ) );
225 public final void setGapRatio( final double gap_ratio ) {
226 _gap_ratio = gap_ratio;
229 public final void setMinLength( final int min_length ) {
230 _min_length = min_length;
233 public final void setNorm( final boolean norm ) {
237 final public void setOutFileBase( final File out_file_base ) {
238 _out_file_base = out_file_base;
241 public final void setOutputFormat( final MSA_FORMAT output_format ) {
242 _output_format = output_format;
245 public void setPathToMafft( final String path_to_mafft ) {
246 _path_to_mafft = path_to_mafft;
249 public final void setRealign( final boolean realign ) {
253 public final void setStep( final int step ) {
257 public final void setStepForDiagnostics( final int step_for_diagnostics ) {
258 _step_for_diagnostics = step_for_diagnostics;
261 public final void setReportAlnMeanIdentity( final boolean report_aln_mean_identity ) {
262 _report_aln_mean_identity = report_aln_mean_identity;
265 final public String writeMsa( final File outfile, final MSA_FORMAT format, final String suffix ) throws IOException {
266 final Double gr = MsaMethods.calcGapRatio( _msa );
267 final String s = outfile + "_" + _msa.getNumberOfSequences() + "_" + _msa.getLength() + "_"
268 + ForesterUtil.roundToInt( gr * 100 );
269 writeMsa( s + suffix, format );
273 final int calcNonGapResidues( final Sequence seq ) {
275 for( int i = 0; i < seq.getLength(); ++i ) {
276 if ( !seq.isGapAt( i ) ) {
283 private final GapContribution[] calcGapContribtions( final boolean normalize_for_effective_seq_length ) {
284 final double gappiness[] = calcGappiness();
285 final GapContribution stats[] = new GapContribution[ _msa.getNumberOfSequences() ];
286 for( int row = 0; row < _msa.getNumberOfSequences(); ++row ) {
287 stats[ row ] = new GapContribution( _msa.getIdentifier( row ) );
288 for( int col = 0; col < _msa.getLength(); ++col ) {
289 if ( !_msa.isGapAt( row, col ) ) {
290 stats[ row ].addToValue( gappiness[ col ] );
293 if ( normalize_for_effective_seq_length ) {
294 stats[ row ].divideValue( calcNonGapResidues( _msa.getSequence( row ) ) );
297 stats[ row ].divideValue( _msa.getLength() );
303 final private GapContribution[] calcGapContribtionsStats( final boolean norm ) {
304 final GapContribution stats[] = calcGapContribtions( norm );
305 Arrays.sort( stats );
309 private final double[] calcGappiness() {
310 final int l = _msa.getLength();
311 final double gappiness[] = new double[ l ];
312 final int seqs = _msa.getNumberOfSequences();
313 for( int i = 0; i < l; ++i ) {
314 gappiness[ i ] = ( double ) MsaMethods.calcGapSumPerColumn( _msa, i ) / seqs;
319 private final Phylogeny inferNJphylogeny( final PWD_DISTANCE_METHOD pwd_distance_method,
321 final boolean write_matrix,
322 final String matrix_name ) {
323 BasicSymmetricalDistanceMatrix m = null;
324 switch ( pwd_distance_method ) {
325 case KIMURA_DISTANCE:
326 m = PairwiseDistanceCalculator.calcKimuraDistances( msa );
328 case POISSON_DISTANCE:
329 m = PairwiseDistanceCalculator.calcPoissonDistances( msa );
331 case FRACTIONAL_DISSIMILARITY:
332 m = PairwiseDistanceCalculator.calcFractionalDissimilarities( msa );
335 throw new IllegalArgumentException( "invalid pwd method" );
337 if ( write_matrix ) {
339 m.write( ForesterUtil.createBufferedWriter( matrix_name ) );
341 catch ( final IOException e ) {
345 final NeighborJoiningF nj = NeighborJoiningF.createInstance( false, 5 );
346 final Phylogeny phy = nj.execute( m );
350 private final Phylogeny pi( final String matrix ) {
351 final Phylogeny master_phy = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, _msa, true, matrix );
354 final ResampleableMsa resampleable_msa = new ResampleableMsa( _msa );
355 final int[][] resampled_column_positions = BootstrapResampler.createResampledColumnPositions( _msa.getLength(),
358 final Phylogeny[] eval_phys = new Phylogeny[ n ];
359 for( int i = 0; i < n; ++i ) {
360 resampleable_msa.resample( resampled_column_positions[ i ] );
361 eval_phys[ i ] = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, resampleable_msa, false, null );
363 ConfidenceAssessor.evaluate( "bootstrap", eval_phys, master_phy, true, 1 );
364 PhylogenyMethods.extractFastaInformation( master_phy );
368 private final static void printMsaProperties( final String id, final MsaProperties msa_properties ) {
369 System.out.print( ForesterUtil.pad( id, 20, ' ', false ) );
370 System.out.print( "\t" );
371 final StringBuilder sb = msaPropertiesAsSB( msa_properties );
372 System.out.print( sb );
373 System.out.print( "\t" );
376 private final static StringBuilder msaPropertiesAsSB( final MsaProperties msa_properties ) {
377 final StringBuilder sb = new StringBuilder();
378 sb.append( msa_properties.getNumberOfSequences() );
380 sb.append( msa_properties.getLength() );
382 sb.append( NF_4.format( msa_properties.getGapRatio() ) );
383 if ( msa_properties.getAverageIdentityRatio() >= 0 ) {
385 sb.append( NF_4.format( msa_properties.getAverageIdentityRatio() ) );
390 final private void printMsaStatsWriteOutfileAndRealign( final boolean realign, final String id )
391 throws IOException, InterruptedException {
395 final MsaProperties msa_prop = new MsaProperties( _msa, _report_aln_mean_identity );
396 printMsaProperties( id, msa_prop );
397 final String s = writeOutfile();
398 System.out.print( "-> " + s + ( realign ? "\t(realigned)" : "" ) );
401 final private void realignWithMafft() throws IOException, InterruptedException {
402 // final MsaInferrer mafft = Mafft
403 // .createInstance( "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft" );
404 final MsaInferrer mafft = Mafft.createInstance( _path_to_mafft );
405 final List<String> opts = new ArrayList<String>();
406 for( final String o : _maffts_opts.split( "\\s" ) ) {
409 _msa = DeleteableMsa.createInstance( mafft.infer( _msa.asSequenceList(), opts ) );
412 final private void removeGapColumns() {
413 _msa.deleteGapOnlyColumns();
416 final private void writeMsa( final String outfile, final MSA_FORMAT format ) throws IOException {
417 final Writer w = ForesterUtil.createBufferedWriter( outfile );
418 _msa.write( w, format );
422 private final String writeOutfile() throws IOException {
423 final String s = writeMsa( _out_file_base, MSA_FORMAT.PHYLIP, ".aln" );
424 //writeMsa( _out_file_base, MSA_FORMAT.FASTA, ".fasta" );
428 // Returns null if not path found.
429 final public static String guessPathToMafft() {
431 if ( ForesterUtil.OS_NAME.toLowerCase().indexOf( "win" ) >= 0 ) {
432 path = "C:\\Program Files\\mafft-win\\mafft.bat";
433 if ( MsaInferrer.isInstalled( path ) ) {
437 path = "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft";
438 if ( MsaInferrer.isInstalled( path ) ) {
441 path = "/usr/local/bin/mafft";
442 if ( MsaInferrer.isInstalled( path ) ) {
445 path = "/usr/bin/mafft";
446 if ( MsaInferrer.isInstalled( path ) ) {
450 if ( MsaInferrer.isInstalled( path ) ) {
454 if ( MsaInferrer.isInstalled( path ) ) {
460 private final static void printTableHeader() {
461 System.out.print( ForesterUtil.pad( "Id", 20, ' ', false ) );
462 System.out.print( "\t" );
463 System.out.print( "Seqs" );
464 System.out.print( "\t" );
465 System.out.print( "Length" );
466 System.out.print( "\t" );
467 System.out.print( "Gaps" );
468 System.out.print( "\t" );
469 System.out.print( "MSA qual" );
470 System.out.print( "\t" );
471 System.out.println();