2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2014 Christian M. Zmasek
6 // Copyright (C) 2014 Sanford-Burnham Medical Research Institute
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
25 package org.forester.msa_compactor;
28 import java.io.IOException;
29 import java.io.Writer;
30 import java.math.RoundingMode;
31 import java.text.DecimalFormat;
32 import java.text.NumberFormat;
33 import java.util.ArrayList;
34 import java.util.Arrays;
35 import java.util.List;
36 import java.util.SortedSet;
37 import java.util.TreeSet;
39 import org.forester.evoinference.distance.NeighborJoiningF;
40 import org.forester.evoinference.distance.PairwiseDistanceCalculator;
41 import org.forester.evoinference.distance.PairwiseDistanceCalculator.PWD_DISTANCE_METHOD;
42 import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
43 import org.forester.evoinference.tools.BootstrapResampler;
44 import org.forester.msa.DeleteableMsa;
45 import org.forester.msa.Mafft;
46 import org.forester.msa.Msa;
47 import org.forester.msa.Msa.MSA_FORMAT;
48 import org.forester.msa.MsaInferrer;
49 import org.forester.msa.MsaMethods;
50 import org.forester.msa.ResampleableMsa;
51 import org.forester.phylogeny.Phylogeny;
52 import org.forester.phylogeny.PhylogenyMethods;
53 import org.forester.sequence.Sequence;
54 import org.forester.tools.ConfidenceAssessor;
55 import org.forester.util.ForesterUtil;
57 public class MsaCompactor {
59 final private static NumberFormat NF_3 = new DecimalFormat( "#.###" );
60 final private static NumberFormat NF_4 = new DecimalFormat( "#.####" );
62 private final String _maffts_opts = "--auto";
63 private int _step = 1;
65 private boolean _realign = false;
66 private boolean _norm = true;
67 private int _step_for_diagnostics = 1;
68 private int _min_length = -1;
69 private double _gap_ratio = -1;
70 private final boolean _report_aln_mean_identity = false;
71 private MSA_FORMAT _output_format = MSA_FORMAT.FASTA;
72 private final File _removed_seqs_out_base = null;
74 private DeleteableMsa _msa;
75 private File _out_file_base;
76 private String _path_to_mafft;
77 private final SortedSet<String> _removed_seq_ids;
79 NF_4.setRoundingMode( RoundingMode.HALF_UP );
80 NF_3.setRoundingMode( RoundingMode.HALF_UP );
83 public MsaCompactor( final DeleteableMsa msa ) {
85 _removed_seq_ids = new TreeSet<String>();
88 final public Msa getMsa() {
92 final public SortedSet<String> getRemovedSeqIds() {
93 return _removed_seq_ids;
96 final public void setOutFileBase( final File out_file_base ) {
97 _out_file_base = out_file_base;
100 final public String writeMsa( final File outfile, final MSA_FORMAT format, final String suffix ) throws IOException {
101 final Double gr = MsaMethods.calcGapRatio( _msa );
102 final String s = outfile + "_" + _msa.getNumberOfSequences() + "_" + _msa.getLength() + "_"
103 + ForesterUtil.roundToInt( gr * 100 );
104 writeMsa( s + suffix, format );
108 final int calcNonGapResidues( final Sequence seq ) {
110 for( int i = 0; i < seq.getLength(); ++i ) {
111 if ( !seq.isGapAt( i ) ) {
118 Phylogeny pi( final String matrix ) {
119 final Phylogeny master_phy = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, _msa, true, matrix );
122 final ResampleableMsa resampleable_msa = new ResampleableMsa( _msa );
123 final int[][] resampled_column_positions = BootstrapResampler.createResampledColumnPositions( _msa.getLength(),
126 final Phylogeny[] eval_phys = new Phylogeny[ n ];
127 for( int i = 0; i < n; ++i ) {
128 resampleable_msa.resample( resampled_column_positions[ i ] );
129 eval_phys[ i ] = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, resampleable_msa, false, null );
131 ConfidenceAssessor.evaluate( "bootstrap", eval_phys, master_phy, true, 1 );
132 PhylogenyMethods.extractFastaInformation( master_phy );
136 private final GapContribution[] calcGapContribtions( final boolean normalize_for_effective_seq_length ) {
137 final double gappiness[] = calcGappiness();
138 final GapContribution stats[] = new GapContribution[ _msa.getNumberOfSequences() ];
139 for( int row = 0; row < _msa.getNumberOfSequences(); ++row ) {
140 stats[ row ] = new GapContribution( _msa.getIdentifier( row ) );
141 for( int col = 0; col < _msa.getLength(); ++col ) {
142 if ( !_msa.isGapAt( row, col ) ) {
143 stats[ row ].addToValue( gappiness[ col ] );
146 if ( normalize_for_effective_seq_length ) {
147 stats[ row ].divideValue( calcNonGapResidues( _msa.getSequence( row ) ) );
150 stats[ row ].divideValue( _msa.getLength() );
156 final private GapContribution[] calcGapContribtionsStats( final boolean norm ) {
157 final GapContribution stats[] = calcGapContribtions( norm );
158 Arrays.sort( stats );
162 private final double[] calcGappiness() {
163 final int l = _msa.getLength();
164 final double gappiness[] = new double[ l ];
165 final int seqs = _msa.getNumberOfSequences();
166 for( int i = 0; i < l; ++i ) {
167 gappiness[ i ] = ( double ) MsaMethods.calcGapSumPerColumn( _msa, i ) / seqs;
172 public final List<MsaProperties> chart( final int step,
173 final boolean realign,
175 final boolean verbose ) throws IOException, InterruptedException {
176 final GapContribution stats[] = calcGapContribtionsStats( norm );
177 final List<String> to_remove_ids = new ArrayList<String>();
178 final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
179 for( final GapContribution gap_gontribution : stats ) {
180 to_remove_ids.add( gap_gontribution.getId() );
186 final int s = _msa.getNumberOfSequences();
187 final int x = ForesterUtil.roundToInt( s / 20.0 );
188 while ( _msa.getNumberOfSequences() > x ) {
189 final String id = to_remove_ids.get( i );
190 //~_msa = MsaMethods.removeSequence( _msa, id );
191 _msa.deleteRow( id );
192 if ( ( s < 500 ) || ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) ) {
194 if ( realign && ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) ) {
196 msa_props.add( new MsaProperties( _msa ) );
201 System.out.print( "(realigned)" );
205 msa_props.add( new MsaProperties( _msa ) );
211 System.out.println();
219 private Phylogeny inferNJphylogeny( final PWD_DISTANCE_METHOD pwd_distance_method,
221 final boolean write_matrix,
222 final String matrix_name ) {
223 BasicSymmetricalDistanceMatrix m = null;
224 switch ( pwd_distance_method ) {
225 case KIMURA_DISTANCE:
226 m = PairwiseDistanceCalculator.calcKimuraDistances( msa );
228 case POISSON_DISTANCE:
229 m = PairwiseDistanceCalculator.calcPoissonDistances( msa );
231 case FRACTIONAL_DISSIMILARITY:
232 m = PairwiseDistanceCalculator.calcFractionalDissimilarities( msa );
235 throw new IllegalArgumentException( "invalid pwd method" );
237 if ( write_matrix ) {
239 m.write( ForesterUtil.createBufferedWriter( matrix_name ) );
241 catch ( final IOException e ) {
245 final NeighborJoiningF nj = NeighborJoiningF.createInstance( false, 5 );
246 final Phylogeny phy = nj.execute( m );
250 private StringBuilder msaStatsAsSB() {
251 final StringBuilder sb = new StringBuilder();
252 sb.append( _msa.getNumberOfSequences() );
254 sb.append( _msa.getLength() );
256 sb.append( NF_4.format( MsaMethods.calcGapRatio( _msa ) ) );
258 sb.append( NF_4.format( MsaMethods.calculateIdentityRatio( 0, _msa.getLength() - 1, _msa ).arithmeticMean() ) );
262 private final void printMsaStats( final String id ) {
263 System.out.print( ForesterUtil.pad( id, 20, ' ', false ) );
264 System.out.print( "\t" );
265 final StringBuilder sb = msaStatsAsSB();
266 System.out.print( sb );
267 System.out.print( "\t" );
270 final private void printMsaStatsWriteOutfileAndRealign( final boolean realign,
271 final boolean verbose,
272 final String id ) throws IOException, InterruptedException {
279 final String s = writeOutfile();
281 System.out.print( "-> " + s + ( realign ? "\t(realigned)" : "" ) );
285 final private void realignWithMafft() throws IOException, InterruptedException {
286 // final MsaInferrer mafft = Mafft
287 // .createInstance( "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft" );
288 final MsaInferrer mafft = Mafft.createInstance( _path_to_mafft );
289 final List<String> opts = new ArrayList<String>();
290 for( final String o : _maffts_opts.split( "\\s" ) ) {
293 _msa = DeleteableMsa.createInstance( mafft.infer( _msa.asSequenceList(), opts ) );
296 final private void removeGapColumns() {
297 _msa.deleteGapOnlyColumns();
300 public final void removeViaGapAverage( final double mean_gapiness, final boolean verbose ) throws IOException,
301 InterruptedException {
302 final GapContribution stats[] = calcGapContribtionsStats( _norm );
303 final List<String> to_remove_ids = new ArrayList<String>();
304 for( final GapContribution gap_gontribution : stats ) {
305 to_remove_ids.add( gap_gontribution.getId() );
311 while ( MsaMethods.calcGapRatio( _msa ) > mean_gapiness ) {
312 final String id = to_remove_ids.get( i );
313 _msa.deleteRow( id );
315 if ( ( ( _step > 0 ) && ( ( ( i + 1 ) % _step ) == 0 ) )
316 || ( MsaMethods.calcGapRatio( _msa ) <= mean_gapiness ) ) {
317 printMsaStatsWriteOutfileAndRealign( _realign, verbose, id );
319 else if ( verbose ) {
323 System.out.println();
329 public void removeViaLength( final int length, final boolean verbose ) throws IOException, InterruptedException {
330 final GapContribution stats[] = calcGapContribtionsStats( _norm );
331 final List<String> to_remove_ids = new ArrayList<String>();
332 for( final GapContribution gap_gontribution : stats ) {
333 to_remove_ids.add( gap_gontribution.getId() );
339 while ( _msa.getLength() > length ) {
340 final String id = to_remove_ids.get( i );
341 _msa.deleteRow( id );
343 if ( ( ( _step > 0 ) && ( ( ( i + 1 ) % _step ) == 0 ) ) || ( _msa.getLength() <= length ) ) {
344 printMsaStatsWriteOutfileAndRealign( _realign, verbose, id );
346 else if ( verbose ) {
350 System.out.println();
356 public final void removeWorstOffenders( final int to_remove, final boolean verbose ) throws IOException,
357 InterruptedException {
358 final GapContribution stats[] = calcGapContribtionsStats( _norm );
359 final List<String> to_remove_ids = new ArrayList<String>();
360 for( int j = 0; j < to_remove; ++j ) {
361 to_remove_ids.add( stats[ j ].getId() );
362 _removed_seq_ids.add( stats[ j ].getId() );
367 for( int i = 0; i < to_remove_ids.size(); ++i ) {
368 final String id = to_remove_ids.get( i );
369 _msa.deleteRow( id );
371 if ( ( ( _step > 0 ) && ( ( ( i + 1 ) % _step ) == 0 ) ) || ( i == ( to_remove_ids.size() - 1 ) ) ) {
372 printMsaStatsWriteOutfileAndRealign( _realign, verbose, id );
374 else if ( verbose ) {
378 System.out.println();
383 public void setPathToMafft( final String path_to_mafft ) {
384 _path_to_mafft = path_to_mafft;
387 final private void writeMsa( final String outfile, final MSA_FORMAT format ) throws IOException {
388 final Writer w = ForesterUtil.createBufferedWriter( outfile );
389 _msa.write( w, format );
393 private String writeOutfile() throws IOException {
394 final String s = writeMsa( _out_file_base, MSA_FORMAT.PHYLIP, ".aln" );
395 //writeMsa( _out_file_base, MSA_FORMAT.FASTA, ".fasta" );
399 // Returns null if not path found.
400 final public static String guessPathToMafft() {
402 if ( ForesterUtil.OS_NAME.toLowerCase().indexOf( "win" ) >= 0 ) {
403 path = "C:\\Program Files\\mafft-win\\mafft.bat";
404 if ( MsaInferrer.isInstalled( path ) ) {
408 path = "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft";
409 if ( MsaInferrer.isInstalled( path ) ) {
412 path = "/usr/local/bin/mafft";
413 if ( MsaInferrer.isInstalled( path ) ) {
416 path = "/usr/bin/mafft";
417 if ( MsaInferrer.isInstalled( path ) ) {
421 if ( MsaInferrer.isInstalled( path ) ) {
425 if ( MsaInferrer.isInstalled( path ) ) {
431 public final void setStep( int step ) {
435 public final void setNorm( boolean norm ) {
439 public final void setStepForDiagnostics( int step_for_diagnostics ) {
440 _step_for_diagnostics = step_for_diagnostics;
443 public final void setMinLength( int min_length ) {
444 _min_length = min_length;
447 public final void setGapRatio( double gap_ratio ) {
448 _gap_ratio = gap_ratio;
451 public final void setOutputFormat( MSA_FORMAT output_format ) {
452 _output_format = output_format;
455 public final void setRealign( boolean realign ) {
459 private final static void printTableHeader() {
460 System.out.print( ForesterUtil.pad( "Id", 20, ' ', false ) );
461 System.out.print( "\t" );
462 System.out.print( "Seqs" );
463 System.out.print( "\t" );
464 System.out.print( "Length" );
465 System.out.print( "\t" );
466 System.out.print( "Gaps" );
467 System.out.print( "\t" );
468 System.out.print( "MSA qual" );
469 System.out.print( "\t" );
470 System.out.println();