+// $Id:
+// FORESTER -- software libraries and applications
+// for evolutionary biology research and applications.
+//
+// Copyright (C) 2014 Christian M. Zmasek
+// Copyright (C) 2014 Sanford-Burnham Medical Research Institute
+// All rights reserved
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+//
+// WWW: https://sites.google.com/site/cmzmasek/home/software/forester
package org.forester.msa_compactor;
import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
import org.forester.evoinference.tools.BootstrapResampler;
import org.forester.msa.BasicMsa;
+import org.forester.msa.DeleteableMsa;
import org.forester.msa.Mafft;
import org.forester.msa.Msa;
import org.forester.msa.Msa.MSA_FORMAT;
import org.forester.phylogeny.PhylogenyMethods;
import org.forester.sequence.Sequence;
import org.forester.tools.ConfidenceAssessor;
-import org.forester.util.BasicDescriptiveStatistics;
-import org.forester.util.DescriptiveStatistics;
import org.forester.util.ForesterUtil;
public class MsaCompactor {
- final private static NumberFormat NF_3 = new DecimalFormat( "#.###" );
- final private static NumberFormat NF_4 = new DecimalFormat( "#.####" );
- private Msa _msa;
+ final private static NumberFormat NF_3 = new DecimalFormat( "#.###" );
+ final private static NumberFormat NF_4 = new DecimalFormat( "#.####" );
+ // private final String _maffts_opts = "--retree 1";
+ private final String _maffts_opts = "--auto";
+ private DeleteableMsa _msa;
private File _out_file_base;
private String _path_to_mafft;
private final SortedSet<String> _removed_seq_ids;
NF_3.setRoundingMode( RoundingMode.HALF_UP );
}
- private MsaCompactor( final Msa msa ) {
+ private MsaCompactor( final DeleteableMsa msa ) {
_msa = msa;
_removed_seq_ids = new TreeSet<String>();
}
final Phylogeny master_phy = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, _msa, true, matrix );
final int seed = 15;
final int n = 100;
- final ResampleableMsa resampleable_msa = new ResampleableMsa( ( BasicMsa ) _msa );
+ final ResampleableMsa resampleable_msa = new ResampleableMsa( _msa );
final int[][] resampled_column_positions = BootstrapResampler.createResampledColumnPositions( _msa.getLength(),
n,
seed );
return gappiness;
}
+ final private List<MsaProperties> chart( final int step,
+ final boolean realign,
+ final boolean norm,
+ final boolean verbose ) throws IOException, InterruptedException {
+ final GapContribution stats[] = calcGapContribtionsStats( norm );
+ final List<String> to_remove_ids = new ArrayList<String>();
+ final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
+ for( final GapContribution gap_gontribution : stats ) {
+ to_remove_ids.add( gap_gontribution.getId() );
+ }
+ if ( verbose ) {
+ printTableHeader();
+ }
+ int i = 0;
+ final int s = _msa.getNumberOfSequences();
+ final int x = ForesterUtil.roundToInt( s / 20.0 );
+ while ( _msa.getNumberOfSequences() > x ) {
+ final String id = to_remove_ids.get( i );
+ //~_msa = MsaMethods.removeSequence( _msa, id );
+ _msa.deleteRow( id );
+ if ( ( s < 500 ) || ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) ) {
+ removeGapColumns();
+ if ( realign && ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) ) {
+ realignWithMafft();
+ msa_props.add( new MsaProperties( _msa ) );
+ if ( verbose ) {
+ printMsaStats( id );
+ }
+ if ( verbose ) {
+ System.out.print( "(realigned)" );
+ }
+ }
+ else {
+ msa_props.add( new MsaProperties( _msa ) );
+ if ( verbose ) {
+ printMsaStats( id );
+ }
+ }
+ if ( verbose ) {
+ System.out.println();
+ }
+ }
+ ++i;
+ }
+ return msa_props;
+ }
+
private Phylogeny inferNJphylogeny( final PWD_DISTANCE_METHOD pwd_distance_method,
final Msa msa,
final boolean write_matrix,
m.write( ForesterUtil.createBufferedWriter( matrix_name ) );
}
catch ( final IOException e ) {
- // TODO Auto-generated catch block
e.printStackTrace();
}
}
sb.append( "\t" );
sb.append( _msa.getLength() );
sb.append( "\t" );
- sb.append( NF_3.format( MsaMethods.calcGapRatio( _msa ) ) );
+ sb.append( NF_4.format( MsaMethods.calcGapRatio( _msa ) ) );
sb.append( "\t" );
- sb.append( NF_3.format( calculateIdentityRatio( 0, _msa.getLength() - 1, _msa ).arithmeticMean() ) );
+ sb.append( NF_4.format( MsaMethods.calculateIdentityRatio( 0, _msa.getLength() - 1, _msa ).arithmeticMean() ) );
return sb;
}
+ private final void printMsaStats( final String id ) {
+ System.out.print( ForesterUtil.pad( id, 20, ' ', false ) );
+ System.out.print( "\t" );
+ final StringBuilder sb = msaStatsAsSB();
+ System.out.print( sb );
+ System.out.print( "\t" );
+ }
+
+ final private void printMsaStatsWriteOutfileAndRealign( final boolean realign,
+ final boolean verbose,
+ final String id ) throws IOException, InterruptedException {
+ if ( realign ) {
+ realignWithMafft();
+ }
+ if ( verbose ) {
+ printMsaStats( id );
+ }
+ final String s = writeOutfile();
+ if ( verbose ) {
+ System.out.print( "-> " + s + ( realign ? "\t(realigned)" : "" ) );
+ }
+ }
+
final private void realignWithMafft() throws IOException, InterruptedException {
// final MsaInferrer mafft = Mafft
// .createInstance( "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft" );
final MsaInferrer mafft = Mafft.createInstance( _path_to_mafft );
final List<String> opts = new ArrayList<String>();
- opts.add( "--maxiterate" );
- opts.add( "1000" );
- opts.add( "--localpair" );
- opts.add( "--quiet" );
- _msa = mafft.infer( _msa.asSequenceList(), opts );
+ for( final String o : _maffts_opts.split( "\\s" ) ) {
+ opts.add( o );
+ }
+ //opts.add( "--maxiterate" );
+ //opts.add( "1000" );
+ //opts.add( "--localpair" );
+ //opts.add( "--quiet" );
+ _msa = new DeleteableMsa( ( BasicMsa ) mafft.infer( _msa.asSequenceList(), opts ) );
}
final private void removeGapColumns() {
- _msa = MsaMethods.createInstance().removeGapColumns( 1, 0, _msa );
+ //~ _msa = MsaMethods.createInstance().removeGapColumns( 1, 0, _msa );
+ MsaMethods.removeGapColumns( 1, _msa );
}
final private void removeViaGapAverage( final double mean_gapiness,
for( final GapContribution gap_gontribution : stats ) {
to_remove_ids.add( gap_gontribution.getId() );
}
+ if ( verbose ) {
+ printTableHeader();
+ }
int i = 0;
while ( MsaMethods.calcGapRatio( _msa ) > mean_gapiness ) {
final String id = to_remove_ids.get( i );
- _msa = MsaMethods.removeSequence( _msa, id );
+ //`_msa = MsaMethods.removeSequence( _msa, id );
+ _msa.deleteRow( id );
removeGapColumns();
- if ( verbose ) {
- System.out.print( ForesterUtil.pad( id, 20, ' ', false ) );
- System.out.print( "\t" );
- final StringBuilder sb = msaStatsAsSB();
- System.out.print( sb );
- System.out.print( "\t" );
+ if ( ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) )
+ || ( MsaMethods.calcGapRatio( _msa ) <= mean_gapiness ) ) {
+ printMsaStatsWriteOutfileAndRealign( realign, verbose, id );
}
- if ( ( ( ( i + 1 ) % step ) == 0 ) || ( MsaMethods.calcGapRatio( _msa ) <= mean_gapiness ) ) {
- if ( realign ) {
- realignWithMafft();
- }
- final String s = writeOutfile();
- if ( verbose ) {
- System.out.print( "-> " + s );
- }
+ else if ( verbose ) {
+ printMsaStats( id );
}
if ( verbose ) {
System.out.println();
for( final GapContribution gap_gontribution : stats ) {
to_remove_ids.add( gap_gontribution.getId() );
}
+ if ( verbose ) {
+ printTableHeader();
+ }
int i = 0;
while ( _msa.getLength() > length ) {
final String id = to_remove_ids.get( i );
- _msa = MsaMethods.removeSequence( _msa, id );
+ //~_msa = MsaMethods.removeSequence( _msa, id );
+ _msa.deleteRow( id );
removeGapColumns();
- if ( verbose ) {
- System.out.print( ForesterUtil.pad( id, 20, ' ', false ) );
- System.out.print( "\t" );
- final StringBuilder sb = msaStatsAsSB();
- System.out.print( sb );
- System.out.print( "\t" );
+ if ( ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) || ( _msa.getLength() <= length ) ) {
+ printMsaStatsWriteOutfileAndRealign( realign, verbose, id );
}
- if ( ( ( ( i + 1 ) % step ) == 0 ) || ( _msa.getLength() <= length ) ) {
- if ( realign ) {
- realignWithMafft();
- }
- final String s = writeOutfile();
- if ( verbose ) {
- System.out.print( "-> " + s );
- }
+ else if ( verbose ) {
+ printMsaStats( id );
}
if ( verbose ) {
System.out.println();
to_remove_ids.add( stats[ j ].getId() );
_removed_seq_ids.add( stats[ j ].getId() );
}
+ if ( verbose ) {
+ printTableHeader();
+ }
for( int i = 0; i < to_remove_ids.size(); ++i ) {
final String id = to_remove_ids.get( i );
- _msa = MsaMethods.removeSequence( _msa, id );
+ //~ _msa = MsaMethods.removeSequence( _msa, id );
+ _msa.deleteRow( id );
removeGapColumns();
- if ( verbose ) {
- System.out.print( ForesterUtil.pad( id, 20, ' ', false ) );
- System.out.print( "\t" );
- final StringBuilder sb = msaStatsAsSB();
- System.out.print( sb );
- System.out.print( "\t" );
+ if ( ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) || ( i == ( to_remove_ids.size() - 1 ) ) ) {
+ printMsaStatsWriteOutfileAndRealign( realign, verbose, id );
}
- if ( ( ( ( i + 1 ) % step ) == 0 ) || ( i == ( to_remove_ids.size() - 1 ) ) ) {
- if ( realign ) {
- realignWithMafft();
- }
- final String s = writeOutfile();
- if ( verbose ) {
- System.out.print( "-> " + s );
- }
+ else if ( verbose ) {
+ printMsaStats( id );
}
if ( verbose ) {
System.out.println();
return s;
}
+ public final static MsaCompactor chart( final DeleteableMsa msa,
+ final int step,
+ final boolean realign,
+ final boolean norm,
+ final String path_to_mafft ) throws IOException, InterruptedException {
+ final int initial_number_of_seqs = msa.getNumberOfSequences();
+ final MsaCompactor mc = new MsaCompactor( msa );
+ if ( realign ) {
+ mc.setPathToMafft( path_to_mafft );
+ }
+ final List<MsaProperties> msa_props = mc.chart( step, realign, norm, true );
+ Chart.display( msa_props, initial_number_of_seqs );
+ return mc;
+ }
+
// Returns null if not path found.
final public static String guessPathToMafft() {
String path;
return path;
}
}
+ path = "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft";
+ if ( MsaInferrer.isInstalled( path ) ) {
+ return path;
+ }
path = "/usr/local/bin/mafft";
if ( MsaInferrer.isInstalled( path ) ) {
return path;
return null;
}
- public final static MsaCompactor reduceGapAverage( final Msa msa,
+ public final static MsaCompactor reduceGapAverage( final DeleteableMsa msa,
final double max_gap_average,
final int step,
final boolean realign,
return mc;
}
- public final static MsaCompactor reduceLength( final Msa msa,
+ public final static MsaCompactor reduceLength( final DeleteableMsa msa,
final int length,
final int step,
final boolean realign,
return mc;
}
- public final static MsaCompactor removeWorstOffenders( final Msa msa,
+ public final static MsaCompactor removeWorstOffenders( final DeleteableMsa msa,
final int worst_offenders_to_remove,
final int step,
final boolean realign,
return mc;
}
- private static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) {
- final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
- for( int c = from; c <= to; ++c ) {
- stats.addValue( MsaMethods.calculateIdentityRatio( msa, c ) );
- }
- return stats;
+ private final static void printTableHeader() {
+ System.out.print( ForesterUtil.pad( "Id", 20, ' ', false ) );
+ System.out.print( "\t" );
+ System.out.print( "Seqs" );
+ System.out.print( "\t" );
+ System.out.print( "Length" );
+ System.out.print( "\t" );
+ System.out.print( "Gaps" );
+ System.out.print( "\t" );
+ System.out.print( "MSA qual" );
+ System.out.print( "\t" );
+ System.out.println();
}
}