X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fmsa_compactor%2FMsaCompactor.java;h=0ba6b888cdb0b840c9cad15098b1c28d225f8f29;hb=d0bb37d418d945966304afe70431185c3873635a;hp=5d3a55c6d229e51887e8800aa308f3a0620fe33c;hpb=6479c35c4734850f517a6ef8de0fce500fdd6693;p=jalview.git diff --git a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java index 5d3a55c..0ba6b88 100644 --- a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java +++ b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java @@ -1,3 +1,26 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2014 Christian M. Zmasek +// Copyright (C) 2014 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// WWW: https://sites.google.com/site/cmzmasek/home/software/forester package org.forester.msa_compactor; @@ -29,18 +52,18 @@ import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; import org.forester.sequence.Sequence; import org.forester.tools.ConfidenceAssessor; -import org.forester.util.BasicDescriptiveStatistics; -import org.forester.util.DescriptiveStatistics; import org.forester.util.ForesterUtil; public class MsaCompactor { - final private static NumberFormat NF_3 = new DecimalFormat( "#.###" ); - final private static NumberFormat NF_4 = new DecimalFormat( "#.####" ); - private static final boolean VERBOSE = true; + final private static NumberFormat NF_3 = new DecimalFormat( "#.###" ); + final private static NumberFormat NF_4 = new DecimalFormat( "#.####" ); + // private final String _maffts_opts = "--retree 1"; + private final String _maffts_opts = "--auto"; private Msa _msa; - private final SortedSet _removed_seq_ids; + private File _out_file_base; private String _path_to_mafft; + private final SortedSet _removed_seq_ids; static { NF_4.setRoundingMode( RoundingMode.HALF_UP ); NF_3.setRoundingMode( RoundingMode.HALF_UP ); @@ -59,11 +82,16 @@ public class MsaCompactor { return _removed_seq_ids; } - final public void writeMsa( final File outfile, final MSA_FORMAT format, final String suffix ) throws IOException { + final public void setOutFileBase( final File out_file_base ) { + _out_file_base = out_file_base; + } + + final public String writeMsa( final File outfile, final MSA_FORMAT format, final String suffix ) throws IOException { final Double gr = MsaMethods.calcGapRatio( _msa ); - writeMsa( outfile + "_" + _msa.getNumberOfSequences() + "_" + _msa.getLength() + "_" - + ForesterUtil.roundToInt( gr * 100 ) + suffix, - format ); + final String s = outfile + "_" + _msa.getNumberOfSequences() + "_" + _msa.getLength() + "_" + + ForesterUtil.roundToInt( gr * 100 ); + writeMsa( s + suffix, format ); + return s; } final int calcNonGapResidues( final Sequence seq ) { @@ -76,6 +104,24 @@ public class MsaCompactor { return ng; } + Phylogeny pi( final String matrix ) { + final Phylogeny master_phy = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, _msa, true, matrix ); + final int seed = 15; + final int n = 100; + final ResampleableMsa resampleable_msa = new ResampleableMsa( ( BasicMsa ) _msa ); + final int[][] resampled_column_positions = BootstrapResampler.createResampledColumnPositions( _msa.getLength(), + n, + seed ); + final Phylogeny[] eval_phys = new Phylogeny[ n ]; + for( int i = 0; i < n; ++i ) { + resampleable_msa.resample( resampled_column_positions[ i ] ); + eval_phys[ i ] = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, resampleable_msa, false, null ); + } + ConfidenceAssessor.evaluate( "bootstrap", eval_phys, master_phy, true, 1 ); + PhylogenyMethods.extractFastaInformation( master_phy ); + return master_phy; + } + private final GapContribution[] calcGapContribtions( final boolean normalize_for_effective_seq_length ) { final double gappiness[] = calcGappiness(); final GapContribution stats[] = new GapContribution[ _msa.getNumberOfSequences() ]; @@ -99,28 +145,6 @@ public class MsaCompactor { final private GapContribution[] calcGapContribtionsStats( final boolean norm ) { final GapContribution stats[] = calcGapContribtions( norm ); Arrays.sort( stats ); - // for( final GapContribution stat : stats ) { - // final StringBuilder sb = new StringBuilder(); - // sb.append( stat.getId() ); - // sb.append( "\t" ); - // sb.append( NF_4.format( stat.getValue() ) ); - // sb.append( "\t" ); - // sb.append( NF_4.format( stat.median() ) ); - // sb.append( "\t" ); - // sb.append( NF_4.format( stat.getMin() ) ); - // sb.append( "\t" ); - // sb.append( NF_4.format( stat.getMax() ) ); - //sb.append( "\t" ); - //System.out.println( sb ); - // } - return stats; - } - - private static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) { - final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); - for( int c = from; c <= to; ++c ) { - stats.addValue( MsaMethods.calculateIdentityRatio( msa, c ) ); - } return stats; } @@ -134,44 +158,81 @@ public class MsaCompactor { return gappiness; } - // Returns null if not path found. - final public static String guessPathToMafft() { - String path; - if ( ForesterUtil.OS_NAME.toLowerCase().indexOf( "win" ) >= 0 ) { - path = "C:\\Program Files\\mafft-win\\mafft.bat"; - if ( MsaInferrer.isInstalled( path ) ) { - return path; - } - } - path = "/usr/local/bin/mafft"; - if ( MsaInferrer.isInstalled( path ) ) { - return path; - } - path = "/usr/bin/mafft"; - if ( MsaInferrer.isInstalled( path ) ) { - return path; + final private List chart( final int step, + final boolean realign, + final boolean norm, + final boolean verbose ) throws IOException, InterruptedException { + final GapContribution stats[] = calcGapContribtionsStats( norm ); + final List to_remove_ids = new ArrayList(); + final List msa_props = new ArrayList(); + for( final GapContribution gap_gontribution : stats ) { + to_remove_ids.add( gap_gontribution.getId() ); } - path = "/bin/mafft"; - if ( MsaInferrer.isInstalled( path ) ) { - return path; + if ( verbose ) { + printTableHeader(); } - path = "mafft"; - if ( MsaInferrer.isInstalled( path ) ) { - return path; + int i = 0; + final int s = _msa.getNumberOfSequences(); + final int x = ForesterUtil.roundToInt( s / 20.0 ); + while ( _msa.getNumberOfSequences() > x ) { + final String id = to_remove_ids.get( i ); + _msa = MsaMethods.removeSequence( _msa, id ); + if ( ( s < 500 ) || ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) ) { + removeGapColumns(); + if ( realign && ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) ) { + realignWithMafft(); + msa_props.add( new MsaProperties( _msa ) ); + if ( verbose ) { + printMsaStats( id ); + } + if ( verbose ) { + System.out.print( "(realigned)" ); + } + } + else { + msa_props.add( new MsaProperties( _msa ) ); + if ( verbose ) { + printMsaStats( id ); + } + } + if ( verbose ) { + System.out.println(); + } + } + ++i; } - return null; + return msa_props; } - final private void mafft() throws IOException, InterruptedException { - // final MsaInferrer mafft = Mafft - // .createInstance( "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft" ); - final MsaInferrer mafft = Mafft.createInstance( _path_to_mafft ); - final List opts = new ArrayList(); - opts.add( "--maxiterate" ); - opts.add( "1000" ); - opts.add( "--localpair" ); - opts.add( "--quiet" ); - _msa = mafft.infer( _msa.asSequenceList(), opts ); + private Phylogeny inferNJphylogeny( final PWD_DISTANCE_METHOD pwd_distance_method, + final Msa msa, + final boolean write_matrix, + final String matrix_name ) { + BasicSymmetricalDistanceMatrix m = null; + switch ( pwd_distance_method ) { + case KIMURA_DISTANCE: + m = PairwiseDistanceCalculator.calcKimuraDistances( msa ); + break; + case POISSON_DISTANCE: + m = PairwiseDistanceCalculator.calcPoissonDistances( msa ); + break; + case FRACTIONAL_DISSIMILARITY: + m = PairwiseDistanceCalculator.calcFractionalDissimilarities( msa ); + break; + default: + throw new IllegalArgumentException( "invalid pwd method" ); + } + if ( write_matrix ) { + try { + m.write( ForesterUtil.createBufferedWriter( matrix_name ) ); + } + catch ( final IOException e ) { + e.printStackTrace(); + } + } + final NeighborJoiningF nj = NeighborJoiningF.createInstance( false, 5 ); + final Phylogeny phy = nj.execute( m ); + return phy; } private StringBuilder msaStatsAsSB() { @@ -180,12 +241,50 @@ public class MsaCompactor { sb.append( "\t" ); sb.append( _msa.getLength() ); sb.append( "\t" ); - sb.append( NF_3.format( MsaMethods.calcGapRatio( _msa ) ) ); + sb.append( NF_4.format( MsaMethods.calcGapRatio( _msa ) ) ); sb.append( "\t" ); - sb.append( NF_3.format( calculateIdentityRatio( 0, _msa.getLength() - 1, _msa ).arithmeticMean() ) ); + sb.append( NF_4.format( MsaMethods.calculateIdentityRatio( 0, _msa.getLength() - 1, _msa ).arithmeticMean() ) ); return sb; } + private final void printMsaStats( final String id ) { + System.out.print( ForesterUtil.pad( id, 20, ' ', false ) ); + System.out.print( "\t" ); + final StringBuilder sb = msaStatsAsSB(); + System.out.print( sb ); + System.out.print( "\t" ); + } + + final private void printMsaStatsWriteOutfileAndRealign( final boolean realign, + final boolean verbose, + final String id ) throws IOException, InterruptedException { + if ( realign ) { + realignWithMafft(); + } + if ( verbose ) { + printMsaStats( id ); + } + final String s = writeOutfile(); + if ( verbose ) { + System.out.print( "-> " + s + ( realign ? "\t(realigned)" : "" ) ); + } + } + + final private void realignWithMafft() throws IOException, InterruptedException { + // final MsaInferrer mafft = Mafft + // .createInstance( "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft" ); + final MsaInferrer mafft = Mafft.createInstance( _path_to_mafft ); + final List opts = new ArrayList(); + for( final String o : _maffts_opts.split( "\\s" ) ) { + opts.add( o ); + } + //opts.add( "--maxiterate" ); + //opts.add( "1000" ); + //opts.add( "--localpair" ); + //opts.add( "--quiet" ); + _msa = mafft.infer( _msa.asSequenceList(), opts ); + } + final private void removeGapColumns() { _msa = MsaMethods.createInstance().removeGapColumns( 1, 0, _msa ); } @@ -193,147 +292,98 @@ public class MsaCompactor { final private void removeViaGapAverage( final double mean_gapiness, final int step, final boolean realign, - final File outfile, - final int minimal_effective_length ) throws IOException, - InterruptedException { - if ( step < 1 ) { - throw new IllegalArgumentException( "step cannot be less than 1" ); - } - if ( mean_gapiness < 0 ) { - throw new IllegalArgumentException( "target average gap ratio cannot be less than 0" ); - } - if ( VERBOSE ) { - System.out.println( "orig: " + msaStatsAsSB() ); - } - if ( minimal_effective_length > 1 ) { - _msa = MsaMethods.removeSequencesByMinimalLength( _msa, minimal_effective_length ); - if ( VERBOSE ) { - System.out.println( "short seq removal: " + msaStatsAsSB() ); - } + final boolean norm, + final boolean verbose ) throws IOException, InterruptedException { + final GapContribution stats[] = calcGapContribtionsStats( norm ); + final List to_remove_ids = new ArrayList(); + for( final GapContribution gap_gontribution : stats ) { + to_remove_ids.add( gap_gontribution.getId() ); } - int counter = step; - double gr; - do { - removeWorstOffenders( step, 1, false, false ); - if ( realign ) { - mafft(); + if ( verbose ) { + printTableHeader(); + } + int i = 0; + while ( MsaMethods.calcGapRatio( _msa ) > mean_gapiness ) { + final String id = to_remove_ids.get( i ); + _msa = MsaMethods.removeSequence( _msa, id ); + removeGapColumns(); + if ( ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) + || ( MsaMethods.calcGapRatio( _msa ) <= mean_gapiness ) ) { + printMsaStatsWriteOutfileAndRealign( realign, verbose, id ); + } + else if ( verbose ) { + printMsaStats( id ); } - gr = MsaMethods.calcGapRatio( _msa ); - if ( VERBOSE ) { - System.out.println( counter + ": " + msaStatsAsSB() ); + if ( verbose ) { + System.out.println(); } - // write( outfile, gr ); - counter += step; - } while ( gr > mean_gapiness ); - if ( VERBOSE ) { - System.out.println( "final: " + msaStatsAsSB() ); + ++i; } } - final private void removeViaLength( final int length, final int step, final boolean realign ) throws IOException, - InterruptedException { - if ( step < 1 ) { - throw new IllegalArgumentException( "step cannot be less than 1" ); - } - if ( length < 11 ) { - throw new IllegalArgumentException( "target length cannot be less than 1" ); + final private void removeViaLength( final int length, + final int step, + final boolean realign, + final boolean norm, + final boolean verbose ) throws IOException, InterruptedException { + final GapContribution stats[] = calcGapContribtionsStats( norm ); + final List to_remove_ids = new ArrayList(); + for( final GapContribution gap_gontribution : stats ) { + to_remove_ids.add( gap_gontribution.getId() ); } - if ( VERBOSE ) { - System.out.println( "orig: " + msaStatsAsSB() ); + if ( verbose ) { + printTableHeader(); } - int counter = step; + int i = 0; while ( _msa.getLength() > length ) { - removeWorstOffenders( step, 1, false, false ); - if ( realign ) { - mafft(); + final String id = to_remove_ids.get( i ); + _msa = MsaMethods.removeSequence( _msa, id ); + removeGapColumns(); + if ( ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) || ( _msa.getLength() <= length ) ) { + printMsaStatsWriteOutfileAndRealign( realign, verbose, id ); } - if ( VERBOSE ) { - System.out.println( counter + ": " + msaStatsAsSB() ); + else if ( verbose ) { + printMsaStats( id ); } - counter += step; - } - } - - Phylogeny pi( final String matrix ) { - final Phylogeny master_phy = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, _msa, true, matrix ); - final int seed = 15; - final int n = 100; - final ResampleableMsa resampleable_msa = new ResampleableMsa( ( BasicMsa ) _msa ); - final int[][] resampled_column_positions = BootstrapResampler.createResampledColumnPositions( _msa.getLength(), - n, - seed ); - final Phylogeny[] eval_phys = new Phylogeny[ n ]; - for( int i = 0; i < n; ++i ) { - resampleable_msa.resample( resampled_column_positions[ i ] ); - eval_phys[ i ] = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, resampleable_msa, false, null ); - } - ConfidenceAssessor.evaluate( "bootstrap", eval_phys, master_phy, true, 1 ); - PhylogenyMethods.extractFastaInformation( master_phy ); - return master_phy; - } - - private Phylogeny inferNJphylogeny( final PWD_DISTANCE_METHOD pwd_distance_method, - final Msa msa, - final boolean write_matrix, - final String matrix_name ) { - BasicSymmetricalDistanceMatrix m = null; - switch ( pwd_distance_method ) { - case KIMURA_DISTANCE: - m = PairwiseDistanceCalculator.calcKimuraDistances( msa ); - break; - case POISSON_DISTANCE: - m = PairwiseDistanceCalculator.calcPoissonDistances( msa ); - break; - case FRACTIONAL_DISSIMILARITY: - m = PairwiseDistanceCalculator.calcFractionalDissimilarities( msa ); - break; - default: - throw new IllegalArgumentException( "invalid pwd method" ); - } - if ( write_matrix ) { - try { - m.write( ForesterUtil.createBufferedWriter( matrix_name ) ); - } - catch ( final IOException e ) { - // TODO Auto-generated catch block - e.printStackTrace(); + if ( verbose ) { + System.out.println(); } + ++i; } - final NeighborJoiningF nj = NeighborJoiningF.createInstance( false, 5 ); - final Phylogeny phy = nj.execute( m ); - return phy; } final private void removeWorstOffenders( final int to_remove, final int step, final boolean realign, - final boolean norm ) throws IOException, InterruptedException { - //final Phylogeny a = pi( "a.pwd" ); - //Archaeopteryx.createApplication( a ); + final boolean norm, + final boolean verbose ) throws IOException, InterruptedException { final GapContribution stats[] = calcGapContribtionsStats( norm ); final List to_remove_ids = new ArrayList(); for( int j = 0; j < to_remove; ++j ) { to_remove_ids.add( stats[ j ].getId() ); _removed_seq_ids.add( stats[ j ].getId() ); } - //TODO if verbose/interactive - for( final String id : to_remove_ids ) { + if ( verbose ) { + printTableHeader(); + } + for( int i = 0; i < to_remove_ids.size(); ++i ) { + final String id = to_remove_ids.get( i ); _msa = MsaMethods.removeSequence( _msa, id ); removeGapColumns(); - //System.out.print( id ); - System.out.print( ForesterUtil.pad( id, 20, ' ', false ) ); - System.out.print( "\t" ); - final StringBuilder sb = msaStatsAsSB(); - System.out.println( sb ); - } - //TODO else: - //_msa = MsaMethods.removeSequences( _msa, to_remove_ids ); - //removeGapColumns(); - if ( realign ) { - mafft(); + if ( ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) || ( i == ( to_remove_ids.size() - 1 ) ) ) { + printMsaStatsWriteOutfileAndRealign( realign, verbose, id ); + } + else if ( verbose ) { + printMsaStats( id ); + } + if ( verbose ) { + System.out.println(); + } } - //final Phylogeny b = pi( "b.pwd" ); - //Archaeopteryx.createApplication( b ); + } + + private void setPathToMafft( final String path_to_mafft ) { + _path_to_mafft = path_to_mafft; } final private void writeMsa( final String outfile, final MSA_FORMAT format ) throws IOException { @@ -342,19 +392,72 @@ public class MsaCompactor { w.close(); } + private String writeOutfile() throws IOException { + final String s = writeMsa( _out_file_base, MSA_FORMAT.PHYLIP, ".aln" ); + //writeMsa( _out_file_base, MSA_FORMAT.FASTA, ".fasta" ); + return s; + } + + public final static MsaCompactor chart( final Msa msa, + final int step, + final boolean realign, + final boolean norm, + final String path_to_mafft ) throws IOException, InterruptedException { + final int initial_number_of_seqs = msa.getNumberOfSequences(); + final MsaCompactor mc = new MsaCompactor( msa ); + if ( realign ) { + mc.setPathToMafft( path_to_mafft ); + } + final List msa_props = mc.chart( step, realign, norm, true ); + Chart.display( msa_props, initial_number_of_seqs ); + return mc; + } + + // Returns null if not path found. + final public static String guessPathToMafft() { + String path; + if ( ForesterUtil.OS_NAME.toLowerCase().indexOf( "win" ) >= 0 ) { + path = "C:\\Program Files\\mafft-win\\mafft.bat"; + if ( MsaInferrer.isInstalled( path ) ) { + return path; + } + } + path = "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft"; + if ( MsaInferrer.isInstalled( path ) ) { + return path; + } + path = "/usr/local/bin/mafft"; + if ( MsaInferrer.isInstalled( path ) ) { + return path; + } + path = "/usr/bin/mafft"; + if ( MsaInferrer.isInstalled( path ) ) { + return path; + } + path = "/bin/mafft"; + if ( MsaInferrer.isInstalled( path ) ) { + return path; + } + path = "mafft"; + if ( MsaInferrer.isInstalled( path ) ) { + return path; + } + return null; + } + public final static MsaCompactor reduceGapAverage( final Msa msa, final double max_gap_average, final int step, final boolean realign, - final File out, - final int minimal_effective_length, - final String path_to_mafft ) throws IOException, - InterruptedException { + final boolean norm, + final String path_to_mafft, + final File out ) throws IOException, InterruptedException { final MsaCompactor mc = new MsaCompactor( msa ); if ( realign ) { mc.setPathToMafft( path_to_mafft ); } - mc.removeViaGapAverage( max_gap_average, step, realign, out, minimal_effective_length ); + mc.setOutFileBase( out ); + mc.removeViaGapAverage( max_gap_average, step, realign, norm, true ); return mc; } @@ -362,31 +465,45 @@ public class MsaCompactor { final int length, final int step, final boolean realign, - final String path_to_mafft ) throws IOException, - InterruptedException { + final boolean norm, + final String path_to_mafft, + final File out ) throws IOException, InterruptedException { final MsaCompactor mc = new MsaCompactor( msa ); if ( realign ) { mc.setPathToMafft( path_to_mafft ); } - mc.removeViaLength( length, step, realign ); + mc.setOutFileBase( out ); + mc.removeViaLength( length, step, realign, norm, true ); return mc; } public final static MsaCompactor removeWorstOffenders( final Msa msa, final int worst_offenders_to_remove, + final int step, final boolean realign, final boolean norm, - final String path_to_mafft ) throws IOException, - InterruptedException { + final String path_to_mafft, + final File out ) throws IOException, InterruptedException { final MsaCompactor mc = new MsaCompactor( msa ); if ( realign ) { mc.setPathToMafft( path_to_mafft ); } - mc.removeWorstOffenders( worst_offenders_to_remove, 1, realign, norm ); + mc.setOutFileBase( out ); + mc.removeWorstOffenders( worst_offenders_to_remove, step, realign, norm, true ); return mc; } - private void setPathToMafft( final String path_to_mafft ) { - _path_to_mafft = path_to_mafft; + private final static void printTableHeader() { + System.out.print( ForesterUtil.pad( "Id", 20, ' ', false ) ); + System.out.print( "\t" ); + System.out.print( "Seqs" ); + System.out.print( "\t" ); + System.out.print( "Length" ); + System.out.print( "\t" ); + System.out.print( "Gaps" ); + System.out.print( "\t" ); + System.out.print( "MSA qual" ); + System.out.print( "\t" ); + System.out.println(); } }