X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fmsa_compactor%2FMsaCompactor.java;h=689eb1c178419307716858672eb430924a28e670;hb=cb3fc11f9567fa1f17a0ca64eafc31c4a9075210;hp=88deac871f7e38eb0cd52af96909e21a3003ed78;hpb=36de46cc85f62604f71a8b788d2aabdfe6b80935;p=jalview.git diff --git a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java index 88deac8..689eb1c 100644 --- a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java +++ b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java @@ -1,3 +1,26 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2014 Christian M. Zmasek +// Copyright (C) 2014 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// WWW: https://sites.google.com/site/cmzmasek/home/software/forester package org.forester.msa_compactor; @@ -19,6 +42,7 @@ import org.forester.evoinference.distance.PairwiseDistanceCalculator.PWD_DISTANC import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix; import org.forester.evoinference.tools.BootstrapResampler; import org.forester.msa.BasicMsa; +import org.forester.msa.DeleteableMsa; import org.forester.msa.Mafft; import org.forester.msa.Msa; import org.forester.msa.Msa.MSA_FORMAT; @@ -29,25 +53,24 @@ import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; import org.forester.sequence.Sequence; import org.forester.tools.ConfidenceAssessor; -import org.forester.util.BasicDescriptiveStatistics; -import org.forester.util.DescriptiveStatistics; import org.forester.util.ForesterUtil; public class MsaCompactor { final private static NumberFormat NF_3 = new DecimalFormat( "#.###" ); final private static NumberFormat NF_4 = new DecimalFormat( "#.####" ); - private Msa _msa; + // private final String _maffts_opts = "--retree 1"; + private final String _maffts_opts = "--auto"; + private DeleteableMsa _msa; private File _out_file_base; private String _path_to_mafft; private final SortedSet _removed_seq_ids; - private final String _maffts_opts = "--retree 1"; static { NF_4.setRoundingMode( RoundingMode.HALF_UP ); NF_3.setRoundingMode( RoundingMode.HALF_UP ); } - private MsaCompactor( final Msa msa ) { + private MsaCompactor( final DeleteableMsa msa ) { _msa = msa; _removed_seq_ids = new TreeSet(); } @@ -86,7 +109,7 @@ public class MsaCompactor { final Phylogeny master_phy = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, _msa, true, matrix ); final int seed = 15; final int n = 100; - final ResampleableMsa resampleable_msa = new ResampleableMsa( ( BasicMsa ) _msa ); + final ResampleableMsa resampleable_msa = new ResampleableMsa( _msa ); final int[][] resampled_column_positions = BootstrapResampler.createResampledColumnPositions( _msa.getLength(), n, seed ); @@ -136,6 +159,53 @@ public class MsaCompactor { return gappiness; } + final private List chart( final int step, + final boolean realign, + final boolean norm, + final boolean verbose ) throws IOException, InterruptedException { + final GapContribution stats[] = calcGapContribtionsStats( norm ); + final List to_remove_ids = new ArrayList(); + final List msa_props = new ArrayList(); + for( final GapContribution gap_gontribution : stats ) { + to_remove_ids.add( gap_gontribution.getId() ); + } + if ( verbose ) { + printTableHeader(); + } + int i = 0; + final int s = _msa.getNumberOfSequences(); + final int x = ForesterUtil.roundToInt( s / 20.0 ); + while ( _msa.getNumberOfSequences() > x ) { + final String id = to_remove_ids.get( i ); + //~_msa = MsaMethods.removeSequence( _msa, id ); + _msa.deleteRow( id ); + if ( ( s < 500 ) || ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) ) { + removeGapColumns(); + if ( realign && ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) ) { + realignWithMafft(); + msa_props.add( new MsaProperties( _msa ) ); + if ( verbose ) { + printMsaStats( id ); + } + if ( verbose ) { + System.out.print( "(realigned)" ); + } + } + else { + msa_props.add( new MsaProperties( _msa ) ); + if ( verbose ) { + printMsaStats( id ); + } + } + if ( verbose ) { + System.out.println(); + } + } + ++i; + } + return msa_props; + } + private Phylogeny inferNJphylogeny( final PWD_DISTANCE_METHOD pwd_distance_method, final Msa msa, final boolean write_matrix, @@ -175,10 +245,33 @@ public class MsaCompactor { sb.append( "\t" ); sb.append( NF_4.format( MsaMethods.calcGapRatio( _msa ) ) ); sb.append( "\t" ); - sb.append( NF_4.format( calculateIdentityRatio( 0, _msa.getLength() - 1, _msa ).arithmeticMean() ) ); + sb.append( NF_4.format( MsaMethods.calculateIdentityRatio( 0, _msa.getLength() - 1, _msa ).arithmeticMean() ) ); return sb; } + private final void printMsaStats( final String id ) { + System.out.print( ForesterUtil.pad( id, 20, ' ', false ) ); + System.out.print( "\t" ); + final StringBuilder sb = msaStatsAsSB(); + System.out.print( sb ); + System.out.print( "\t" ); + } + + final private void printMsaStatsWriteOutfileAndRealign( final boolean realign, + final boolean verbose, + final String id ) throws IOException, InterruptedException { + if ( realign ) { + realignWithMafft(); + } + if ( verbose ) { + printMsaStats( id ); + } + final String s = writeOutfile(); + if ( verbose ) { + System.out.print( "-> " + s + ( realign ? "\t(realigned)" : "" ) ); + } + } + final private void realignWithMafft() throws IOException, InterruptedException { // final MsaInferrer mafft = Mafft // .createInstance( "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft" ); @@ -191,11 +284,12 @@ public class MsaCompactor { //opts.add( "1000" ); //opts.add( "--localpair" ); //opts.add( "--quiet" ); - _msa = mafft.infer( _msa.asSequenceList(), opts ); + _msa = new DeleteableMsa( ( BasicMsa ) mafft.infer( _msa.asSequenceList(), opts ) ); } final private void removeGapColumns() { - _msa = MsaMethods.createInstance().removeGapColumns( 1, 0, _msa ); + //~ _msa = MsaMethods.createInstance().removeGapColumns( 1, 0, _msa ); + MsaMethods.removeGapColumns( 1, _msa ); } final private void removeViaGapAverage( final double mean_gapiness, @@ -214,23 +308,15 @@ public class MsaCompactor { int i = 0; while ( MsaMethods.calcGapRatio( _msa ) > mean_gapiness ) { final String id = to_remove_ids.get( i ); - _msa = MsaMethods.removeSequence( _msa, id ); + //`_msa = MsaMethods.removeSequence( _msa, id ); + _msa.deleteRow( id ); removeGapColumns(); - if ( verbose ) { - System.out.print( ForesterUtil.pad( id, 20, ' ', false ) ); - System.out.print( "\t" ); - final StringBuilder sb = msaStatsAsSB(); - System.out.print( sb ); - System.out.print( "\t" ); + if ( ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) + || ( MsaMethods.calcGapRatio( _msa ) <= mean_gapiness ) ) { + printMsaStatsWriteOutfileAndRealign( realign, verbose, id ); } - if ( ( ( ( i + 1 ) % step ) == 0 ) || ( MsaMethods.calcGapRatio( _msa ) <= mean_gapiness ) ) { - if ( realign ) { - realignWithMafft(); - } - final String s = writeOutfile(); - if ( verbose ) { - System.out.print( "-> " + s ); - } + else if ( verbose ) { + printMsaStats( id ); } if ( verbose ) { System.out.println(); @@ -239,20 +325,6 @@ public class MsaCompactor { } } - private final static void printTableHeader() { - System.out.print( ForesterUtil.pad( "Id", 20, ' ', false ) ); - System.out.print( "\t" ); - System.out.print( "Seqs" ); - System.out.print( "\t" ); - System.out.print( "Length" ); - System.out.print( "\t" ); - System.out.print( "Gaps" ); - System.out.print( "\t" ); - System.out.print( "MSA qual" ); - System.out.print( "\t" ); - System.out.println(); - } - final private void removeViaLength( final int length, final int step, final boolean realign, @@ -269,23 +341,14 @@ public class MsaCompactor { int i = 0; while ( _msa.getLength() > length ) { final String id = to_remove_ids.get( i ); - _msa = MsaMethods.removeSequence( _msa, id ); + //~_msa = MsaMethods.removeSequence( _msa, id ); + _msa.deleteRow( id ); removeGapColumns(); - if ( verbose ) { - System.out.print( ForesterUtil.pad( id, 20, ' ', false ) ); - System.out.print( "\t" ); - final StringBuilder sb = msaStatsAsSB(); - System.out.print( sb ); - System.out.print( "\t" ); + if ( ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) || ( _msa.getLength() <= length ) ) { + printMsaStatsWriteOutfileAndRealign( realign, verbose, id ); } - if ( ( ( ( i + 1 ) % step ) == 0 ) || ( _msa.getLength() <= length ) ) { - if ( realign ) { - realignWithMafft(); - } - final String s = writeOutfile(); - if ( verbose ) { - System.out.print( "-> " + s ); - } + else if ( verbose ) { + printMsaStats( id ); } if ( verbose ) { System.out.println(); @@ -310,23 +373,14 @@ public class MsaCompactor { } for( int i = 0; i < to_remove_ids.size(); ++i ) { final String id = to_remove_ids.get( i ); - _msa = MsaMethods.removeSequence( _msa, id ); + //~ _msa = MsaMethods.removeSequence( _msa, id ); + _msa.deleteRow( id ); removeGapColumns(); - if ( verbose ) { - System.out.print( ForesterUtil.pad( id, 20, ' ', false ) ); - System.out.print( "\t" ); - final StringBuilder sb = msaStatsAsSB(); - System.out.print( sb ); - System.out.print( "\t" ); + if ( ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) || ( i == ( to_remove_ids.size() - 1 ) ) ) { + printMsaStatsWriteOutfileAndRealign( realign, verbose, id ); } - if ( ( ( ( i + 1 ) % step ) == 0 ) || ( i == ( to_remove_ids.size() - 1 ) ) ) { - if ( realign ) { - realignWithMafft(); - } - final String s = writeOutfile(); - if ( verbose ) { - System.out.print( "-> " + s ); - } + else if ( verbose ) { + printMsaStats( id ); } if ( verbose ) { System.out.println(); @@ -350,6 +404,21 @@ public class MsaCompactor { return s; } + public final static MsaCompactor chart( final DeleteableMsa msa, + final int step, + final boolean realign, + final boolean norm, + final String path_to_mafft ) throws IOException, InterruptedException { + final int initial_number_of_seqs = msa.getNumberOfSequences(); + final MsaCompactor mc = new MsaCompactor( msa ); + if ( realign ) { + mc.setPathToMafft( path_to_mafft ); + } + final List msa_props = mc.chart( step, realign, norm, true ); + Chart.display( msa_props, initial_number_of_seqs ); + return mc; + } + // Returns null if not path found. final public static String guessPathToMafft() { String path; @@ -382,7 +451,7 @@ public class MsaCompactor { return null; } - public final static MsaCompactor reduceGapAverage( final Msa msa, + public final static MsaCompactor reduceGapAverage( final DeleteableMsa msa, final double max_gap_average, final int step, final boolean realign, @@ -398,7 +467,7 @@ public class MsaCompactor { return mc; } - public final static MsaCompactor reduceLength( final Msa msa, + public final static MsaCompactor reduceLength( final DeleteableMsa msa, final int length, final int step, final boolean realign, @@ -414,7 +483,7 @@ public class MsaCompactor { return mc; } - public final static MsaCompactor removeWorstOffenders( final Msa msa, + public final static MsaCompactor removeWorstOffenders( final DeleteableMsa msa, final int worst_offenders_to_remove, final int step, final boolean realign, @@ -430,11 +499,17 @@ public class MsaCompactor { return mc; } - private static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) { - final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); - for( int c = from; c <= to; ++c ) { - stats.addValue( MsaMethods.calculateIdentityRatio( msa, c ) ); - } - return stats; + private final static void printTableHeader() { + System.out.print( ForesterUtil.pad( "Id", 20, ' ', false ) ); + System.out.print( "\t" ); + System.out.print( "Seqs" ); + System.out.print( "\t" ); + System.out.print( "Length" ); + System.out.print( "\t" ); + System.out.print( "Gaps" ); + System.out.print( "\t" ); + System.out.print( "MSA qual" ); + System.out.print( "\t" ); + System.out.println(); } }