2 package org.forester.msa;
4 import java.io.IOException;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7 import java.util.Comparator;
9 import java.util.SortedSet;
10 import java.util.TreeSet;
12 import org.forester.sequence.Sequence;
13 import org.forester.util.BasicDescriptiveStatistics;
14 import org.forester.util.DescriptiveStatistics;
15 import org.forester.util.ForesterUtil;
17 public class MsaCompactor {
19 private static final boolean VERBOSE = true;
21 private final SortedSet<String> _removed_seq_ids;
23 private MsaCompactor( final Msa msa ) {
25 _removed_seq_ids = new TreeSet<String>();
28 final public SortedSet<String> getRemovedSeqIds() {
29 return _removed_seq_ids;
32 final public Msa getMsa() {
36 public final static MsaCompactor removeWorstOffenders( final Msa msa,
37 final int worst_offenders_to_remove,
38 final boolean realign ) throws IOException,
39 InterruptedException {
40 final MsaCompactor mc = new MsaCompactor( msa );
41 mc.removeWorstOffenders( worst_offenders_to_remove, 1, realign );
45 public final static MsaCompactor reduceGapAverage( final Msa msa,
46 final double max_gap_average,
48 final boolean realign ) throws IOException, InterruptedException {
49 final MsaCompactor mc = new MsaCompactor( msa );
50 mc.removeViaGapAverage( max_gap_average, step, realign );
54 public final static MsaCompactor reduceLength( final Msa msa,
57 final boolean realign ) throws IOException, InterruptedException {
58 final MsaCompactor mc = new MsaCompactor( msa );
59 mc.removeViaLength( length, step, realign );
63 final private void removeGapColumns() {
64 _msa = MsaMethods.createInstance().removeGapColumns( 1, 0, _msa );
67 final private void removeWorstOffenders( final int to_remove, final int step, final boolean realign )
68 throws IOException, InterruptedException {
69 final DescriptiveStatistics stats[] = calcStats();
70 final List<String> to_remove_ids = new ArrayList<String>();
71 for( int j = 0; j < to_remove; ++j ) {
72 to_remove_ids.add( stats[ j ].getDescription() );
73 _removed_seq_ids.add( stats[ j ].getDescription() );
75 _msa = MsaMethods.removeSequences( _msa, to_remove_ids );
82 final private void mafft() throws IOException, InterruptedException {
83 final MsaInferrer mafft = Mafft.createInstance( "/home/czmasek/bin/mafft" );
84 final List<String> opts = new ArrayList<String>();
85 // opts.add( "--maxiterate" );
86 // opts.add( "1000" );
87 // opts.add( "--localpair" );
88 opts.add( "--quiet" );
89 _msa = mafft.infer( _msa.asSequenceList(), opts );
92 final private void removeViaGapAverage( final double mean_gapiness, final int step, final boolean realign )
93 throws IOException, InterruptedException {
95 throw new IllegalArgumentException( "step cannot be less than 1" );
97 if ( mean_gapiness < 0 ) {
98 throw new IllegalArgumentException( "target average gap ratio cannot be less than 0" );
101 System.out.println( "start: " + _msa.getLength() + " "
102 + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) );
105 while ( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean() > mean_gapiness ) {
106 removeWorstOffenders( step, 1, false );
111 System.out.println( counter + ": " + _msa.getLength() + " "
112 + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) );
118 final private void removeViaLength( final int length, final int step, final boolean realign ) throws IOException,
119 InterruptedException {
121 throw new IllegalArgumentException( "step cannot be less than 1" );
124 throw new IllegalArgumentException( "target length cannot be less than 1" );
127 System.out.println( "start: " + _msa.getLength() + " "
128 + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) );
131 while ( _msa.getLength() > length ) {
132 removeWorstOffenders( step, 1, false );
137 System.out.println( counter + ": " + _msa.getLength() + " "
138 + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) );
144 final private DescriptiveStatistics[] calcStats() {
145 final DescriptiveStatistics stats[] = calc();
147 for( int i = 0; i < stats.length; ++i ) {
148 final DescriptiveStatistics s = stats[ i ];
149 // System.out.print( s.getDescription() );
150 // System.out.print( "\t" );
151 // System.out.print( s.arithmeticMean() );
152 // System.out.print( "\t(" );
153 // System.out.print( s.arithmeticMean() );
154 // System.out.print( ")" );
155 // System.out.print( "\t" );
156 // System.out.print( s.getMin() );
157 // System.out.print( "\t" );
158 // System.out.print( s.getMax() );
159 // System.out.println();
164 private final static void sort( final DescriptiveStatistics stats[] ) {
165 Arrays.sort( stats, new DescriptiveStatisticsComparator( false ) );
168 private final DescriptiveStatistics[] calc() {
169 final double gappiness[] = calcGappiness();
170 final DescriptiveStatistics stats[] = new DescriptiveStatistics[ _msa.getNumberOfSequences() ];
171 for( int row = 0; row < _msa.getNumberOfSequences(); ++row ) {
172 stats[ row ] = new BasicDescriptiveStatistics( _msa.getIdentifier( row ) );
173 for( int col = 0; col < _msa.getLength(); ++col ) {
174 if ( _msa.getResidueAt( row, col ) != Sequence.GAP ) {
175 stats[ row ].addValue( gappiness[ col ] );
182 private final double[] calcGappiness() {
183 final double gappiness[] = new double[ _msa.getLength() ];
184 final int seqs = _msa.getNumberOfSequences();
185 for( int i = 0; i < gappiness.length; ++i ) {
186 gappiness[ i ] = ( double ) MsaMethods.calcGapSumPerColumn( _msa, i ) / seqs;
191 final static class DescriptiveStatisticsComparator implements Comparator<DescriptiveStatistics> {
193 final boolean _ascending;
195 public DescriptiveStatisticsComparator( final boolean ascending ) {
196 _ascending = ascending;
200 public final int compare( final DescriptiveStatistics s0, final DescriptiveStatistics s1 ) {
201 if ( s0.arithmeticMean() < s1.arithmeticMean() ) {
202 return _ascending ? -1 : 1;
204 else if ( s0.arithmeticMean() > s1.arithmeticMean() ) {
205 return _ascending ? 1 : -1;