2 package org.forester.msa;
4 import java.io.IOException;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7 import java.util.Comparator;
9 import java.util.SortedSet;
10 import java.util.TreeSet;
12 import org.forester.sequence.Sequence;
13 import org.forester.util.BasicDescriptiveStatistics;
14 import org.forester.util.DescriptiveStatistics;
15 import org.forester.util.ForesterUtil;
17 public class MsaCompactor {
// Constant flag, always true. It is not referenced by any other visible line.
// NOTE(review): presumably meant to guard the progress println calls in
// removeViaGapAverage/removeViaLength -- confirm against the full file.
19 private static final boolean VERBOSE = true;
// Sorted set of sequence identifiers. The instance method removeWorstOffenders adds to it
// and getRemovedSeqIds() returns it.
21 private final SortedSet<String> _removed_seq_ids;
/**
 * Creates a compactor whose set of removed sequence identifiers starts out empty.
 * The constructor is private; the static methods of this class create the instances.
 *
 * @param msa the alignment passed in by the static factory methods
 */
23 private MsaCompactor( final Msa msa ) {
// NOTE(review): msa is not used on any visible line and a statement appears to be missing
// before the next one -- presumably the assignment of msa to _msa; confirm.
25 _removed_seq_ids = new TreeSet<String>();
28 final public SortedSet<String> getRemovedSeqIds() {
29 return _removed_seq_ids;
/**
 * Accessor for the alignment held by this compactor.
 * NOTE(review): only the signature is visible; the body (presumably returning _msa)
 * appears to be missing -- confirm against the full file.
 */
32 final public Msa getMsa() {
36 public final static MsaCompactor removeWorstOffenders( final Msa msa,
37 final int worst_offenders_to_remove,
38 final boolean realign ) throws IOException,
39 InterruptedException {
40 final MsaCompactor mc = new MsaCompactor( msa );
41 mc.removeWorstOffenders( worst_offenders_to_remove, 1, realign );
45 public final static MsaCompactor reduceGapAverage( final Msa msa,
46 final double max_gap_average,
48 final boolean realign ) throws IOException, InterruptedException {
49 final MsaCompactor mc = new MsaCompactor( msa );
50 mc.removeViaGapAverage( max_gap_average, step, realign );
/**
 * Static entry point: wraps msa in a new compactor and runs removeViaLength on it.
 *
 * NOTE(review): length and step are used in the body, but their parameter declarations
 * and the return statement appear to be missing -- confirm against the full file.
 *
 * @param msa the alignment handed to the new compactor
 * @param realign forwarded unchanged to removeViaLength
 * @throws IOException propagated from removeViaLength
 * @throws InterruptedException propagated from removeViaLength
 */
54 public final static MsaCompactor reduceLength( final Msa msa,
57 final boolean realign ) throws IOException, InterruptedException {
58 final MsaCompactor mc = new MsaCompactor( msa );
59 mc.removeViaLength( length, step, realign );
63 final private void removeGapColumns() {
64 _msa = MsaMethods.createInstance().removeGapColumns( 1, 0, _msa );
/**
 * Takes the first to_remove entries of calcStats(), records their descriptions in
 * _removed_seq_ids, and replaces _msa with the result of removing those sequences.
 *
 * NOTE(review): step and realign are not used on any visible line, and the end of the method
 * appears to be missing -- confirm against the full file.
 *
 * @param to_remove number of leading calcStats() entries whose sequences are removed
 * @param step not referenced in the visible lines
 * @param realign not referenced in the visible lines
 * @throws IOException declared; no visible line shows where it originates
 * @throws InterruptedException declared; no visible line shows where it originates
 */
67 final private void removeWorstOffenders( final int to_remove, final int step, final boolean realign )
68 throws IOException, InterruptedException {
69 final DescriptiveStatistics stats[] = calcStats();
70 final List<String> to_remove_ids = new ArrayList<String>();
// NOTE(review): there is no bounds check; stats[ j ] goes out of range when to_remove
// exceeds stats.length.
71 for( int j = 0; j < to_remove; ++j ) {
// Presumably the description is the sequence identifier given to the statistics object in calc().
72 to_remove_ids.add( stats[ j ].getDescription() );
73 _removed_seq_ids.add( stats[ j ].getDescription() );
75 _msa = MsaMethods.removeSequences( _msa, to_remove_ids );
/**
 * Re-infers the held alignment: passes _msa.asSequenceList() and a fixed option list to a
 * Mafft-created MsaInferrer and stores the result back into _msa.
 *
 * @throws IOException declared; presumably raised by the inferrer -- confirm
 * @throws InterruptedException declared; presumably raised by the inferrer -- confirm
 */
82 final private void mafft() throws IOException, InterruptedException {
// NOTE(review): hard-coded, machine-specific path to the executable; consider making it configurable.
83 final MsaInferrer mafft = Mafft.createInstance( "/home/czmasek/bin/mafft" );
84 final List<String> opts = new ArrayList<String>();
85 opts.add( "--maxiterate" );
// NOTE(review): a line appears to be missing here, likely the value for --maxiterate -- confirm.
87 opts.add( "--localpair" );
88 opts.add( "--quiet" );
89 _msa = mafft.infer( _msa.asSequenceList(), opts );
/**
 * Calls removeWorstOffenders( step, 1, false ) repeatedly for as long as the arithmetic mean from
 * MsaMethods.calcBasicGapinessStatistics( _msa ) is greater than mean_gapiness. Prints the
 * alignment length and that mean, passed through ForesterUtil.round( ..., 3 ), to standard output.
 *
 * NOTE(review): several lines appear to be missing -- the declaration and update of counter, any
 * use of realign, and presumably VERBOSE guards around the println calls; confirm against the
 * full file.
 *
 * @param mean_gapiness threshold; the loop continues while the mean exceeds it
 * @param step number of sequences passed to removeWorstOffenders per iteration
 * @param realign not referenced in the visible lines
 * @throws IOException propagated from removeWorstOffenders
 * @throws InterruptedException propagated from removeWorstOffenders
 */
92 final private void removeViaGapAverage( final double mean_gapiness, final int step, final boolean realign )
93 throws IOException, InterruptedException {
95 System.out.println( "start: " + _msa.getLength() + " "
96 + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) );
// The statistics are recomputed from the current _msa on every loop test.
// NOTE(review): with step <= 0 nothing visible changes _msa, so the loop may not terminate -- confirm.
99 while ( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean() > mean_gapiness ) {
100 removeWorstOffenders( step, 1, false );
105 System.out.println( counter + ": " + _msa.getLength() + " "
106 + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) );
/**
 * Calls removeWorstOffenders( step, 1, false ) repeatedly for as long as _msa.getLength() is
 * greater than length. Prints the alignment length and the mean from
 * MsaMethods.calcBasicGapinessStatistics( _msa ), passed through ForesterUtil.round( ..., 3 ),
 * to standard output.
 *
 * NOTE(review): several lines appear to be missing -- the declaration and update of counter, any
 * use of realign, and presumably VERBOSE guards around the println calls; confirm against the
 * full file.
 *
 * @param length target; the loop continues while the alignment length exceeds it
 * @param step number of sequences passed to removeWorstOffenders per iteration
 * @param realign not referenced in the visible lines
 * @throws IOException propagated from removeWorstOffenders
 * @throws InterruptedException propagated from removeWorstOffenders
 */
112 final private void removeViaLength( final int length, final int step, final boolean realign ) throws IOException,
113 InterruptedException {
115 System.out.println( "start: " + _msa.getLength() + " "
116 + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) );
// NOTE(review): with step <= 0 nothing visible changes _msa, so the loop may not terminate -- confirm.
119 while ( _msa.getLength() > length ) {
120 removeWorstOffenders( step, 1, false );
125 System.out.println( counter + ": " + _msa.getLength() + " "
126 + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) );
/**
 * Obtains the per-sequence statistics array from calc().
 *
 * NOTE(review): lines appear to be missing -- presumably a sort( stats ) call after calc() and
 * the return statement; confirm against the full file.
 */
132 final private DescriptiveStatistics[] calcStats() {
133 final DescriptiveStatistics stats[] = calc();
// Only commented-out debug printing is left in this loop; s is not otherwise used.
135 for( int i = 0; i < stats.length; ++i ) {
136 final DescriptiveStatistics s = stats[ i ];
137 // System.out.print( s.getDescription() );
138 // System.out.print( "\t" );
139 // System.out.print( s.arithmeticMean() );
140 // System.out.print( "\t(" );
141 // System.out.print( s.arithmeticMean() );
142 // System.out.print( ")" );
143 // System.out.print( "\t" );
144 // System.out.print( s.getMin() );
145 // System.out.print( "\t" );
146 // System.out.print( s.getMax() );
147 // System.out.println();
152 private final static void sort( final DescriptiveStatistics stats[] ) {
153 Arrays.sort( stats, new DescriptiveStatisticsComparator( false ) );
156 private final DescriptiveStatistics[] calc() {
157 final double gappiness[] = calcGappiness();
158 final DescriptiveStatistics stats[] = new DescriptiveStatistics[ _msa.getNumberOfSequences() ];
159 for( int row = 0; row < _msa.getNumberOfSequences(); ++row ) {
160 stats[ row ] = new BasicDescriptiveStatistics( _msa.getIdentifier( row ) );
161 for( int col = 0; col < _msa.getLength(); ++col ) {
162 if ( _msa.getResidueAt( row, col ) != Sequence.GAP ) {
163 stats[ row ].addValue( gappiness[ col ] );
170 private final double[] calcGappiness() {
171 final double gappiness[] = new double[ _msa.getLength() ];
172 final int seqs = _msa.getNumberOfSequences();
173 for( int i = 0; i < gappiness.length; ++i ) {
174 gappiness[ i ] = ( double ) MsaMethods.calcGapSumPerColumn( _msa, i ) / seqs;
179 final static class DescriptiveStatisticsComparator implements Comparator<DescriptiveStatistics> {
181 final boolean _ascending;
/**
 * Creates a comparator that orders statistics by arithmetic mean.
 *
 * @param ascending direction flag kept in {@code _ascending} and consulted by {@code compare}
 */
public DescriptiveStatisticsComparator( final boolean ascending ) {
    this._ascending = ascending;
}
188 public final int compare( final DescriptiveStatistics s0, final DescriptiveStatistics s1 ) {
189 if ( s0.arithmeticMean() < s1.arithmeticMean() ) {
190 return _ascending ? -1 : 1;
192 else if ( s0.arithmeticMean() > s1.arithmeticMean() ) {
193 return _ascending ? 1 : -1;