import org.forester.io.parsers.FastaParser;
import org.forester.io.parsers.GeneralMsaParser;
-import org.forester.msa.Msa;
+import org.forester.msa.BasicMsa;
+import org.forester.msa.DeleteableMsa;
import org.forester.msa.MsaInferrer;
import org.forester.msa_compactor.MsaCompactor;
import org.forester.util.CommandLineArguments;
if ( dissallowed_options.length() > 0 ) {
ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options );
}
- Msa msa = null;
+ DeleteableMsa msa = null;
final FileInputStream is = new FileInputStream( in );
if ( FastaParser.isLikelyFasta( in ) ) {
- msa = FastaParser.parseMsa( is );
+ msa = new DeleteableMsa( ( BasicMsa ) FastaParser.parseMsa( is ) );
}
else {
- msa = GeneralMsaParser.parse( is );
+ msa = new DeleteableMsa( ( BasicMsa ) GeneralMsaParser.parse( is ) );
}
if ( cla.isOptionSet( REMOVE_WORST_OFFENDERS_OPTION ) ) {
worst_remove = cla.getOptionValueAsInt( REMOVE_WORST_OFFENDERS_OPTION );
final BasicMsa msa = new BasicMsa( seqs.size(), length, seqs.get( 0 ).getType() );
for( int row = 0; row < seqs.size(); ++row ) {
final Sequence seq = seqs.get( row );
+ //
+ // int x = length - seq.getLength();
+ // if ( x > 0 ) {
+ // String a = "";
+ // for( int i = 0; i < x; i++ ) {
+ // a += "-";
+ // }
+ // seq = BasicSequence.createAaSequence( seq.getIdentifier(), seq.getMolecularSequenceAsString() + a );
+ // }
+ // else {
+ // seq = BasicSequence.createAaSequence( seq.getIdentifier(), seq.getMolecularSequenceAsString()
+ // .substring( 0, length ) );
+ // }
+ //
if ( seq.getLength() != length ) {
throw new IllegalArgumentException( "illegal attempt to build msa from sequences of unequal length ["
+ seq.getIdentifier() + "]" );
--- /dev/null
+// / $Id:
+// FORESTER -- software libraries and applications
+// for evolutionary biology research and applications.
+//
+// Copyright (C) 2014 Christian M. Zmasek
+// Copyright (C) 2014 Sanford-Burnham Medical Research Institute
+// All rights reserved
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+//
+// WWW: https://sites.google.com/site/cmzmasek/home/software/forester
+
+package org.forester.msa;
+
+import java.util.HashMap;
+
+public final class DeleteableMsa extends BasicMsa {
+
+ private int _length = 0;
+ private int _mapped_col_positions[] = null;
+ private int _mapped_row_positions[] = null;
+ private int _seqs = 0;
+ private HashMap<String, Integer> _seq_id_to_row_map = null;
+
+ public DeleteableMsa( final BasicMsa msa ) {
+ super( msa );
+ _mapped_col_positions = new int[ msa.getLength() ];
+ _mapped_row_positions = new int[ msa.getNumberOfSequences() ];
+ for( int i = 0; i < _mapped_col_positions.length; ++i ) {
+ _mapped_col_positions[ i ] = i;
+ }
+ for( int i = 0; i < _mapped_row_positions.length; ++i ) {
+ _mapped_row_positions[ i ] = i;
+ }
+ _seq_id_to_row_map = new HashMap<String, Integer>();
+ for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+ _seq_id_to_row_map.put( msa.getIdentifier( row ), row );
+ }
+ _length = msa.getLength();
+ _seqs = msa.getNumberOfSequences();
+ }
+
+ public void deleteColumn( final int col ) {
+ if ( col >= _length || col < 0 ) {
+ throw new IllegalArgumentException( "column " + col + " is out of range" );
+ }
+ for( int c = col; c < _length - 1; ++c ) {
+ _mapped_col_positions[ c ] = _mapped_col_positions[ c + 1 ];
+ }
+ --_length;
+ }
+
+ public void deleteRow( final int row ) {
+ if ( row >= _seqs || row < 0 ) {
+ throw new IllegalArgumentException( "row " + row + " is out of range" );
+ }
+ for( int r = row; r < _seqs - 1; ++r ) {
+ _mapped_row_positions[ r ] = _mapped_row_positions[ r + 1 ];
+ }
+ --_seqs;
+ }
+
+ public void deleteRow( final String id ) {
+ int row = -1;
+ for( int r = 0; r < getNumberOfSequences(); ++r ) {
+ if ( getIdentifier( r ).equals( id ) ) {
+ row = r;
+ break;
+ }
+ }
+ if ( row < 0 ) {
+ throw new IllegalArgumentException( "id [" + id + "] not found" );
+ }
+ deleteRow( row );
+ }
+
+ @Override
+ public String getIdentifier( final int row ) {
+ return super.getIdentifier( _mapped_row_positions[ row ] );
+ }
+
+ @Override
+ public int getLength() {
+ return _length;
+ }
+
+ @Override
+ public int getNumberOfSequences() {
+ return _seqs;
+ }
+
+ @Override
+ public char getResidueAt( final int row, final int col ) {
+ return super.getResidueAt( _mapped_row_positions[ row ], _mapped_col_positions[ col ] );
+ }
+
+ @Override
+ public void setIdentifier( final int row, final String id ) {
+ super.setIdentifier( _mapped_row_positions[ row ], id );
+ }
+
+ @Override
+ public void setResidueAt( final int row, final int col, final char residue ) {
+ super.setResidueAt( _mapped_row_positions[ row ], _mapped_col_positions[ col ], residue );
+ }
+}
final List<Sequence> seqs = new ArrayList<Sequence>();
for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
if ( !to_remove_id.equals( msa.getIdentifier( row ) ) ) {
- seqs.add( BasicSequence.copySequence( msa.getSequence( row ) ) );
+ seqs.add( msa.getSequence( row ) );
}
}
if ( seqs.size() < 1 ) {
final List<Sequence> seqs = new ArrayList<Sequence>();
for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
if ( !to_remove_ids.contains( msa.getIdentifier( row ) ) ) {
- seqs.add( BasicSequence.copySequence( msa.getSequence( row ) ) );
+ seqs.add( msa.getSequence( row ) );
}
}
if ( seqs.size() < 1 ) {
final List<Sequence> seqs = new ArrayList<Sequence>();
for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
if ( !to_remove_rows.contains( row ) ) {
- seqs.add( BasicSequence.copySequence( msa.getSequence( row ) ) );
+ seqs.add( msa.getSequence( row ) );
}
}
if ( seqs.size() < 1 ) {
return BasicMsa.createInstance( seqs );
}
+ synchronized final public static void removeGapColumns( final double max_allowed_gap_ratio, final DeleteableMsa msa ) {
+ if ( ( max_allowed_gap_ratio < 0 ) || ( max_allowed_gap_ratio > 1 ) ) {
+ throw new IllegalArgumentException( "max allowed gap ration is out of range: " + max_allowed_gap_ratio );
+ }
+ // final boolean ignore_too_short_seqs = min_allowed_length > 0;
+ for( int col = 0; col < msa.getLength(); ++col ) {
+ final boolean delete = ( ( double ) calcGapSumPerColumn( msa, col ) / msa.getNumberOfSequences() ) >= max_allowed_gap_ratio;
+ if ( delete ) {
+ msa.deleteColumn( col );
+ }
+ }
+ }
+
public static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) {
final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
for( int c = from; c <= to; ++c ) {
import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
import org.forester.evoinference.tools.BootstrapResampler;
import org.forester.msa.BasicMsa;
+import org.forester.msa.DeleteableMsa;
import org.forester.msa.Mafft;
import org.forester.msa.Msa;
import org.forester.msa.Msa.MSA_FORMAT;
final private static NumberFormat NF_4 = new DecimalFormat( "#.####" );
// private final String _maffts_opts = "--retree 1";
private final String _maffts_opts = "--auto";
- private Msa _msa;
+ private DeleteableMsa _msa;
private File _out_file_base;
private String _path_to_mafft;
private final SortedSet<String> _removed_seq_ids;
NF_3.setRoundingMode( RoundingMode.HALF_UP );
}
- private MsaCompactor( final Msa msa ) {
+ private MsaCompactor( final DeleteableMsa msa ) {
_msa = msa;
_removed_seq_ids = new TreeSet<String>();
}
final Phylogeny master_phy = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, _msa, true, matrix );
final int seed = 15;
final int n = 100;
- final ResampleableMsa resampleable_msa = new ResampleableMsa( ( BasicMsa ) _msa );
+ final ResampleableMsa resampleable_msa = new ResampleableMsa( _msa );
final int[][] resampled_column_positions = BootstrapResampler.createResampledColumnPositions( _msa.getLength(),
n,
seed );
final int x = ForesterUtil.roundToInt( s / 20.0 );
while ( _msa.getNumberOfSequences() > x ) {
final String id = to_remove_ids.get( i );
- _msa = MsaMethods.removeSequence( _msa, id );
+ //~_msa = MsaMethods.removeSequence( _msa, id );
+ _msa.deleteRow( id );
if ( ( s < 500 ) || ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) ) {
removeGapColumns();
if ( realign && ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) ) {
//opts.add( "1000" );
//opts.add( "--localpair" );
//opts.add( "--quiet" );
- _msa = mafft.infer( _msa.asSequenceList(), opts );
+ _msa = new DeleteableMsa( ( BasicMsa ) mafft.infer( _msa.asSequenceList(), opts ) );
}
final private void removeGapColumns() {
- _msa = MsaMethods.createInstance().removeGapColumns( 1, 0, _msa );
+ //~ _msa = MsaMethods.createInstance().removeGapColumns( 1, 0, _msa );
+ MsaMethods.removeGapColumns( 1, _msa );
}
final private void removeViaGapAverage( final double mean_gapiness,
int i = 0;
while ( MsaMethods.calcGapRatio( _msa ) > mean_gapiness ) {
final String id = to_remove_ids.get( i );
- _msa = MsaMethods.removeSequence( _msa, id );
+ //`_msa = MsaMethods.removeSequence( _msa, id );
+ _msa.deleteRow( id );
removeGapColumns();
if ( ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) )
|| ( MsaMethods.calcGapRatio( _msa ) <= mean_gapiness ) ) {
int i = 0;
while ( _msa.getLength() > length ) {
final String id = to_remove_ids.get( i );
- _msa = MsaMethods.removeSequence( _msa, id );
+ //~_msa = MsaMethods.removeSequence( _msa, id );
+ _msa.deleteRow( id );
removeGapColumns();
if ( ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) || ( _msa.getLength() <= length ) ) {
printMsaStatsWriteOutfileAndRealign( realign, verbose, id );
}
for( int i = 0; i < to_remove_ids.size(); ++i ) {
final String id = to_remove_ids.get( i );
- _msa = MsaMethods.removeSequence( _msa, id );
+ //~ _msa = MsaMethods.removeSequence( _msa, id );
+ _msa.deleteRow( id );
removeGapColumns();
if ( ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) || ( i == ( to_remove_ids.size() - 1 ) ) ) {
printMsaStatsWriteOutfileAndRealign( realign, verbose, id );
return s;
}
- public final static MsaCompactor chart( final Msa msa,
+ public final static MsaCompactor chart( final DeleteableMsa msa,
final int step,
final boolean realign,
final boolean norm,
return null;
}
- public final static MsaCompactor reduceGapAverage( final Msa msa,
+ public final static MsaCompactor reduceGapAverage( final DeleteableMsa msa,
final double max_gap_average,
final int step,
final boolean realign,
return mc;
}
- public final static MsaCompactor reduceLength( final Msa msa,
+ public final static MsaCompactor reduceLength( final DeleteableMsa msa,
final int length,
final int step,
final boolean realign,
return mc;
}
- public final static MsaCompactor removeWorstOffenders( final Msa msa,
+ public final static MsaCompactor removeWorstOffenders( final DeleteableMsa msa,
final int worst_offenders_to_remove,
final int step,
final boolean realign,