From cb3fc11f9567fa1f17a0ca64eafc31c4a9075210 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Mon, 21 Apr 2014 21:43:37 +0000 Subject: [PATCH] inprogress (not working) --- .../org/forester/application/msa_compactor.java | 9 +- forester/java/src/org/forester/msa/BasicMsa.java | 14 +++ .../java/src/org/forester/msa/DeleteableMsa.java | 118 ++++++++++++++++++++ forester/java/src/org/forester/msa/MsaMethods.java | 19 +++- .../org/forester/msa_compactor/MsaCompactor.java | 32 +++--- 5 files changed, 172 insertions(+), 20 deletions(-) create mode 100644 forester/java/src/org/forester/msa/DeleteableMsa.java diff --git a/forester/java/src/org/forester/application/msa_compactor.java b/forester/java/src/org/forester/application/msa_compactor.java index f9c86a3..4b2aaa2 100644 --- a/forester/java/src/org/forester/application/msa_compactor.java +++ b/forester/java/src/org/forester/application/msa_compactor.java @@ -31,7 +31,8 @@ import java.util.List; import org.forester.io.parsers.FastaParser; import org.forester.io.parsers.GeneralMsaParser; -import org.forester.msa.Msa; +import org.forester.msa.BasicMsa; +import org.forester.msa.DeleteableMsa; import org.forester.msa.MsaInferrer; import org.forester.msa_compactor.MsaCompactor; import org.forester.util.CommandLineArguments; @@ -87,13 +88,13 @@ public class msa_compactor { if ( dissallowed_options.length() > 0 ) { ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options ); } - Msa msa = null; + DeleteableMsa msa = null; final FileInputStream is = new FileInputStream( in ); if ( FastaParser.isLikelyFasta( in ) ) { - msa = FastaParser.parseMsa( is ); + msa = new DeleteableMsa( ( BasicMsa ) FastaParser.parseMsa( is ) ); } else { - msa = GeneralMsaParser.parse( is ); + msa = new DeleteableMsa( ( BasicMsa ) GeneralMsaParser.parse( is ) ); } if ( cla.isOptionSet( REMOVE_WORST_OFFENDERS_OPTION ) ) { worst_remove = cla.getOptionValueAsInt( REMOVE_WORST_OFFENDERS_OPTION ); diff --git a/forester/java/src/org/forester/msa/BasicMsa.java b/forester/java/src/org/forester/msa/BasicMsa.java index bda6424..a7367a2 100644 --- a/forester/java/src/org/forester/msa/BasicMsa.java +++ b/forester/java/src/org/forester/msa/BasicMsa.java @@ -191,6 +191,20 @@ public class BasicMsa implements Msa { final BasicMsa msa = new BasicMsa( seqs.size(), length, seqs.get( 0 ).getType() ); for( int row = 0; row < seqs.size(); ++row ) { final Sequence seq = seqs.get( row ); + // + // int x = length - seq.getLength(); + // if ( x > 0 ) { + // String a = ""; + // for( int i = 0; i < x; i++ ) { + // a += "-"; + // } + // seq = BasicSequence.createAaSequence( seq.getIdentifier(), seq.getMolecularSequenceAsString() + a ); + // } + // else { + // seq = BasicSequence.createAaSequence( seq.getIdentifier(), seq.getMolecularSequenceAsString() + // .substring( 0, length ) ); + // } + // if ( seq.getLength() != length ) { throw new IllegalArgumentException( "illegal attempt to build msa from sequences of unequal length [" + seq.getIdentifier() + "]" ); diff --git a/forester/java/src/org/forester/msa/DeleteableMsa.java b/forester/java/src/org/forester/msa/DeleteableMsa.java new file mode 100644 index 0000000..3041994 --- /dev/null +++ b/forester/java/src/org/forester/msa/DeleteableMsa.java @@ -0,0 +1,118 @@ +// / $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2014 Christian M. Zmasek +// Copyright (C) 2014 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// WWW: https://sites.google.com/site/cmzmasek/home/software/forester + +package org.forester.msa; + +import java.util.HashMap; + +public final class DeleteableMsa extends BasicMsa { + + private int _length = 0; + private int _mapped_col_positions[] = null; + private int _mapped_row_positions[] = null; + private int _seqs = 0; + private HashMap _seq_id_to_row_map = null; + + public DeleteableMsa( final BasicMsa msa ) { + super( msa ); + _mapped_col_positions = new int[ msa.getLength() ]; + _mapped_row_positions = new int[ msa.getNumberOfSequences() ]; + for( int i = 0; i < _mapped_col_positions.length; ++i ) { + _mapped_col_positions[ i ] = i; + } + for( int i = 0; i < _mapped_row_positions.length; ++i ) { + _mapped_row_positions[ i ] = i; + } + _seq_id_to_row_map = new HashMap(); + for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { + _seq_id_to_row_map.put( msa.getIdentifier( row ), row ); + } + _length = msa.getLength(); + _seqs = msa.getNumberOfSequences(); + } + + public void deleteColumn( final int col ) { + if ( col >= _length || col < 0 ) { + throw new IllegalArgumentException( "column " + col + " is out of range" ); + } + for( int c = col; c < _length - 1; ++c ) { + _mapped_col_positions[ c ] = _mapped_col_positions[ c + 1 ]; + } + --_length; + } + + public void deleteRow( final int row ) { + if ( row >= _seqs || row < 0 ) { + throw new IllegalArgumentException( "row " + row + " is out of range" ); + } + for( int r = row; r < _seqs - 1; ++r ) { + _mapped_row_positions[ r ] = _mapped_row_positions[ r + 1 ]; + } + --_seqs; + } + + public void deleteRow( final String id ) { + int row = -1; + for( int r = 0; r < getNumberOfSequences(); ++r ) { + if ( getIdentifier( r ).equals( id ) ) { + row = r; + break; + } + } + if ( row < 0 ) { + throw new IllegalArgumentException( "id [" + id + "] not found" ); + } + deleteRow( row ); + } + + @Override + public String getIdentifier( final int row ) { + return super.getIdentifier( _mapped_row_positions[ row ] ); + } + + @Override + public int getLength() { + return _length; + } + + @Override + public int getNumberOfSequences() { + return _seqs; + } + + @Override + public char getResidueAt( final int row, final int col ) { + return super.getResidueAt( _mapped_row_positions[ row ], _mapped_col_positions[ col ] ); + } + + @Override + public void setIdentifier( final int row, final String id ) { + super.setIdentifier( _mapped_row_positions[ row ], id ); + } + + @Override + public void setResidueAt( final int row, final int col, final char residue ) { + super.setResidueAt( _mapped_row_positions[ row ], _mapped_col_positions[ col ], residue ); + } +} diff --git a/forester/java/src/org/forester/msa/MsaMethods.java b/forester/java/src/org/forester/msa/MsaMethods.java index 45c9697..2280320 100644 --- a/forester/java/src/org/forester/msa/MsaMethods.java +++ b/forester/java/src/org/forester/msa/MsaMethods.java @@ -76,7 +76,7 @@ public final class MsaMethods { final List seqs = new ArrayList(); for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { if ( !to_remove_id.equals( msa.getIdentifier( row ) ) ) { - seqs.add( BasicSequence.copySequence( msa.getSequence( row ) ) ); + seqs.add( msa.getSequence( row ) ); } } if ( seqs.size() < 1 ) { @@ -89,7 +89,7 @@ public final class MsaMethods { final List seqs = new ArrayList(); for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { if ( !to_remove_ids.contains( msa.getIdentifier( row ) ) ) { - seqs.add( BasicSequence.copySequence( msa.getSequence( row ) ) ); + seqs.add( msa.getSequence( row ) ); } } if ( seqs.size() < 1 ) { @@ -102,7 +102,7 @@ public final class MsaMethods { final List seqs = new ArrayList(); for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { if ( !to_remove_rows.contains( row ) ) { - seqs.add( BasicSequence.copySequence( msa.getSequence( row ) ) ); + seqs.add( msa.getSequence( row ) ); } } if ( seqs.size() < 1 ) { @@ -159,6 +159,19 @@ public final class MsaMethods { return BasicMsa.createInstance( seqs ); } + synchronized final public static void removeGapColumns( final double max_allowed_gap_ratio, final DeleteableMsa msa ) { + if ( ( max_allowed_gap_ratio < 0 ) || ( max_allowed_gap_ratio > 1 ) ) { + throw new IllegalArgumentException( "max allowed gap ration is out of range: " + max_allowed_gap_ratio ); + } + // final boolean ignore_too_short_seqs = min_allowed_length > 0; + for( int col = 0; col < msa.getLength(); ++col ) { + final boolean delete = ( ( double ) calcGapSumPerColumn( msa, col ) / msa.getNumberOfSequences() ) >= max_allowed_gap_ratio; + if ( delete ) { + msa.deleteColumn( col ); + } + } + } + public static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) { final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); for( int c = from; c <= to; ++c ) { diff --git a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java index 0ba6b88..689eb1c 100644 --- a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java +++ b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java @@ -42,6 +42,7 @@ import org.forester.evoinference.distance.PairwiseDistanceCalculator.PWD_DISTANC import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix; import org.forester.evoinference.tools.BootstrapResampler; import org.forester.msa.BasicMsa; +import org.forester.msa.DeleteableMsa; import org.forester.msa.Mafft; import org.forester.msa.Msa; import org.forester.msa.Msa.MSA_FORMAT; @@ -60,7 +61,7 @@ public class MsaCompactor { final private static NumberFormat NF_4 = new DecimalFormat( "#.####" ); // private final String _maffts_opts = "--retree 1"; private final String _maffts_opts = "--auto"; - private Msa _msa; + private DeleteableMsa _msa; private File _out_file_base; private String _path_to_mafft; private final SortedSet _removed_seq_ids; @@ -69,7 +70,7 @@ public class MsaCompactor { NF_3.setRoundingMode( RoundingMode.HALF_UP ); } - private MsaCompactor( final Msa msa ) { + private MsaCompactor( final DeleteableMsa msa ) { _msa = msa; _removed_seq_ids = new TreeSet(); } @@ -108,7 +109,7 @@ public class MsaCompactor { final Phylogeny master_phy = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, _msa, true, matrix ); final int seed = 15; final int n = 100; - final ResampleableMsa resampleable_msa = new ResampleableMsa( ( BasicMsa ) _msa ); + final ResampleableMsa resampleable_msa = new ResampleableMsa( _msa ); final int[][] resampled_column_positions = BootstrapResampler.createResampledColumnPositions( _msa.getLength(), n, seed ); @@ -176,7 +177,8 @@ public class MsaCompactor { final int x = ForesterUtil.roundToInt( s / 20.0 ); while ( _msa.getNumberOfSequences() > x ) { final String id = to_remove_ids.get( i ); - _msa = MsaMethods.removeSequence( _msa, id ); + //~_msa = MsaMethods.removeSequence( _msa, id ); + _msa.deleteRow( id ); if ( ( s < 500 ) || ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) ) { removeGapColumns(); if ( realign && ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) ) { @@ -282,11 +284,12 @@ public class MsaCompactor { //opts.add( "1000" ); //opts.add( "--localpair" ); //opts.add( "--quiet" ); - _msa = mafft.infer( _msa.asSequenceList(), opts ); + _msa = new DeleteableMsa( ( BasicMsa ) mafft.infer( _msa.asSequenceList(), opts ) ); } final private void removeGapColumns() { - _msa = MsaMethods.createInstance().removeGapColumns( 1, 0, _msa ); + //~ _msa = MsaMethods.createInstance().removeGapColumns( 1, 0, _msa ); + MsaMethods.removeGapColumns( 1, _msa ); } final private void removeViaGapAverage( final double mean_gapiness, @@ -305,7 +308,8 @@ public class MsaCompactor { int i = 0; while ( MsaMethods.calcGapRatio( _msa ) > mean_gapiness ) { final String id = to_remove_ids.get( i ); - _msa = MsaMethods.removeSequence( _msa, id ); + //`_msa = MsaMethods.removeSequence( _msa, id ); + _msa.deleteRow( id ); removeGapColumns(); if ( ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) || ( MsaMethods.calcGapRatio( _msa ) <= mean_gapiness ) ) { @@ -337,7 +341,8 @@ public class MsaCompactor { int i = 0; while ( _msa.getLength() > length ) { final String id = to_remove_ids.get( i ); - _msa = MsaMethods.removeSequence( _msa, id ); + //~_msa = MsaMethods.removeSequence( _msa, id ); + _msa.deleteRow( id ); removeGapColumns(); if ( ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) || ( _msa.getLength() <= length ) ) { printMsaStatsWriteOutfileAndRealign( realign, verbose, id ); @@ -368,7 +373,8 @@ public class MsaCompactor { } for( int i = 0; i < to_remove_ids.size(); ++i ) { final String id = to_remove_ids.get( i ); - _msa = MsaMethods.removeSequence( _msa, id ); + //~ _msa = MsaMethods.removeSequence( _msa, id ); + _msa.deleteRow( id ); removeGapColumns(); if ( ( ( step > 0 ) && ( ( ( i + 1 ) % step ) == 0 ) ) || ( i == ( to_remove_ids.size() - 1 ) ) ) { printMsaStatsWriteOutfileAndRealign( realign, verbose, id ); @@ -398,7 +404,7 @@ public class MsaCompactor { return s; } - public final static MsaCompactor chart( final Msa msa, + public final static MsaCompactor chart( final DeleteableMsa msa, final int step, final boolean realign, final boolean norm, @@ -445,7 +451,7 @@ public class MsaCompactor { return null; } - public final static MsaCompactor reduceGapAverage( final Msa msa, + public final static MsaCompactor reduceGapAverage( final DeleteableMsa msa, final double max_gap_average, final int step, final boolean realign, @@ -461,7 +467,7 @@ public class MsaCompactor { return mc; } - public final static MsaCompactor reduceLength( final Msa msa, + public final static MsaCompactor reduceLength( final DeleteableMsa msa, final int length, final int step, final boolean realign, @@ -477,7 +483,7 @@ public class MsaCompactor { return mc; } - public final static MsaCompactor removeWorstOffenders( final Msa msa, + public final static MsaCompactor removeWorstOffenders( final DeleteableMsa msa, final int worst_offenders_to_remove, final int step, final boolean realign, -- 1.7.10.2