From cea9261d689486e31e8abd13a25ffbff3c826e2e Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Thu, 8 May 2014 02:47:02 +0000 Subject: [PATCH] inprogress --- .../org/forester/application/msa_compactor.java | 22 +- forester/java/src/org/forester/msa/MsaMethods.java | 323 +++++++++++++++----- .../java/src/org/forester/msa_compactor/Chart.java | 73 +++-- .../org/forester/msa_compactor/MsaCompactor.java | 86 +++--- .../org/forester/msa_compactor/MsaProperties.java | 27 +- forester/java/src/org/forester/test/Test.java | 73 +++++ 6 files changed, 434 insertions(+), 170 deletions(-) diff --git a/forester/java/src/org/forester/application/msa_compactor.java b/forester/java/src/org/forester/application/msa_compactor.java index 316daa4..2ef806f 100644 --- a/forester/java/src/org/forester/application/msa_compactor.java +++ b/forester/java/src/org/forester/application/msa_compactor.java @@ -63,7 +63,7 @@ public class msa_compactor { final static private String STEP_FOR_DIAGNOSTICS_OPTION = "sd"; final static private String MIN_LENGTH_OPTION = "ml"; final static private String GAP_RATIO_LENGTH_OPTION = "gr"; - final static private String REPORT_ALN_MEAN_IDENTITY = "q"; + final static private String REPORT_ENTROPY = "e"; final static private String OUTPUT_FORMAT_PHYLIP_OPTION = "p"; final static private String OUTPUT_REMOVED_SEQS_OPTION = "ro"; final static private String MAFFT_OPTIONS = "mo"; @@ -101,7 +101,7 @@ public class msa_compactor { int step_for_diagnostics = 1; int min_length = -1; double gap_ratio = -1; - boolean report_aln_mean_identity = false; + boolean report_entropy = false; MSA_FORMAT output_format = MSA_FORMAT.FASTA; File removed_seqs_out_base = null; String mafft_options = "--auto"; @@ -117,7 +117,7 @@ public class msa_compactor { allowed_options.add( STEP_FOR_DIAGNOSTICS_OPTION ); allowed_options.add( MIN_LENGTH_OPTION ); allowed_options.add( GAP_RATIO_LENGTH_OPTION ); - allowed_options.add( REPORT_ALN_MEAN_IDENTITY ); + allowed_options.add( REPORT_ENTROPY ); allowed_options.add( OUTPUT_FORMAT_PHYLIP_OPTION ); allowed_options.add( OUTPUT_REMOVED_SEQS_OPTION ); allowed_options.add( MAFFT_OPTIONS ); @@ -215,8 +215,8 @@ public class msa_compactor { ForesterUtil.fatalError( PRG_NAME, "gap ratio is out of range: " + gap_ratio ); } } - if ( cla.isOptionSet( REPORT_ALN_MEAN_IDENTITY ) ) { - report_aln_mean_identity = true; + if ( cla.isOptionSet( REPORT_ENTROPY ) ) { + report_entropy = true; } if ( cla.isOptionSet( OUTPUT_FORMAT_PHYLIP_OPTION ) ) { output_format = MSA_FORMAT.PHYLIP; @@ -316,7 +316,7 @@ public class msa_compactor { } } System.out.println( "Step for diagnostics reports : " + step_for_diagnostics ); - System.out.println( "Calculate mean identity : " + report_aln_mean_identity ); + System.out.println( "Calculate normalized Shannon Entropy : " + report_entropy ); if ( !norm ) { System.out.println( "Normalize : " + norm ); } @@ -338,7 +338,7 @@ public class msa_compactor { } mc.setStep( step ); mc.setStepForDiagnostics( step_for_diagnostics ); - mc.setReportAlnMeanIdentity( report_aln_mean_identity ); + mc.setCalculateNormalizedShannonEntropy( report_entropy ); mc.setPeformPhylogenticInference( perform_phylogenetic_inference ); if ( ( worst_remove > 0 ) || ( av_gap > 0 ) || ( length > 0 ) ) { mc.setOutputFormat( output_format ); @@ -363,7 +363,7 @@ public class msa_compactor { else { msa_props = mc.chart( step, realign, norm ); } - Chart.display( msa_props, initial_number_of_seqs, report_aln_mean_identity, in.getName() ); + Chart.display( msa_props, initial_number_of_seqs, report_entropy, in.getName() ); } catch ( final IllegalArgumentException iae ) { // iae.printStackTrace(); //TODO remove me @@ -423,10 +423,8 @@ public class msa_compactor { System.out.println( " -" + STEP_OPTION + "= step for output and re-aligning (default: 1)" ); System.out.println( " -" + STEP_FOR_DIAGNOSTICS_OPTION + "= step for diagnostics reports (default: 1)" ); - System.out - .println( " -" - + REPORT_ALN_MEAN_IDENTITY - + " to calculate mean MSA column identity (\"MSA quality\") (not recommended for very large alignments)" ); + System.out.println( " -" + REPORT_ENTROPY + + " to calculate normalized Shannon Entropy (not recommended for very large alignments)" ); System.out.println( " -" + OUTPUT_FORMAT_PHYLIP_OPTION + " to write output alignments in phylip format instead of fasta" ); System.out.println( " -" + OUTPUT_REMOVED_SEQS_OPTION + "= to output the removed sequences" ); diff --git a/forester/java/src/org/forester/msa/MsaMethods.java b/forester/java/src/org/forester/msa/MsaMethods.java index 1a94b74..c94869e 100644 --- a/forester/java/src/org/forester/msa/MsaMethods.java +++ b/forester/java/src/org/forester/msa/MsaMethods.java @@ -26,6 +26,7 @@ package org.forester.msa; import java.util.ArrayList; +import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -41,76 +42,15 @@ public final class MsaMethods { private ArrayList _ignored_seqs_ids; - synchronized public ArrayList getIgnoredSequenceIds() { - return _ignored_seqs_ids; - } - - synchronized public static MsaMethods createInstance() { - return new MsaMethods(); - } - private MsaMethods() { init(); } - synchronized private void init() { - _ignored_seqs_ids = new ArrayList(); - } - @Override public Object clone() { throw new NoSuchMethodError(); } - public static int calcGapSumPerColumn( final Msa msa, final int col ) { - int gap_rows = 0; - for( int j = 0; j < msa.getNumberOfSequences(); ++j ) { - if ( msa.isGapAt( j, col ) ) { - gap_rows++; - } - } - return gap_rows; - } - - final public static Msa removeSequence( final Msa msa, final String to_remove_id ) { - final List seqs = new ArrayList(); - for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { - if ( !to_remove_id.equals( msa.getIdentifier( row ) ) ) { - seqs.add( msa.getSequence( row ) ); - } - } - if ( seqs.size() < 1 ) { - return null; - } - return BasicMsa.createInstance( seqs ); - } - - final public static Msa removeSequences( final Msa msa, final List to_remove_ids ) { - final List seqs = new ArrayList(); - for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { - if ( !to_remove_ids.contains( msa.getIdentifier( row ) ) ) { - seqs.add( msa.getSequence( row ) ); - } - } - if ( seqs.size() < 1 ) { - return null; - } - return BasicMsa.createInstance( seqs ); - } - - final public static Msa removeSequencesByRow( final Msa msa, final List to_remove_rows ) { - final List seqs = new ArrayList(); - for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { - if ( !to_remove_rows.contains( row ) ) { - seqs.add( msa.getSequence( row ) ); - } - } - if ( seqs.size() < 1 ) { - return null; - } - return BasicMsa.createInstance( seqs ); - } - synchronized final public Msa deleteGapColumns( final double max_allowed_gap_ratio, final int min_allowed_length, final Msa msa ) { @@ -159,14 +99,95 @@ public final class MsaMethods { return BasicMsa.createInstance( seqs ); } - final public static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) { + synchronized public ArrayList getIgnoredSequenceIds() { + return _ignored_seqs_ids; + } + + synchronized private void init() { + _ignored_seqs_ids = new ArrayList(); + } + + public static DescriptiveStatistics calcBasicGapinessStatistics( final Msa msa ) { final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); - for( int c = from; c <= to; ++c ) { - stats.addValue( calculateIdentityRatio( msa, c ) ); + for( int i = 0; i < msa.getLength(); ++i ) { + stats.addValue( ( double ) calcGapSumPerColumn( msa, i ) / msa.getNumberOfSequences() ); } return stats; } + public static double calcGapRatio( final Msa msa ) { + int gaps = 0; + for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) { + for( int i = 0; i < msa.getLength(); ++i ) { + if ( msa.getResidueAt( seq, i ) == Sequence.GAP ) { + gaps++; + } + } + } + return ( double ) gaps / ( msa.getLength() * msa.getNumberOfSequences() ); + } + + public static int calcGapSumPerColumn( final Msa msa, final int col ) { + int gap_rows = 0; + for( int j = 0; j < msa.getNumberOfSequences(); ++j ) { + if ( msa.isGapAt( j, col ) ) { + gap_rows++; + } + } + return gap_rows; + } + + final public static double calcNormalizedShannonsEntropy( final int k, final Msa msa ) { + double s = 0; + for( int col = 0; col < msa.getLength(); ++col ) { + s += calcNormalizedShannonsEntropy( k, msa, col ); + } + return s / msa.getLength(); + } + + final public static double calcNormalizedShannonsEntropy( final int k, final Msa msa, final int col ) { + // http://www.ebi.ac.uk/thornton-srv/databases/valdarprograms/scorecons_server_help.html + // n: number of residues in column + // k: number of residue types + // na: number of residues of type a + // pa = na/n + // S=-sum pa log2 pa + double s = 0; + final double n = msa.getNumberOfSequences(); + HashMap dist = null; + if ( k == 6 ) { + dist = calcResidueDistribution6( msa, col ); + } + else if ( k == 7 ) { + dist = calcResidueDistribution7( msa, col ); + } + else if ( k == 20 ) { + dist = calcResidueDistribution20( msa, col ); + } + else if ( k == 21 ) { + dist = calcResidueDistribution21( msa, col ); + } + else { + throw new IllegalArgumentException( "illegal value for k: " + k ); + } + if ( dist.size() == 1 ) { + return 0; + } + // if ( dist.size() == n ) { + // return 0; + // } + for( final int na : dist.values() ) { + final double pa = na / n; + s += pa * Math.log( pa ); + } + if ( n < k ) { + return -( s / ( Math.log( n ) ) ); + } + else { + return -( s / ( Math.log( k ) ) ); + } + } + final public static DescriptiveStatistics calculateEffectiveLengthStatistics( final Msa msa ) { final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { @@ -176,6 +197,14 @@ public final class MsaMethods { return stats; } + final public static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( int c = from; c <= to; ++c ) { + stats.addValue( calculateIdentityRatio( msa, c ) ); + } + return stats; + } + final public static double calculateIdentityRatio( final Msa msa, final int column ) { final SortedMap dist = calculateResidueDestributionPerColumn( msa, column ); int majority_count = 0; @@ -204,12 +233,34 @@ public final class MsaMethods { return map; } - public static DescriptiveStatistics calcBasicGapinessStatistics( final Msa msa ) { - final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); - for( int i = 0; i < msa.getLength(); ++i ) { - stats.addValue( ( double ) calcGapSumPerColumn( msa, i ) / msa.getNumberOfSequences() ); + synchronized public static MsaMethods createInstance() { + return new MsaMethods(); + } + + final public static Msa removeSequence( final Msa msa, final String to_remove_id ) { + final List seqs = new ArrayList(); + for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { + if ( !to_remove_id.equals( msa.getIdentifier( row ) ) ) { + seqs.add( msa.getSequence( row ) ); + } } - return stats; + if ( seqs.size() < 1 ) { + return null; + } + return BasicMsa.createInstance( seqs ); + } + + final public static Msa removeSequences( final Msa msa, final List to_remove_ids ) { + final List seqs = new ArrayList(); + for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { + if ( !to_remove_ids.contains( msa.getIdentifier( row ) ) ) { + seqs.add( msa.getSequence( row ) ); + } + } + if ( seqs.size() < 1 ) { + return null; + } + return BasicMsa.createInstance( seqs ); } public static Msa removeSequencesByMinimalLength( final Msa msa, final int min_effective_length ) { @@ -228,15 +279,135 @@ public final class MsaMethods { return removeSequencesByRow( msa, to_remove_rows ); } - public static double calcGapRatio( final Msa msa ) { - int gaps = 0; - for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) { - for( int i = 0; i < msa.getLength(); ++i ) { - if ( msa.getResidueAt( seq, i ) == Sequence.GAP ) { - gaps++; + final public static Msa removeSequencesByRow( final Msa msa, final List to_remove_rows ) { + final List seqs = new ArrayList(); + for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { + if ( !to_remove_rows.contains( row ) ) { + seqs.add( msa.getSequence( row ) ); + } + } + if ( seqs.size() < 1 ) { + return null; + } + return BasicMsa.createInstance( seqs ); + } + + final private static HashMap calcResidueDistribution20( final Msa msa, final int col ) { + final HashMap counts = new HashMap(); + for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { + final char c = msa.getResidueAt( row, col ); + if ( c != Sequence.GAP ) { + if ( !counts.containsKey( c ) ) { + counts.put( c, 1 ); + } + else { + counts.put( c, 1 + counts.get( c ) ); } } } - return ( double ) gaps / ( msa.getLength() * msa.getNumberOfSequences() ); + return counts; + } + + final private static HashMap calcResidueDistribution21( final Msa msa, final int col ) { + final HashMap counts = new HashMap(); + for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { + final char c = msa.getResidueAt( row, col ); + if ( !counts.containsKey( c ) ) { + counts.put( c, 1 ); + } + else { + counts.put( c, 1 + counts.get( c ) ); + } + } + return counts; + } + + final private static HashMap calcResidueDistribution6( final Msa msa, final int col ) { + // Residues are classified into one of tex2html_wrap199 types: + // aliphatic [AVLIMC], aromatic [FWYH], polar [STNQ], positive [KR], negative [DE], + // special conformations [GP] and gaps. This convention follows that + // of Mirny & Shakhnovich (1999, J Mol Biol 291:177-196). + final HashMap counts = new HashMap(); + for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { + final char c = msa.getResidueAt( row, col ); + char x; + if ( ( c == 'A' ) || ( c == 'V' ) || ( c == 'L' ) || ( c == 'I' ) || ( c == 'M' ) || ( c == 'C' ) ) { + // aliphatic + x = 'a'; + } + else if ( ( c == 'F' ) || ( c == 'W' ) || ( c == 'Y' ) || ( c == 'H' ) ) { + // aromatic + x = 'r'; + } + else if ( ( c == 'S' ) || ( c == 'T' ) || ( c == 'N' ) || ( c == 'Q' ) ) { + // polar + x = 'p'; + } + else if ( ( c == 'K' ) || ( c == 'R' ) ) { + // positive + x = 'o'; + } + else if ( ( c == 'D' ) || ( c == 'E' ) ) { + // negative + x = 'e'; + } + else if ( ( c == 'G' ) || ( c == 'P' ) ) { + // aliphatic - special conformation + x = 's'; + } + else { + continue; + } + if ( !counts.containsKey( x ) ) { + counts.put( x, 1 ); + } + else { + counts.put( x, 1 + counts.get( x ) ); + } + } + return counts; + } + + final private static HashMap calcResidueDistribution7( final Msa msa, final int col ) { + // Residues are classified into one of tex2html_wrap199 types: + // aliphatic [AVLIMC], aromatic [FWYH], polar [STNQ], positive [KR], negative [DE], + // special conformations [GP] and gaps. This convention follows that + // of Mirny & Shakhnovich (1999, J Mol Biol 291:177-196). + final HashMap counts = new HashMap(); + for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { + final char c = msa.getResidueAt( row, col ); + char x = '-'; + if ( ( c == 'A' ) || ( c == 'V' ) || ( c == 'L' ) || ( c == 'I' ) || ( c == 'M' ) || ( c == 'C' ) ) { + // aliphatic + x = 'a'; + } + else if ( ( c == 'F' ) || ( c == 'W' ) || ( c == 'Y' ) || ( c == 'H' ) ) { + // aromatic + x = 'r'; + } + else if ( ( c == 'S' ) || ( c == 'T' ) || ( c == 'N' ) || ( c == 'Q' ) ) { + // polar + x = 'p'; + } + else if ( ( c == 'K' ) || ( c == 'R' ) ) { + // positive + x = 'o'; + } + else if ( ( c == 'D' ) || ( c == 'E' ) ) { + // negative + x = 'e'; + } + else if ( ( c == 'G' ) || ( c == 'P' ) ) { + // aliphatic - special conformation + x = 's'; + } + if ( !counts.containsKey( x ) ) { + counts.put( x, 1 ); + } + else { + counts.put( x, 1 + counts.get( x ) ); + } + } + return counts; } } diff --git a/forester/java/src/org/forester/msa_compactor/Chart.java b/forester/java/src/org/forester/msa_compactor/Chart.java index 2f0607c..38da2b4 100644 --- a/forester/java/src/org/forester/msa_compactor/Chart.java +++ b/forester/java/src/org/forester/msa_compactor/Chart.java @@ -26,7 +26,6 @@ package org.forester.msa_compactor; import java.awt.BorderLayout; import java.awt.event.ActionListener; -import java.util.ArrayList; import java.util.List; import javax.swing.JDialog; @@ -46,13 +45,13 @@ import com.approximatrix.charting.swing.ChartPanel; public final class Chart extends JDialog implements ActionListener { - private static final long serialVersionUID = -5292420246132943515L; - private ChartPanel _chart_panel = null; - private final JMenuItem _m_exit = new JMenuItem(); - private List _msa_props; - private final boolean _show_msa_qual; - private final int _initial_number_of_seqs; - private final String _title; + private static final long serialVersionUID = -5292420246132943515L; + private ChartPanel _chart_panel = null; + private final int _initial_number_of_seqs; + private final JMenuItem _m_exit = new JMenuItem(); + private final List _msa_props; + private final boolean _show_msa_qual; + private final String _title; private Chart( final List msa_props, final int initial_number_of_seqs, @@ -91,43 +90,63 @@ public final class Chart extends JDialog implements ActionListener { private ChartPanel obtainChartPanel() { if ( _chart_panel == null ) { final MultiScatterDataModel model = new MultiScatterDataModel(); - if ( ( _msa_props == null ) || _msa_props.isEmpty() ) { - _msa_props = new ArrayList(); - final MsaProperties p0 = new MsaProperties( 10, 200, 0.5, 0.1, "" ); - final MsaProperties p1 = new MsaProperties( 9, 190, 0.49, 0.2, "" ); - final MsaProperties p2 = new MsaProperties( 8, 150, 0.2, 0.3, "" ); - final MsaProperties p3 = new MsaProperties( 7, 145, 0.2, 0.4, "" ); - _msa_props.add( p0 ); - _msa_props.add( p1 ); - _msa_props.add( p2 ); - _msa_props.add( p3 ); - } final double[][] seqs_length = new double[ _msa_props.size() ][ 2 ]; + int max_length = -1; for( int i = 0; i < _msa_props.size(); ++i ) { seqs_length[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences(); seqs_length[ i ][ 1 ] = _msa_props.get( i ).getLength(); + if ( _msa_props.get( i ).getLength() > max_length ) { + max_length = _msa_props.get( i ).getLength(); + } } model.addData( seqs_length, "Length" ); model.setSeriesLine( "Series " + "Length", true ); model.setSeriesMarker( "Series " + "Length", false ); final double[][] seqs_gaps = new double[ _msa_props.size() ][ 2 ]; + double max_gap_ratio = -1; + double max_ent7 = -1; + double max_ent21 = -1; + for( int i = 0; i < _msa_props.size(); ++i ) { + if ( _msa_props.get( i ).getGapRatio() > max_gap_ratio ) { + max_gap_ratio = _msa_props.get( i ).getGapRatio(); + } + if ( _show_msa_qual ) { + if ( _msa_props.get( i ).getEntropy7() > max_ent7 ) { + max_ent7 = _msa_props.get( i ).getEntropy7(); + } + if ( _msa_props.get( i ).getEntropy21() > max_ent21 ) { + max_ent21 = _msa_props.get( i ).getEntropy21(); + } + } + } + final double gap_ratio_factor = ( max_length / 2.0 ) / max_gap_ratio; + final double ent7_factor = ( max_length / 2.0 ) / max_ent7; + final double ent21_factor = ( max_length / 2.0 ) / max_ent21; for( int i = 0; i < _msa_props.size(); ++i ) { seqs_gaps[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences(); - seqs_gaps[ i ][ 1 ] = ForesterUtil.roundToInt( _msa_props.get( i ).getGapRatio() * 200 ); + seqs_gaps[ i ][ 1 ] = ForesterUtil.roundToInt( _msa_props.get( i ).getGapRatio() * gap_ratio_factor ); } model.addData( seqs_gaps, "Gap ratio" ); model.setSeriesLine( "Series " + "Gap ratio", true ); model.setSeriesMarker( "Series " + "Gap ratio", false ); if ( _show_msa_qual ) { - final double[][] seqs_identity = new double[ _msa_props.size() ][ 2 ]; + final double[][] entropy7 = new double[ _msa_props.size() ][ 2 ]; + for( int i = 0; i < _msa_props.size(); ++i ) { + entropy7[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences(); + entropy7[ i ][ 1 ] = ForesterUtil.roundToInt( _msa_props.get( i ).getEntropy7() * ent7_factor ); + } + model.addData( entropy7, "Entropy norm 7" ); + model.setSeriesLine( "Series " + "Entropy norm 7", true ); + model.setSeriesMarker( "Series " + "Entropy norm 7", false ); + // + final double[][] entropy21 = new double[ _msa_props.size() ][ 2 ]; for( int i = 0; i < _msa_props.size(); ++i ) { - seqs_identity[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences(); - seqs_identity[ i ][ 1 ] = ForesterUtil - .roundToInt( _msa_props.get( i ).getAverageIdentityRatio() * 200 ); + entropy21[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences(); + entropy21[ i ][ 1 ] = ForesterUtil.roundToInt( _msa_props.get( i ).getEntropy21() * ent21_factor ); } - model.addData( seqs_identity, "mean MSA column identity" ); - model.setSeriesLine( "Series " + "mean MSA column identity", true ); - model.setSeriesMarker( "Series " + "mean MSA column identity", false ); + model.addData( entropy21, "Entropy norm 21" ); + model.setSeriesLine( "Series " + "Entropy norm 21", true ); + model.setSeriesMarker( "Series " + "Entropy norm 21", false ); } final BoxCoordSystem coord = new BoxCoordSystem( model ); coord.setUnitFont( coord.getUnitFont().deriveFont( 16.0f ) ); diff --git a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java index 4411fd2..a1c61c1 100644 --- a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java +++ b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java @@ -71,29 +71,27 @@ import org.forester.util.ForesterUtil; public class MsaCompactor { - final private static NumberFormat NF_3 = new DecimalFormat( "#.###" ); - final private static NumberFormat NF_4 = new DecimalFormat( "#.####" ); - private double _gap_ratio = -1; + final private static NumberFormat NF_3 = new DecimalFormat( "#.###" ); + final private static NumberFormat NF_4 = new DecimalFormat( "#.####" ); + private boolean _calculate_shannon_entropy = false; // - private String _infile_name = null; + private String _infile_name = null; private final short _longest_id_length; // - private String _maffts_opts = "--auto"; - private int _min_length = -1; - private DeleteableMsa _msa = null; - private boolean _norm = true; - private File _out_file_base = null; - private MSA_FORMAT _output_format = MSA_FORMAT.FASTA; - private String _path_to_mafft = null; - private boolean _phylogentic_inference = false; + private String _maffts_opts = "--auto"; + private DeleteableMsa _msa = null; + private boolean _norm = true; + private File _out_file_base = null; + private MSA_FORMAT _output_format = MSA_FORMAT.FASTA; + private String _path_to_mafft = null; + private boolean _phylogentic_inference = false; // - private boolean _realign = false; + private boolean _realign = false; private final SortedSet _removed_seq_ids; private final ArrayList _removed_seqs; - private File _removed_seqs_out_base = null; - private boolean _report_aln_mean_identity = false; - private int _step = -1; - private int _step_for_diagnostics = -1; + private File _removed_seqs_out_base = null; + private int _step = -1; + private int _step_for_diagnostics = -1; static { NF_4.setRoundingMode( RoundingMode.HALF_UP ); NF_3.setRoundingMode( RoundingMode.HALF_UP ); @@ -146,11 +144,11 @@ public class MsaCompactor { if ( !_realign ) { _step = -1; } - int x = ForesterUtil.roundToInt( _msa.getNumberOfSequences() / 20.0 ); - if ( x < 1 ) { - x = 1; + int x = ForesterUtil.roundToInt( _msa.getNumberOfSequences() / 10.0 ); + if ( x < 2 ) { + x = 2; } - MsaProperties msa_prop = new MsaProperties( _msa, "", _report_aln_mean_identity ); + MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy ); msa_props.add( msa_prop ); printTableHeader(); printMsaProperties( msa_prop ); @@ -162,7 +160,7 @@ public class MsaCompactor { if ( realign && isPrintMsaStatsWriteOutfileAndRealign( i ) ) { removeGapColumns(); realignWithMafft(); - msa_prop = new MsaProperties( _msa, id, _report_aln_mean_identity ); + msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy ); msa_props.add( msa_prop ); printMsaProperties( msa_prop ); System.out.print( "(realigned)" ); @@ -170,7 +168,7 @@ public class MsaCompactor { } else if ( isPrintMsaStats( i ) ) { removeGapColumns(); - msa_prop = new MsaProperties( _msa, id, _report_aln_mean_identity ); + msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy ); msa_props.add( msa_prop ); printMsaProperties( msa_prop ); System.out.println(); @@ -263,11 +261,11 @@ public class MsaCompactor { } public final void removeSequencesByMinimalLength( final int min_effective_length ) { - printMsaProperties( new MsaProperties( _msa, "", _report_aln_mean_identity ) ); + printMsaProperties( new MsaProperties( _msa, "", _calculate_shannon_entropy ) ); System.out.println(); _msa = DeleteableMsa.createInstance( MsaMethods.removeSequencesByMinimalLength( _msa, min_effective_length ) ); removeGapColumns(); - printMsaProperties( new MsaProperties( _msa, "", _report_aln_mean_identity ) ); + printMsaProperties( new MsaProperties( _msa, "", _calculate_shannon_entropy ) ); System.out.println(); } @@ -286,7 +284,7 @@ public class MsaCompactor { phy = calcTree(); } printTableHeader(); - MsaProperties msa_prop = new MsaProperties( _msa, "", _report_aln_mean_identity ); + MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy ); msa_props.add( msa_prop ); printMsaProperties( msa_prop ); System.out.println(); @@ -303,7 +301,7 @@ public class MsaCompactor { System.out.println(); } else if ( isPrintMsaStats( i ) ) { - msa_prop = new MsaProperties( _msa, id, _report_aln_mean_identity ); + msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy ); msa_props.add( msa_prop ); printMsaProperties( msa_prop ); System.out.println(); @@ -336,7 +334,7 @@ public class MsaCompactor { phy = calcTree(); } printTableHeader(); - MsaProperties msa_prop = new MsaProperties( _msa, "", _report_aln_mean_identity ); + MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy ); msa_props.add( msa_prop ); printMsaProperties( msa_prop ); System.out.println(); @@ -353,7 +351,7 @@ public class MsaCompactor { System.out.println(); } else if ( isPrintMsaStats( i ) ) { - msa_prop = new MsaProperties( _msa, id, _report_aln_mean_identity ); + msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy ); printMsaProperties( msa_prop ); msa_props.add( msa_prop ); System.out.println(); @@ -387,7 +385,7 @@ public class MsaCompactor { phy = calcTree(); } printTableHeader(); - MsaProperties msa_prop = new MsaProperties( _msa, "", _report_aln_mean_identity ); + MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy ); msa_props.add( msa_prop ); printMsaProperties( msa_prop ); System.out.println(); @@ -403,7 +401,7 @@ public class MsaCompactor { System.out.println(); } else if ( isPrintMsaStats( i ) ) { - msa_prop = new MsaProperties( _msa, id, _report_aln_mean_identity ); + msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy ); msa_props.add( msa_prop ); printMsaProperties( msa_prop ); System.out.println(); @@ -421,8 +419,8 @@ public class MsaCompactor { return msa_props; } - public final void setGapRatio( final double gap_ratio ) { - _gap_ratio = gap_ratio; + public final void setCalculateNormalizedShannonEntropy( final boolean calculate_shannon_entropy ) { + _calculate_shannon_entropy = calculate_shannon_entropy; } public void setInfileName( final String infile_name ) { @@ -433,10 +431,6 @@ public class MsaCompactor { _maffts_opts = maffts_opts; } - public final void setMinLength( final int min_length ) { - _min_length = min_length; - } - public final void setNorm( final boolean norm ) { _norm = norm; } @@ -465,10 +459,6 @@ public class MsaCompactor { _removed_seqs_out_base = removed_seqs_out_base; } - public final void setReportAlnMeanIdentity( final boolean report_aln_mean_identity ) { - _report_aln_mean_identity = report_aln_mean_identity; - } - public final void setStep( final int step ) { _step = step; } @@ -600,9 +590,11 @@ public class MsaCompactor { sb.append( msa_properties.getLength() ); sb.append( "\t" ); sb.append( NF_4.format( msa_properties.getGapRatio() ) ); - if ( _report_aln_mean_identity ) { + if ( _calculate_shannon_entropy ) { + sb.append( "\t" ); + sb.append( NF_4.format( msa_properties.getEntropy7() ) ); sb.append( "\t" ); - sb.append( NF_4.format( msa_properties.getAverageIdentityRatio() ) ); + sb.append( NF_4.format( msa_properties.getEntropy21() ) ); } return sb; } @@ -649,7 +641,7 @@ public class MsaCompactor { if ( realign ) { realignWithMafft(); } - final MsaProperties msa_prop = new MsaProperties( _msa, id, _report_aln_mean_identity ); + final MsaProperties msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy ); printMsaProperties( msa_prop ); final String s = writeOutfile(); System.out.print( "-> " + s + ( realign ? "\t(realigned)" : "" ) ); @@ -667,8 +659,10 @@ public class MsaCompactor { System.out.print( "\t" ); System.out.print( "Gaps" ); System.out.print( "\t" ); - if ( _report_aln_mean_identity ) { - System.out.print( "MSA qual" ); + if ( _calculate_shannon_entropy ) { + System.out.print( "entn7" ); + System.out.print( "\t" ); + System.out.print( "entn21" ); System.out.print( "\t" ); } System.out.println(); diff --git a/forester/java/src/org/forester/msa_compactor/MsaProperties.java b/forester/java/src/org/forester/msa_compactor/MsaProperties.java index f24f60f..ae575b2 100644 --- a/forester/java/src/org/forester/msa_compactor/MsaProperties.java +++ b/forester/java/src/org/forester/msa_compactor/MsaProperties.java @@ -29,7 +29,8 @@ import org.forester.msa.MsaMethods; public final class MsaProperties { - final private double _average_identity_ratio; + final private double _entropy21; + final private double _entropy7; final private double _gap_ratio; final private int _length; final private int _number_of_sequences; @@ -38,30 +39,38 @@ public final class MsaProperties { public MsaProperties( final int number_of_sequences, final int length, final double gap_ratio, - final double average_identity_ratio, + final double entropy7, + final double entropy21, final String removed_seq ) { _number_of_sequences = number_of_sequences; _length = length; _gap_ratio = gap_ratio; - _average_identity_ratio = average_identity_ratio; + _entropy7 = entropy7; + _entropy21 = entropy21; _removed_seq = removed_seq; } - public MsaProperties( final Msa msa, final String removed_seq, final boolean calculate_aln_mean_identity ) { + public MsaProperties( final Msa msa, final String removed_seq, final boolean calculate_normalized_shannon_entropy ) { _number_of_sequences = msa.getNumberOfSequences(); _length = msa.getLength(); _gap_ratio = MsaMethods.calcGapRatio( msa ); _removed_seq = removed_seq; - if ( calculate_aln_mean_identity ) { - _average_identity_ratio = MsaMethods.calculateIdentityRatio( 0, msa.getLength() - 1, msa ).arithmeticMean(); + if ( calculate_normalized_shannon_entropy ) { + _entropy7 = MsaMethods.calcNormalizedShannonsEntropy( 7, msa ); + _entropy21 = MsaMethods.calcNormalizedShannonsEntropy( 21, msa ); } else { - _average_identity_ratio = -1; + _entropy7 = -1; + _entropy21 = -1; } } - public final double getAverageIdentityRatio() { - return _average_identity_ratio; + public final double getEntropy21() { + return _entropy21; + } + + public final double getEntropy7() { + return _entropy7; } public final double getGapRatio() { diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index df8fce2..000e3fa 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -180,6 +180,16 @@ public final class Test { System.exit( -1 ); } final long start_time = new Date().getTime(); + System.out.print( "MSA entropy: " ); + if ( Test.testMsaEntropy() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.exit( 0 ); System.out.print( "Basic node methods: " ); if ( Test.testBasicNodeMethods() ) { System.out.println( "OK." ); @@ -6088,6 +6098,69 @@ public final class Test { return true; } + private static boolean testMsaEntropy() { + try { + final Sequence s0 = BasicSequence.createAaSequence( "a", "AAAAAAA" ); + final Sequence s1 = BasicSequence.createAaSequence( "b", "AAAIACC" ); + final Sequence s2 = BasicSequence.createAaSequence( "c", "AAIIIIF" ); + final Sequence s3 = BasicSequence.createAaSequence( "d", "AIIIVVW" ); + final List l = new ArrayList(); + l.add( s0 ); + l.add( s1 ); + l.add( s2 ); + l.add( s3 ); + final Msa msa = BasicMsa.createInstance( l ); + System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 20, msa, 0 ) ); + System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 20, msa, 1 ) ); + System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 20, msa, 2 ) ); + System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 20, msa, 3 ) ); + System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 20, msa, 4 ) ); + System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 20, msa, 5 ) ); + System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 20, msa, 6 ) ); + System.out.println(); + System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 6, msa, 0 ) ); + System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 6, msa, 1 ) ); + System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 6, msa, 2 ) ); + System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 6, msa, 3 ) ); + System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 6, msa, 4 ) ); + System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 6, msa, 5 ) ); + System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 6, msa, 6 ) ); + final List l2 = new ArrayList(); + l2.add( BasicSequence.createAaSequence( "1", "AAAAAAA" ) ); + l2.add( BasicSequence.createAaSequence( "2", "AAAIACC" ) ); + l2.add( BasicSequence.createAaSequence( "3", "AAIIIIF" ) ); + l2.add( BasicSequence.createAaSequence( "4", "AIIIVVW" ) ); + l2.add( BasicSequence.createAaSequence( "5", "AAAAAAA" ) ); + l2.add( BasicSequence.createAaSequence( "6", "AAAIACC" ) ); + l2.add( BasicSequence.createAaSequence( "7", "AAIIIIF" ) ); + l2.add( BasicSequence.createAaSequence( "8", "AIIIVVW" ) ); + l2.add( BasicSequence.createAaSequence( "9", "AAAAAAA" ) ); + l2.add( BasicSequence.createAaSequence( "10", "AAAIACC" ) ); + l2.add( BasicSequence.createAaSequence( "11", "AAIIIIF" ) ); + l2.add( BasicSequence.createAaSequence( "12", "AIIIVVW" ) ); + l2.add( BasicSequence.createAaSequence( "13", "AAIIIIF" ) ); + l2.add( BasicSequence.createAaSequence( "14", "AIIIVVW" ) ); + l2.add( BasicSequence.createAaSequence( "15", "AAAAAAA" ) ); + l2.add( BasicSequence.createAaSequence( "16", "AAAIACC" ) ); + l2.add( BasicSequence.createAaSequence( "17", "AAIIIIF" ) ); + l2.add( BasicSequence.createAaSequence( "18", "AIIIVVW" ) ); + l2.add( BasicSequence.createAaSequence( "19", "AAAAAAA" ) ); + l2.add( BasicSequence.createAaSequence( "20", "AAAIACC" ) ); + l2.add( BasicSequence.createAaSequence( "21", "AAIIIIF" ) ); + l2.add( BasicSequence.createAaSequence( "22", "AIIIVVW" ) ); + final Msa msa2 = BasicMsa.createInstance( l2 ); + System.out.println(); + System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 20, msa2, 0 ) ); + System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 20, msa2, 1 ) ); + System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 20, msa2, 2 ) ); + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + private static boolean testDeleteableMsa() { try { final Sequence s0 = BasicSequence.createAaSequence( "a", "AAAA" ); -- 1.7.10.2