From 4d33637f6bc5a76ca65dea7e21c9e2f7267ab2a3 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Thu, 29 Jan 2015 01:25:18 +0000 Subject: [PATCH] compactor work --- .../org/forester/application/msa_compactor.java | 14 ++-- .../org/forester/archaeopteryx/Archaeopteryx.java | 2 +- .../org/forester/archaeopteryx/Configuration.java | 4 + .../archaeopteryx/MainFrameApplication.java | 2 +- forester/java/src/org/forester/msa/MsaMethods.java | 5 +- .../java/src/org/forester/msa_compactor/Chart.java | 88 +++++++++++++++----- .../org/forester/msa_compactor/MsaCompactor.java | 23 ++--- .../org/forester/msa_compactor/MsaProperties.java | 12 +-- 8 files changed, 104 insertions(+), 46 deletions(-) diff --git a/forester/java/src/org/forester/application/msa_compactor.java b/forester/java/src/org/forester/application/msa_compactor.java index daa6166..8184e15 100644 --- a/forester/java/src/org/forester/application/msa_compactor.java +++ b/forester/java/src/org/forester/application/msa_compactor.java @@ -418,17 +418,21 @@ public class msa_compactor { WWW, ForesterUtil.getForesterLibraryInformation() ); System.out.println( "Input MSA : " + in ); + printMsaInfo( msa, initial_msa_stats ); + } + + private static void printMsaInfo( DeleteableMsa msa, final DescriptiveStatistics msa_stats ) { System.out.println( "MSA length : " + msa.getLength() ); System.out.println( "Number of sequences : " + msa.getNumberOfSequences() ); - System.out.println( "Median sequence length : " + NF_1.format( initial_msa_stats.median() ) ); + System.out.println( "Median sequence length : " + NF_1.format( msa_stats.median() ) ); System.out.println( "Mean sequence length : " - + NF_1.format( initial_msa_stats.arithmeticMean() ) ); - System.out.println( "Max sequence length : " + ( ( int ) initial_msa_stats.getMax() ) ); - System.out.println( "Min sequence length : " + ( ( int ) initial_msa_stats.getMin() ) ); + + NF_1.format( msa_stats.arithmeticMean() ) ); + System.out.println( "Max sequence length : " + 
( ( int ) msa_stats.getMax() ) ); + System.out.println( "Min sequence length : " + ( ( int ) msa_stats.getMin() ) ); System.out.println( "Gap ratio : " + NF_4.format( MsaMethods.calcGapRatio( msa ) ) ); System.out.println( "Mean gap count per 100 residues : " - + NF_1.format( MsaMethods.calcNumberOfGapsPer100Stats( msa ).arithmeticMean() ) ); + + NF_1.format( MsaMethods.calcNumberOfGapsStats( msa ).arithmeticMean() ) ); System.out.println( "Normalized Shannon Entropy (entn7) : " + NF_4.format( MsaMethods.calcNormalizedShannonsEntropy( 7, msa ) ) ); System.out.println( "Normalized Shannon Entropy (entn21): " diff --git a/forester/java/src/org/forester/archaeopteryx/Archaeopteryx.java b/forester/java/src/org/forester/archaeopteryx/Archaeopteryx.java index 648fd65..8a3eed6 100644 --- a/forester/java/src/org/forester/archaeopteryx/Archaeopteryx.java +++ b/forester/java/src/org/forester/archaeopteryx/Archaeopteryx.java @@ -59,7 +59,7 @@ public final class Archaeopteryx { phylogenies[ 0 ] = phylogeny; return MainFrameApplication.createInstance( phylogenies, config, title ); } - + public static void main( final String args[] ) { Phylogeny[] phylogenies = null; String config_filename = null; diff --git a/forester/java/src/org/forester/archaeopteryx/Configuration.java b/forester/java/src/org/forester/archaeopteryx/Configuration.java index adb0f40..52c644c 100644 --- a/forester/java/src/org/forester/archaeopteryx/Configuration.java +++ b/forester/java/src/org/forester/archaeopteryx/Configuration.java @@ -998,6 +998,10 @@ public final class Configuration { public void setDynamicallyHideData( final boolean b ) { display_options[ dynamically_hide_data ][ 2 ] = b ? "yes" : "no"; } + + public void setDisplayMultipleSequenceAlignment( final boolean b ) { + display_options[ show_mol_seqs ][ 2 ] = b ? 
"yes" : "no"; + } private void setEditable( final boolean editable ) { _editable = editable; diff --git a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java index bccdd9d..fd49edc 100644 --- a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java +++ b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java @@ -1197,7 +1197,7 @@ public final class MainFrameApplication extends MainFrame { _contentpane.removeAll(); setVisible( false ); dispose(); - System.exit( 0 ); + // System.exit( 0 ); //TODO reconfirm that this is OK, then remove. } void readPhylogeniesFromURL() { diff --git a/forester/java/src/org/forester/msa/MsaMethods.java b/forester/java/src/org/forester/msa/MsaMethods.java index 881e622..2b47020 100644 --- a/forester/java/src/org/forester/msa/MsaMethods.java +++ b/forester/java/src/org/forester/msa/MsaMethods.java @@ -107,12 +107,11 @@ public final class MsaMethods { _ignored_seqs_ids = new ArrayList(); } - public static final DescriptiveStatistics calcNumberOfGapsPer100Stats( final Msa msa ) { + public static final DescriptiveStatistics calcNumberOfGapsStats( final Msa msa ) { final int[] gaps = calcNumberOfGapsInMsa( msa ); final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); - final double n = 100.0 / msa.getLength(); for( final int gap : gaps ) { - stats.addValue( n * gap ); + stats.addValue( gap ); } return stats; } diff --git a/forester/java/src/org/forester/msa_compactor/Chart.java b/forester/java/src/org/forester/msa_compactor/Chart.java index 38da2b4..3ea86f4 100644 --- a/forester/java/src/org/forester/msa_compactor/Chart.java +++ b/forester/java/src/org/forester/msa_compactor/Chart.java @@ -26,6 +26,8 @@ package org.forester.msa_compactor; import java.awt.BorderLayout; import java.awt.event.ActionListener; +import java.text.DecimalFormat; +import java.text.NumberFormat; import java.util.List; import 
javax.swing.JDialog; @@ -45,6 +47,7 @@ import com.approximatrix.charting.swing.ChartPanel; public final class Chart extends JDialog implements ActionListener { + final private static NumberFormat NF_1 = new DecimalFormat( "0.##" ); private static final long serialVersionUID = -5292420246132943515L; private ChartPanel _chart_panel = null; private final int _initial_number_of_seqs; @@ -92,50 +95,93 @@ public final class Chart extends JDialog implements ActionListener { final MultiScatterDataModel model = new MultiScatterDataModel(); final double[][] seqs_length = new double[ _msa_props.size() ][ 2 ]; int max_length = -1; + int min_length = Integer.MAX_VALUE; + double max_gap_ratio = -1; + double min_gap_ratio = Double.MAX_VALUE; + double max_avg_gap_count = -1; + double min_avg_gap_count = Double.MAX_VALUE; for( int i = 0; i < _msa_props.size(); ++i ) { seqs_length[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences(); - seqs_length[ i ][ 1 ] = _msa_props.get( i ).getLength(); - if ( _msa_props.get( i ).getLength() > max_length ) { - max_length = _msa_props.get( i ).getLength(); + // + final int length = _msa_props.get( i ).getLength(); + seqs_length[ i ][ 1 ] = length; + if ( length > max_length ) { + max_length = length; + } + if ( length < min_length ) { + min_length = length; + } + // + final double gap_ratio = _msa_props.get( i ).getGapRatio(); + if ( gap_ratio > max_gap_ratio ) { + max_gap_ratio = gap_ratio; + } + if ( gap_ratio < min_gap_ratio ) { + min_gap_ratio = gap_ratio; + } + // + final double avg_gap_count = _msa_props.get( i ).getAvgNumberOfGaps(); + if ( avg_gap_count > max_avg_gap_count ) { + max_avg_gap_count = avg_gap_count; + } + if ( avg_gap_count < min_avg_gap_count ) { + min_avg_gap_count = avg_gap_count; } } - model.addData( seqs_length, "Length" ); + model.addData( seqs_length, "Length" + " (" + minMaxToString( min_length, max_length ) + ")" ); model.setSeriesLine( "Series " + "Length", true ); 
model.setSeriesMarker( "Series " + "Length", false ); final double[][] seqs_gaps = new double[ _msa_props.size() ][ 2 ]; - double max_gap_ratio = -1; double max_ent7 = -1; double max_ent21 = -1; - for( int i = 0; i < _msa_props.size(); ++i ) { - if ( _msa_props.get( i ).getGapRatio() > max_gap_ratio ) { - max_gap_ratio = _msa_props.get( i ).getGapRatio(); - } - if ( _show_msa_qual ) { - if ( _msa_props.get( i ).getEntropy7() > max_ent7 ) { - max_ent7 = _msa_props.get( i ).getEntropy7(); + double min_ent7 = Double.MAX_VALUE; + double min_ent21 = Double.MAX_VALUE; + if ( _show_msa_qual ) { + for( int i = 0; i < _msa_props.size(); ++i ) { + final double ent7 = _msa_props.get( i ).getEntropy7(); + if ( ent7 > max_ent7 ) { + max_ent7 = ent7; + } + if ( ent7 < min_ent7 ) { + min_ent7 = ent7; } - if ( _msa_props.get( i ).getEntropy21() > max_ent21 ) { - max_ent21 = _msa_props.get( i ).getEntropy21(); + final double ent21 = _msa_props.get( i ).getEntropy21(); + if ( ent21 > max_ent21 ) { + max_ent21 = ent21; + } + if ( ent21 < min_ent21 ) { + min_ent21 = ent21; } } } final double gap_ratio_factor = ( max_length / 2.0 ) / max_gap_ratio; + final double avg_gaps_counts_factor = ( max_length / 2.0 ) / max_avg_gap_count; final double ent7_factor = ( max_length / 2.0 ) / max_ent7; final double ent21_factor = ( max_length / 2.0 ) / max_ent21; for( int i = 0; i < _msa_props.size(); ++i ) { seqs_gaps[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences(); seqs_gaps[ i ][ 1 ] = ForesterUtil.roundToInt( _msa_props.get( i ).getGapRatio() * gap_ratio_factor ); } - model.addData( seqs_gaps, "Gap ratio" ); - model.setSeriesLine( "Series " + "Gap ratio", true ); - model.setSeriesMarker( "Series " + "Gap ratio", false ); + model.addData( seqs_gaps, "Gap Ratio" + " (" + minMaxToString( min_gap_ratio, max_gap_ratio ) + ")" ); + model.setSeriesLine( "Series " + "Gap Ratio", true ); + model.setSeriesMarker( "Series " + "Gap Ratio", false ); + final double[][] 
gap_counts = new double[ _msa_props.size() ][ 2 ]; + for( int i = 0; i < _msa_props.size(); ++i ) { + gap_counts[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences(); + gap_counts[ i ][ 1 ] = ForesterUtil.roundToInt( _msa_props.get( i ).getAvgNumberOfGaps() + * avg_gaps_counts_factor ); + } + model.addData( gap_counts, "Mean Gap Count" + " (" + minMaxToString( min_avg_gap_count, max_avg_gap_count ) + + ")" ); + model.setSeriesLine( "Series " + "Mean Gap Count", true ); + model.setSeriesMarker( "Series " + "Mean Gap Count", false ); if ( _show_msa_qual ) { final double[][] entropy7 = new double[ _msa_props.size() ][ 2 ]; for( int i = 0; i < _msa_props.size(); ++i ) { entropy7[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences(); entropy7[ i ][ 1 ] = ForesterUtil.roundToInt( _msa_props.get( i ).getEntropy7() * ent7_factor ); } - model.addData( entropy7, "Entropy norm 7" ); + model.addData( entropy7, "Entropy norm 7" + " (" + minMaxToString( min_ent7, max_ent7 ) + ")" ); model.setSeriesLine( "Series " + "Entropy norm 7", true ); model.setSeriesMarker( "Series " + "Entropy norm 7", false ); // @@ -144,7 +190,7 @@ public final class Chart extends JDialog implements ActionListener { entropy21[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences(); entropy21[ i ][ 1 ] = ForesterUtil.roundToInt( _msa_props.get( i ).getEntropy21() * ent21_factor ); } - model.addData( entropy21, "Entropy norm 21" ); + model.addData( entropy21, "Entropy norm 21" + " (" + minMaxToString( min_ent21, max_ent21 ) + ")" ); model.setSeriesLine( "Series " + "Entropy norm 21", true ); model.setSeriesMarker( "Series " + "Entropy norm 21", false ); } @@ -162,6 +208,10 @@ public final class Chart extends JDialog implements ActionListener { return _chart_panel; } + private final static String minMaxToString( final double min, final double max ) { + return NF_1.format( min ) + "-" + NF_1.format( max ); + } + public static 
void display( final List msa_props, final int initial_number_of_seqs, final boolean show_msa_qual, diff --git a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java index 687951c..6209da1 100644 --- a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java +++ b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java @@ -245,14 +245,14 @@ public class MsaCompactor { } else { n.getNodeData() - .getNodeVisualData() - .setNodeColor( ForesterUtil.calcColor( msa_prop.getLength(), - min, - max, - mean, - min_color, - max_color, - mean_color ) ); + .getNodeVisualData() + .setNodeColor( ForesterUtil.calcColor( msa_prop.getLength(), + min, + max, + mean, + min_color, + max_color, + mean_color ) ); } } } @@ -272,6 +272,7 @@ public class MsaCompactor { config.setDisplaySequenceNames( false ); config.setDisplaySequenceSymbols( false ); config.setDisplayGeneNames( false ); + config.setDisplayMultipleSequenceAlignment( true ); config.setShowScale( true ); config.setAddTaxonomyImagesCB( false ); config.setBaseFontSize( 9 ); @@ -302,7 +303,7 @@ public class MsaCompactor { } public final List removeViaGapAverage( final double mean_gapiness ) throws IOException, - InterruptedException { + InterruptedException { final GapContribution stats[] = calcGapContribtionsStats( _norm ); final List to_remove_ids = new ArrayList(); final List msa_props = new ArrayList(); @@ -405,7 +406,7 @@ public class MsaCompactor { } public final List removeWorstOffenders( final int to_remove ) throws IOException, - InterruptedException { + InterruptedException { final GapContribution stats[] = calcGapContribtionsStats( _norm ); final List to_remove_ids = new ArrayList(); final List msa_props = new ArrayList(); @@ -637,7 +638,7 @@ public class MsaCompactor { sb.append( "\t" ); sb.append( NF_4.format( msa_properties.getGapRatio() ) ); sb.append( "\t" ); - sb.append( NF_1.format( msa_properties.getAvgNumberOfGapsPer100() ) 
); + sb.append( NF_1.format( msa_properties.getAvgNumberOfGaps() ) ); if ( _calculate_shannon_entropy ) { sb.append( "\t" ); sb.append( NF_4.format( msa_properties.getEntropy7() ) ); diff --git a/forester/java/src/org/forester/msa_compactor/MsaProperties.java b/forester/java/src/org/forester/msa_compactor/MsaProperties.java index 5343aec..1abc638 100644 --- a/forester/java/src/org/forester/msa_compactor/MsaProperties.java +++ b/forester/java/src/org/forester/msa_compactor/MsaProperties.java @@ -34,7 +34,7 @@ public final class MsaProperties { final private double _gap_ratio; final private int _length; final private int _number_of_sequences; - final private double _avg_number_of_gaps_per_100; + final private double _avg_number_of_gaps; final private String _removed_seq; public MsaProperties( final int number_of_sequences, @@ -42,14 +42,14 @@ public final class MsaProperties { final double gap_ratio, final double entropy7, final double entropy21, - final double avg_number_of_gaps_per_100, + final double avg_number_of_gaps, final String removed_seq ) { _number_of_sequences = number_of_sequences; _length = length; _gap_ratio = gap_ratio; _entropy7 = entropy7; _entropy21 = entropy21; - _avg_number_of_gaps_per_100 = avg_number_of_gaps_per_100; + _avg_number_of_gaps = avg_number_of_gaps; _removed_seq = removed_seq; } @@ -58,7 +58,7 @@ public final class MsaProperties { _length = msa.getLength(); _gap_ratio = MsaMethods.calcGapRatio( msa ); _removed_seq = removed_seq; - _avg_number_of_gaps_per_100 = MsaMethods.calcNumberOfGapsPer100Stats( msa ).arithmeticMean(); + _avg_number_of_gaps = MsaMethods.calcNumberOfGapsStats( msa ).arithmeticMean(); if ( calculate_normalized_shannon_entropy ) { _entropy7 = MsaMethods.calcNormalizedShannonsEntropy( 7, msa ); _entropy21 = MsaMethods.calcNormalizedShannonsEntropy( 21, msa ); @@ -81,8 +81,8 @@ public final class MsaProperties { return _gap_ratio; } - public final double getAvgNumberOfGapsPer100() { - return 
_avg_number_of_gaps_per_100; + public final double getAvgNumberOfGaps() { + return _avg_number_of_gaps; } public final int getLength() { -- 1.7.10.2