compactor work
author cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Thu, 29 Jan 2015 01:25:18 +0000 (01:25 +0000)
committer cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Thu, 29 Jan 2015 01:25:18 +0000 (01:25 +0000)
forester/java/src/org/forester/application/msa_compactor.java
forester/java/src/org/forester/archaeopteryx/Archaeopteryx.java
forester/java/src/org/forester/archaeopteryx/Configuration.java
forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java
forester/java/src/org/forester/msa/MsaMethods.java
forester/java/src/org/forester/msa_compactor/Chart.java
forester/java/src/org/forester/msa_compactor/MsaCompactor.java
forester/java/src/org/forester/msa_compactor/MsaProperties.java

index daa6166..8184e15 100644 (file)
@@ -418,17 +418,21 @@ public class msa_compactor {
                                               WWW,
                                               ForesterUtil.getForesterLibraryInformation() );
         System.out.println( "Input MSA                            : " + in );
+        printMsaInfo( msa, initial_msa_stats );
+    }
+
+    private static void printMsaInfo( DeleteableMsa msa, final DescriptiveStatistics msa_stats ) {
         System.out.println( "MSA length                         : " + msa.getLength() );
         System.out.println( "Number of sequences                : " + msa.getNumberOfSequences() );
-        System.out.println( "Median sequence length             : " + NF_1.format( initial_msa_stats.median() ) );
+        System.out.println( "Median sequence length             : " + NF_1.format( msa_stats.median() ) );
         System.out.println( "Mean sequence length               : "
-                + NF_1.format( initial_msa_stats.arithmeticMean() ) );
-        System.out.println( "Max sequence length                : " + ( ( int ) initial_msa_stats.getMax() ) );
-        System.out.println( "Min sequence length                : " + ( ( int ) initial_msa_stats.getMin() ) );
+                + NF_1.format( msa_stats.arithmeticMean() ) );
+        System.out.println( "Max sequence length                : " + ( ( int ) msa_stats.getMax() ) );
+        System.out.println( "Min sequence length                : " + ( ( int ) msa_stats.getMin() ) );
         System.out.println( "Gap ratio                          : "
                 + NF_4.format( MsaMethods.calcGapRatio( msa ) ) );
         System.out.println( "Mean gap count per 100 residues    : "
-                + NF_1.format( MsaMethods.calcNumberOfGapsPer100Stats( msa ).arithmeticMean() ) );
+                + NF_1.format( MsaMethods.calcNumberOfGapsStats( msa ).arithmeticMean() ) );
         System.out.println( "Normalized Shannon Entropy (entn7) : "
                 + NF_4.format( MsaMethods.calcNormalizedShannonsEntropy( 7, msa ) ) );
         System.out.println( "Normalized Shannon Entropy (entn21): "
index 648fd65..8a3eed6 100644 (file)
@@ -59,7 +59,7 @@ public final class Archaeopteryx {
         phylogenies[ 0 ] = phylogeny;
         return MainFrameApplication.createInstance( phylogenies, config, title );
     }
-
+    
     public static void main( final String args[] ) {
         Phylogeny[] phylogenies = null;
         String config_filename = null;
index adb0f40..52c644c 100644 (file)
@@ -998,6 +998,10 @@ public final class Configuration {
     public void setDynamicallyHideData( final boolean b ) {
         display_options[ dynamically_hide_data ][ 2 ] = b ? "yes" : "no";
     }
+    
+    public void setDisplayMultipleSequenceAlignment( final boolean b ) { // sets the show_mol_seqs display option
+        display_options[ show_mol_seqs ][ 2 ] = b ? "yes" : "no"; // slot 2 stores the flag as the text "yes" or "no"
+    }
 
     private void setEditable( final boolean editable ) {
         _editable = editable;
index bccdd9d..fd49edc 100644 (file)
@@ -1197,7 +1197,7 @@ public final class MainFrameApplication extends MainFrame {
         _contentpane.removeAll();\r
         setVisible( false );\r
         dispose();\r
-        System.exit( 0 );\r
+       // System.exit( 0 ); //TODO reconfirm that this is OK, then remove.\r
     }\r
 \r
     void readPhylogeniesFromURL() {\r
index 881e622..2b47020 100644 (file)
@@ -107,12 +107,11 @@ public final class MsaMethods {
         _ignored_seqs_ids = new ArrayList<String>();
     }
 
-    public static final DescriptiveStatistics calcNumberOfGapsPer100Stats( final Msa msa ) {
+    public static final DescriptiveStatistics calcNumberOfGapsStats( final Msa msa ) {
         final int[] gaps = calcNumberOfGapsInMsa( msa );
         final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
-        final double n = 100.0 / msa.getLength();
         for( final int gap : gaps ) {
-            stats.addValue( n * gap );
+            stats.addValue(  gap );
         }
         return stats;
     }
index 38da2b4..3ea86f4 100644 (file)
@@ -26,6 +26,8 @@ package org.forester.msa_compactor;
 
 import java.awt.BorderLayout;
 import java.awt.event.ActionListener;
+import java.text.DecimalFormat;
+import java.text.NumberFormat;
 import java.util.List;
 
 import javax.swing.JDialog;
@@ -45,6 +47,7 @@ import com.approximatrix.charting.swing.ChartPanel;
 
 public final class Chart extends JDialog implements ActionListener {
 
+    final private static NumberFormat NF_1             = new DecimalFormat( "0.##" );
     private static final long         serialVersionUID = -5292420246132943515L;
     private ChartPanel                _chart_panel     = null;
     private final int                 _initial_number_of_seqs;
@@ -92,50 +95,93 @@ public final class Chart extends JDialog implements ActionListener {
             final MultiScatterDataModel model = new MultiScatterDataModel();
             final double[][] seqs_length = new double[ _msa_props.size() ][ 2 ];
             int max_length = -1;
+            int min_length = Integer.MAX_VALUE;
+            double max_gap_ratio = -1;
+            double min_gap_ratio = Double.MAX_VALUE;
+            double max_avg_gap_count = -1;
+            double min_avg_gap_count = Double.MAX_VALUE;
             for( int i = 0; i < _msa_props.size(); ++i ) {
                 seqs_length[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences();
-                seqs_length[ i ][ 1 ] = _msa_props.get( i ).getLength();
-                if ( _msa_props.get( i ).getLength() > max_length ) {
-                    max_length = _msa_props.get( i ).getLength();
+                //
+                final int length = _msa_props.get( i ).getLength();
+                seqs_length[ i ][ 1 ] = length;
+                if ( length > max_length ) {
+                    max_length = length;
+                }
+                if ( length < min_length ) {
+                    min_length = length;
+                }
+                //
+                final double gap_ratio = _msa_props.get( i ).getGapRatio();
+                if ( gap_ratio > max_gap_ratio ) {
+                    max_gap_ratio = gap_ratio;
+                }
+                if ( gap_ratio < min_gap_ratio ) {
+                    min_gap_ratio = gap_ratio;
+                }
+                //
+                final double avg_gap_count = _msa_props.get( i ).getAvgNumberOfGaps();
+                if ( avg_gap_count > max_avg_gap_count ) {
+                    max_avg_gap_count = avg_gap_count;
+                }
+                if ( avg_gap_count < min_avg_gap_count ) {
+                    min_avg_gap_count = avg_gap_count;
                 }
             }
-            model.addData( seqs_length, "Length" );
+            model.addData( seqs_length, "Length" + " (" + minMaxToString( min_length, max_length ) + ")" );
             model.setSeriesLine( "Series " + "Length", true );
             model.setSeriesMarker( "Series " + "Length", false );
             final double[][] seqs_gaps = new double[ _msa_props.size() ][ 2 ];
-            double max_gap_ratio = -1;
             double max_ent7 = -1;
             double max_ent21 = -1;
-            for( int i = 0; i < _msa_props.size(); ++i ) {
-                if ( _msa_props.get( i ).getGapRatio() > max_gap_ratio ) {
-                    max_gap_ratio = _msa_props.get( i ).getGapRatio();
-                }
-                if ( _show_msa_qual ) {
-                    if ( _msa_props.get( i ).getEntropy7() > max_ent7 ) {
-                        max_ent7 = _msa_props.get( i ).getEntropy7();
+            double min_ent7 = Double.MAX_VALUE;
+            double min_ent21 = Double.MAX_VALUE;
+            if ( _show_msa_qual ) { // entropy extrema are only collected when the MSA quality series are shown
+                for( int i = 0; i < _msa_props.size(); ++i ) {
+                    final double ent7 = _msa_props.get( i ).getEntropy7();
+                    if ( ent7 > max_ent7 ) {
+                        max_ent7 = ent7;
+                    }
+                    if ( ent7 < min_ent7 ) { // compare against the running minimum, not the maximum
+                        min_ent7 = ent7;
                     }
-                    if ( _msa_props.get( i ).getEntropy21() > max_ent21 ) {
-                        max_ent21 = _msa_props.get( i ).getEntropy21();
+                    final double ent21 = _msa_props.get( i ).getEntropy21();
+                    if ( ent21 > max_ent21 ) { // compare against the running maximum; max_ent21 later scales the series
+                        max_ent21 = ent21;
+                    }
+                    if ( ent21 < min_ent21 ) {
+                        min_ent21 = ent21;
                     }
                 }
             }
             final double gap_ratio_factor = ( max_length / 2.0 ) / max_gap_ratio;
+            final double avg_gaps_counts_factor = ( max_length / 2.0 ) / max_avg_gap_count;
             final double ent7_factor = ( max_length / 2.0 ) / max_ent7;
             final double ent21_factor = ( max_length / 2.0 ) / max_ent21;
             for( int i = 0; i < _msa_props.size(); ++i ) {
                 seqs_gaps[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences();
                 seqs_gaps[ i ][ 1 ] = ForesterUtil.roundToInt( _msa_props.get( i ).getGapRatio() * gap_ratio_factor );
             }
-            model.addData( seqs_gaps, "Gap ratio" );
-            model.setSeriesLine( "Series " + "Gap ratio", true );
-            model.setSeriesMarker( "Series " + "Gap ratio", false );
+            model.addData( seqs_gaps, "Gap Ratio" + " (" + minMaxToString( min_gap_ratio, max_gap_ratio ) + ")" );
+            model.setSeriesLine( "Series " + "Gap Ratio", true );
+            model.setSeriesMarker( "Series " + "Gap Ratio", false );
+            final double[][] gap_counts = new double[ _msa_props.size() ][ 2 ];
+            for( int i = 0; i < _msa_props.size(); ++i ) {
+                gap_counts[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences();
+                gap_counts[ i ][ 1 ] = ForesterUtil.roundToInt( _msa_props.get( i ).getAvgNumberOfGaps()
+                                                                * avg_gaps_counts_factor );
+            }
+            model.addData( gap_counts, "Mean Gap Count" + " (" + minMaxToString( min_avg_gap_count, max_avg_gap_count )
+                    + ")" );
+            model.setSeriesLine( "Series " + "Mean Gap Count", true );
+            model.setSeriesMarker( "Series " + "Mean Gap Count", false );
             if ( _show_msa_qual ) {
                 final double[][] entropy7 = new double[ _msa_props.size() ][ 2 ];
                 for( int i = 0; i < _msa_props.size(); ++i ) {
                     entropy7[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences();
                     entropy7[ i ][ 1 ] = ForesterUtil.roundToInt( _msa_props.get( i ).getEntropy7() * ent7_factor );
                 }
-                model.addData( entropy7, "Entropy norm 7" );
+                model.addData( entropy7, "Entropy norm 7" + " (" + minMaxToString( min_ent7, max_ent7 ) + ")" );
                 model.setSeriesLine( "Series " + "Entropy norm 7", true );
                 model.setSeriesMarker( "Series " + "Entropy norm 7", false );
                 //
@@ -144,7 +190,7 @@ public final class Chart extends JDialog implements ActionListener {
                     entropy21[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences();
                     entropy21[ i ][ 1 ] = ForesterUtil.roundToInt( _msa_props.get( i ).getEntropy21() * ent21_factor );
                 }
-                model.addData( entropy21, "Entropy norm 21" );
+                model.addData( entropy21, "Entropy norm 21" + " (" + minMaxToString( min_ent21, max_ent21 ) + ")" );
                 model.setSeriesLine( "Series " + "Entropy norm 21", true );
                 model.setSeriesMarker( "Series " + "Entropy norm 21", false );
             }
@@ -162,6 +208,10 @@ public final class Chart extends JDialog implements ActionListener {
         return _chart_panel;
     }
 
+    private final static String minMaxToString( final double min, final double max ) { // renders a range as "<min>-<max>"
+        return NF_1.format( min ) + "-" + NF_1.format( max ); // both ends are formatted with NF_1
+    }
+
     public static void display( final List<MsaProperties> msa_props,
                                 final int initial_number_of_seqs,
                                 final boolean show_msa_qual,
index 687951c..6209da1 100644 (file)
@@ -245,14 +245,14 @@ public class MsaCompactor {
                 }
                 else {
                     n.getNodeData()
-                            .getNodeVisualData()
-                            .setNodeColor( ForesterUtil.calcColor( msa_prop.getLength(),
-                                                                   min,
-                                                                   max,
-                                                                   mean,
-                                                                   min_color,
-                                                                   max_color,
-                                                                   mean_color ) );
+                    .getNodeVisualData()
+                    .setNodeColor( ForesterUtil.calcColor( msa_prop.getLength(),
+                                                           min,
+                                                           max,
+                                                           mean,
+                                                           min_color,
+                                                           max_color,
+                                                           mean_color ) );
                 }
             }
         }
@@ -272,6 +272,7 @@ public class MsaCompactor {
         config.setDisplaySequenceNames( false );
         config.setDisplaySequenceSymbols( false );
         config.setDisplayGeneNames( false );
+        config.setDisplayMultipleSequenceAlignment( true );
         config.setShowScale( true );
         config.setAddTaxonomyImagesCB( false );
         config.setBaseFontSize( 9 );
@@ -302,7 +303,7 @@ public class MsaCompactor {
     }
 
     public final List<MsaProperties> removeViaGapAverage( final double mean_gapiness ) throws IOException,
-            InterruptedException {
+    InterruptedException {
         final GapContribution stats[] = calcGapContribtionsStats( _norm );
         final List<String> to_remove_ids = new ArrayList<String>();
         final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
@@ -405,7 +406,7 @@ public class MsaCompactor {
     }
 
     public final List<MsaProperties> removeWorstOffenders( final int to_remove ) throws IOException,
-            InterruptedException {
+    InterruptedException {
         final GapContribution stats[] = calcGapContribtionsStats( _norm );
         final List<String> to_remove_ids = new ArrayList<String>();
         final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
@@ -637,7 +638,7 @@ public class MsaCompactor {
         sb.append( "\t" );
         sb.append( NF_4.format( msa_properties.getGapRatio() ) );
         sb.append( "\t" );
-        sb.append( NF_1.format( msa_properties.getAvgNumberOfGapsPer100() ) );
+        sb.append( NF_1.format( msa_properties.getAvgNumberOfGaps() ) );
         if ( _calculate_shannon_entropy ) {
             sb.append( "\t" );
             sb.append( NF_4.format( msa_properties.getEntropy7() ) );
index 5343aec..1abc638 100644 (file)
@@ -34,7 +34,7 @@ public final class MsaProperties {
     final private double _gap_ratio;
     final private int    _length;
     final private int    _number_of_sequences;
-    final private double _avg_number_of_gaps_per_100;
+    final private double _avg_number_of_gaps;
     final private String _removed_seq;
 
     public MsaProperties( final int number_of_sequences,
@@ -42,14 +42,14 @@ public final class MsaProperties {
                           final double gap_ratio,
                           final double entropy7,
                           final double entropy21,
-                          final double avg_number_of_gaps_per_100,
+                          final double avg_number_of_gaps,
                           final String removed_seq ) {
         _number_of_sequences = number_of_sequences;
         _length = length;
         _gap_ratio = gap_ratio;
         _entropy7 = entropy7;
         _entropy21 = entropy21;
-        _avg_number_of_gaps_per_100 = avg_number_of_gaps_per_100;
+        _avg_number_of_gaps = avg_number_of_gaps;
         _removed_seq = removed_seq;
     }
 
@@ -58,7 +58,7 @@ public final class MsaProperties {
         _length = msa.getLength();
         _gap_ratio = MsaMethods.calcGapRatio( msa );
         _removed_seq = removed_seq;
-        _avg_number_of_gaps_per_100 = MsaMethods.calcNumberOfGapsPer100Stats( msa ).arithmeticMean();
+        _avg_number_of_gaps = MsaMethods.calcNumberOfGapsStats( msa ).arithmeticMean();
         if ( calculate_normalized_shannon_entropy ) {
             _entropy7 = MsaMethods.calcNormalizedShannonsEntropy( 7, msa );
             _entropy21 = MsaMethods.calcNormalizedShannonsEntropy( 21, msa );
@@ -81,8 +81,8 @@ public final class MsaProperties {
         return _gap_ratio;
     }
 
-    public final double getAvgNumberOfGapsPer100() {
-        return _avg_number_of_gaps_per_100;
+    public final double getAvgNumberOfGaps() {
+        return _avg_number_of_gaps;
     }
     
     public final int getLength() {