typed search is being added
[jalview.git] / forester / java / src / org / forester / msa_compactor / MsaCompactor.java
index a1c61c1..494b771 100644 (file)
@@ -64,35 +64,38 @@ import org.forester.phylogeny.data.NodeVisualData;
 import org.forester.phylogeny.data.NodeVisualData.NodeFill;
 import org.forester.phylogeny.data.NodeVisualData.NodeShape;
 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
 import org.forester.tools.ConfidenceAssessor;
 import org.forester.util.BasicDescriptiveStatistics;
+import org.forester.util.DescriptiveStatistics;
 import org.forester.util.ForesterUtil;
 
 public class MsaCompactor {
 
-    final private static NumberFormat NF_3                       = new DecimalFormat( "#.###" );
-    final private static NumberFormat NF_4                       = new DecimalFormat( "#.####" );
-    private boolean                   _calculate_shannon_entropy = false;
+    final private static NumberFormat          NF_1                       = new DecimalFormat( "0.#" ); // at least one integer digit, at most one fraction digit
+    final private static NumberFormat          NF_3                       = new DecimalFormat( "0.###" ); // at least one integer digit, at most three fraction digits
+    final private static NumberFormat          NF_4                       = new DecimalFormat( "0.####" ); // at least one integer digit, at most four fraction digits
+    private boolean                            _calculate_shannon_entropy = false;
     //
-    private String                    _infile_name               = null;
-    private final short               _longest_id_length;
+    private String                             _infile_name               = null;
+    private final short                        _longest_id_length;
     //
-    private String                    _maffts_opts               = "--auto";
-    private DeleteableMsa             _msa                       = null;
-    private boolean                   _norm                      = true;
-    private File                      _out_file_base             = null;
-    private MSA_FORMAT                _output_format             = MSA_FORMAT.FASTA;
-    private String                    _path_to_mafft             = null;
-    private boolean                   _phylogentic_inference     = false;
+    private String                             _maffts_opts               = "--auto";
+    private DeleteableMsa                      _msa                       = null;
+    private boolean                            _norm                      = true;
+    private File                               _out_file_base             = null;
+    private MSA_FORMAT                         _output_format             = MSA_FORMAT.FASTA;
+    private String                             _path_to_mafft             = null;
+    private boolean                            _phylogentic_inference     = false;
     //
-    private boolean                   _realign                   = false;
-    private final SortedSet<String>   _removed_seq_ids;
-    private final ArrayList<Sequence> _removed_seqs;
-    private File                      _removed_seqs_out_base     = null;
-    private int                       _step                      = -1;
-    private int                       _step_for_diagnostics      = -1;
+    private boolean                            _realign                   = false;
+    private final SortedSet<String>            _removed_seq_ids;
+    private final ArrayList<MolecularSequence> _removed_seqs;
+    private File                               _removed_seqs_out_base     = null;
+    private int                                _step                      = -1;
+    private int                                _step_for_diagnostics      = -1;
     static {
+        NF_1.setRoundingMode( RoundingMode.HALF_UP ); // same HALF_UP rounding as NF_4 and NF_3 below
         NF_4.setRoundingMode( RoundingMode.HALF_UP );
         NF_3.setRoundingMode( RoundingMode.HALF_UP );
     }
@@ -101,7 +104,7 @@ public class MsaCompactor {
         _msa = msa;
         _removed_seq_ids = new TreeSet<String>();
         _longest_id_length = _msa.determineMaxIdLength();
-        _removed_seqs = new ArrayList<Sequence>();
+        _removed_seqs = new ArrayList<MolecularSequence>();
     }
 
     public final Phylogeny calcTree() {
@@ -222,14 +225,14 @@ public class MsaCompactor {
                 }
                 else {
                     n.getNodeData()
-                            .getNodeVisualData()
-                            .setNodeColor( ForesterUtil.calcColor( msa_prop.getLength(),
-                                                                   min,
-                                                                   max,
-                                                                   mean,
-                                                                   min_color,
-                                                                   max_color,
-                                                                   mean_color ) );
+                    .getNodeVisualData()
+                    .setNodeColor( ForesterUtil.calcColor( msa_prop.getLength(),
+                                                           min,
+                                                           max,
+                                                           mean,
+                                                           min_color,
+                                                           max_color,
+                                                           mean_color ) );
                 }
             }
         }
@@ -260,17 +263,26 @@ public class MsaCompactor {
         return _msa;
     }
 
-    public final void removeSequencesByMinimalLength( final int min_effective_length ) {
-        printMsaProperties( new MsaProperties( _msa, "", _calculate_shannon_entropy ) );
-        System.out.println();
+    public final void removeSequencesByMinimalLength( final int min_effective_length ) throws IOException { // NOTE(review): IOException presumably originates from writeOutfile() -- confirm
         _msa = DeleteableMsa.createInstance( MsaMethods.removeSequencesByMinimalLength( _msa, min_effective_length ) );
         removeGapColumns();
-        printMsaProperties( new MsaProperties( _msa, "", _calculate_shannon_entropy ) );
+        final String s = writeOutfile(); // the returned string is echoed below on the "Output MSA" line
+        final DescriptiveStatistics msa_stats = MsaMethods.calculateEffectiveLengthStatistics( _msa ); // computed on _msa after filtering and gap-column removal
+        System.out.println( "Output MSA                           : " + s );
+        System.out.println( "  MSA length                         : " + _msa.getLength() );
+        System.out.println( "  Number of sequences                : " + _msa.getNumberOfSequences() );
+        System.out.println( "  Median sequence length             : " + NF_1.format( msa_stats.median() ) );
+        System.out.println( "  Mean sequence length               : " + NF_1.format( msa_stats.arithmeticMean() ) );
+        System.out.println( "  Max sequence length                : " + ( ( int ) msa_stats.getMax() ) ); // int cast drops any fractional part
+        System.out.println( "  Min sequence length                : " + ( ( int ) msa_stats.getMin() ) ); // int cast drops any fractional part
+        System.out.println( "  Gap ratio                          : " + NF_4.format( MsaMethods.calcGapRatio( _msa ) ) );
+        System.out.println( "  Normalized Shannon Entropy (entn21): "
+                + NF_4.format( MsaMethods.calcNormalizedShannonsEntropy( 21, _msa ) ) ); // NOTE(review): 21 presumably = 20 amino acids + gap -- confirm
         System.out.println();
     }
 
     public final List<MsaProperties> removeViaGapAverage( final double mean_gapiness ) throws IOException,
-            InterruptedException {
+    InterruptedException {
         final GapContribution stats[] = calcGapContribtionsStats( _norm );
         final List<String> to_remove_ids = new ArrayList<String>();
         final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
@@ -292,7 +304,7 @@ public class MsaCompactor {
         while ( MsaMethods.calcGapRatio( _msa ) > mean_gapiness ) {
             final String id = to_remove_ids.get( i );
             _removed_seq_ids.add( id );
-            final Sequence deleted = _msa.deleteRow( id, true );
+            final MolecularSequence deleted = _msa.deleteRow( id, true );
             _removed_seqs.add( deleted );
             removeGapColumns();
             if ( isPrintMsaStatsWriteOutfileAndRealign( i ) || ( MsaMethods.calcGapRatio( _msa ) <= mean_gapiness ) ) {
@@ -342,7 +354,7 @@ public class MsaCompactor {
         while ( _msa.getLength() > length ) {
             final String id = to_remove_ids.get( i );
             _removed_seq_ids.add( id );
-            final Sequence deleted = _msa.deleteRow( id, true );
+            final MolecularSequence deleted = _msa.deleteRow( id, true );
             _removed_seqs.add( deleted );
             removeGapColumns();
             if ( isPrintMsaStatsWriteOutfileAndRealign( i ) || ( _msa.getLength() <= length ) ) {
@@ -371,7 +383,7 @@ public class MsaCompactor {
     }
 
     public final List<MsaProperties> removeWorstOffenders( final int to_remove ) throws IOException,
-            InterruptedException {
+    InterruptedException {
         final GapContribution stats[] = calcGapContribtionsStats( _norm );
         final List<String> to_remove_ids = new ArrayList<String>();
         final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
@@ -392,7 +404,7 @@ public class MsaCompactor {
         for( int i = 0; i < to_remove_ids.size(); ++i ) {
             final String id = to_remove_ids.get( i );
             _removed_seq_ids.add( id );
-            final Sequence deleted = _msa.deleteRow( id, true );
+            final MolecularSequence deleted = _msa.deleteRow( id, true );
             _removed_seqs.add( deleted );
             removeGapColumns();
             if ( isPrintMsaStatsWriteOutfileAndRealign( i ) || ( i == ( to_remove_ids.size() - 1 ) ) ) {
@@ -498,7 +510,7 @@ public class MsaCompactor {
         return s;
     }
 
-    final int calcNonGapResidues( final Sequence seq ) {
+    final int calcNonGapResidues( final MolecularSequence seq ) {
         int ng = 0;
         for( int i = 0; i < seq.getLength(); ++i ) {
             if ( !seq.isGapAt( i ) ) {
@@ -544,6 +556,12 @@ public class MsaCompactor {
         return gappiness;
     }
 
+    private final Phylogeny collapse( final Msa msa, final int threshold ) {
+        final BasicSymmetricalDistanceMatrix m = PairwiseDistanceCalculator.calcFractionalDissimilarities( msa );
+        //TODO: not implemented yet -- 'm' and 'threshold' are currently unused and null is always returned
+        return null;
+    }
+
     private final Phylogeny inferNJphylogeny( final PWD_DISTANCE_METHOD pwd_distance_method,
                                               final Msa msa,
                                               final boolean write_matrix,