in progress
[jalview.git] / forester / java / src / org / forester / msa_compactor / MsaCompactor.java
index 8f0ea1b..8a13b5c 100644 (file)
@@ -64,7 +64,7 @@ import org.forester.phylogeny.data.NodeVisualData;
 import org.forester.phylogeny.data.NodeVisualData.NodeFill;
 import org.forester.phylogeny.data.NodeVisualData.NodeShape;
 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
 import org.forester.tools.ConfidenceAssessor;
 import org.forester.util.BasicDescriptiveStatistics;
 import org.forester.util.DescriptiveStatistics;
@@ -72,28 +72,28 @@ import org.forester.util.ForesterUtil;
 
 public class MsaCompactor {
 
-    final private static NumberFormat NF_1                       = new DecimalFormat( "0.#" );
-    final private static NumberFormat NF_3                       = new DecimalFormat( "0.###" );
-    final private static NumberFormat NF_4                       = new DecimalFormat( "0.####" );
-    private boolean                   _calculate_shannon_entropy = false;
+    final private static NumberFormat          NF_1                       = new DecimalFormat( "0.#" );
+    final private static NumberFormat          NF_3                       = new DecimalFormat( "0.###" );
+    final private static NumberFormat          NF_4                       = new DecimalFormat( "0.####" );
+    private boolean                            _calculate_shannon_entropy = false;
     //
-    private String                    _infile_name               = null;
-    private final short               _longest_id_length;
+    private String                             _infile_name               = null;
+    private final short                        _longest_id_length;
     //
-    private String                    _maffts_opts               = "--auto";
-    private DeleteableMsa             _msa                       = null;
-    private boolean                   _norm                      = true;
-    private File                      _out_file_base             = null;
-    private MSA_FORMAT                _output_format             = MSA_FORMAT.FASTA;
-    private String                    _path_to_mafft             = null;
-    private boolean                   _phylogentic_inference     = false;
+    private String                             _maffts_opts               = "--auto";
+    private DeleteableMsa                      _msa                       = null;
+    private boolean                            _normalize_for_effective_seq_length                      = true;
+    private File                               _out_file_base             = null;
+    private MSA_FORMAT                         _output_format             = MSA_FORMAT.FASTA;
+    private String                             _path_to_mafft             = null;
+    private boolean                            _phylogentic_inference     = false;
     //
-    private boolean                   _realign                   = false;
-    private final SortedSet<String>   _removed_seq_ids;
-    private final ArrayList<Sequence> _removed_seqs;
-    private File                      _removed_seqs_out_base     = null;
-    private int                       _step                      = -1;
-    private int                       _step_for_diagnostics      = -1;
+    private boolean                            _realign                   = false;
+    private final SortedSet<String>            _removed_seq_ids;
+    private final ArrayList<MolecularSequence> _removed_seqs;
+    private File                               _removed_seqs_out_base     = null;
+    private int                                _step                      = -1;
+    private int                                _step_for_diagnostics      = -1;
     static {
         NF_1.setRoundingMode( RoundingMode.HALF_UP );
         NF_4.setRoundingMode( RoundingMode.HALF_UP );
@@ -104,7 +104,7 @@ public class MsaCompactor {
         _msa = msa;
         _removed_seq_ids = new TreeSet<String>();
         _longest_id_length = _msa.determineMaxIdLength();
-        _removed_seqs = new ArrayList<Sequence>();
+        _removed_seqs = new ArrayList<MolecularSequence>();
     }
 
     public final Phylogeny calcTree() {
@@ -130,9 +130,9 @@ public class MsaCompactor {
         return phy;
     }
 
-    public final List<MsaProperties> chart( final int step, final boolean realign, final boolean norm )
+    public final List<MsaProperties> chart( final int step, final boolean realign, final boolean normalize_for_effective_seq_length )
             throws IOException, InterruptedException {
-        final GapContribution stats[] = calcGapContribtionsStats( norm );
+        final GapContribution stats[] = calcGapContribtionsStats( normalize_for_effective_seq_length );
         final List<String> to_remove_ids = new ArrayList<String>();
         final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
         for( final GapContribution gap_gontribution : stats ) {
@@ -143,6 +143,7 @@ public class MsaCompactor {
             System.out.println( "calculating phylogentic tree..." );
             System.out.println();
             phy = calcTree();
+            addSeqs2Tree( _msa, phy );
         }
         if ( !_realign ) {
             _step = -1;
@@ -178,6 +179,7 @@ public class MsaCompactor {
             }
             ++i;
         }
+        
         if ( _phylogentic_inference ) {
             decorateTree( phy, msa_props, true );
             displayTree( phy );
@@ -185,7 +187,26 @@ public class MsaCompactor {
         return msa_props;
     }
 
-    public final void decorateTree( final Phylogeny phy, final List<MsaProperties> msa_props, final boolean chart_only ) {
+    /** Attaches every row of msa, marked as aligned, to the node phy.getNode() returns for that row's identifier. */
+    private final static void addSeqs2Tree( final Msa msa, final Phylogeny phy ) {
+        for( int i = 0; i < msa.getNumberOfSequences(); ++i ) {
+            final MolecularSequence seq = msa.getSequence( i );
+            final String seq_name = seq.getIdentifier();
+            final PhylogenyNode n = phy.getNode( seq_name );
+            if ( n.getNodeData().isHasSequence() ) { // the node must not carry a sequence yet
+                throw new IllegalArgumentException( "node \"" + seq_name + "\" already has a sequence" );
+            }
+            final org.forester.phylogeny.data.Sequence node_seq = new org.forester.phylogeny.data.Sequence();
+            node_seq.setMolecularSequence( seq.getMolecularSequenceAsString() );
+            node_seq.setMolecularSequenceAligned( true );
+            node_seq.setName( seq_name );
+            n.getNodeData().addSequence( node_seq );
+        }
+    }
+
+    private final static void decorateTree( final Phylogeny phy,
+                                            final List<MsaProperties> msa_props,
+                                            final boolean chart_only ) {
         final BasicDescriptiveStatistics length_stats = new BasicDescriptiveStatistics();
         for( int i = 0; i < msa_props.size(); ++i ) {
             final MsaProperties msa_prop = msa_props.get( i );
@@ -225,14 +246,14 @@ public class MsaCompactor {
                 }
                 else {
                     n.getNodeData()
-                            .getNodeVisualData()
-                            .setNodeColor( ForesterUtil.calcColor( msa_prop.getLength(),
-                                                                   min,
-                                                                   max,
-                                                                   mean,
-                                                                   min_color,
-                                                                   max_color,
-                                                                   mean_color ) );
+                    .getNodeVisualData()
+                    .setNodeColor( ForesterUtil.calcColor( msa_prop.getLength(),
+                                                           min,
+                                                           max,
+                                                           mean,
+                                                           min_color,
+                                                           max_color,
+                                                           mean_color ) );
                 }
             }
         }
@@ -252,6 +273,7 @@ public class MsaCompactor {
         config.setDisplaySequenceNames( false );
         config.setDisplaySequenceSymbols( false );
         config.setDisplayGeneNames( false );
+        config.setDisplayMultipleSequenceAlignment( true );
         config.setShowScale( true );
         config.setAddTaxonomyImagesCB( false );
         config.setBaseFontSize( 9 );
@@ -282,8 +304,8 @@ public class MsaCompactor {
     }
 
     public final List<MsaProperties> removeViaGapAverage( final double mean_gapiness ) throws IOException,
-            InterruptedException {
-        final GapContribution stats[] = calcGapContribtionsStats( _norm );
+    InterruptedException {
+        final GapContribution stats[] = calcGapContribtionsStats( _normalize_for_effective_seq_length );
         final List<String> to_remove_ids = new ArrayList<String>();
         final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
         for( final GapContribution gap_gontribution : stats ) {
@@ -294,6 +316,7 @@ public class MsaCompactor {
             System.out.println( "calculating phylogentic tree..." );
             System.out.println();
             phy = calcTree();
+            addSeqs2Tree( _msa, phy );
         }
         printTableHeader();
         MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy );
@@ -304,7 +327,7 @@ public class MsaCompactor {
         while ( MsaMethods.calcGapRatio( _msa ) > mean_gapiness ) {
             final String id = to_remove_ids.get( i );
             _removed_seq_ids.add( id );
-            final Sequence deleted = _msa.deleteRow( id, true );
+            final MolecularSequence deleted = _msa.deleteRow( id, true );
             _removed_seqs.add( deleted );
             removeGapColumns();
             if ( isPrintMsaStatsWriteOutfileAndRealign( i ) || ( MsaMethods.calcGapRatio( _msa ) <= mean_gapiness ) ) {
@@ -328,12 +351,19 @@ public class MsaCompactor {
         if ( _phylogentic_inference ) {
             decorateTree( phy, msa_props, false );
             displayTree( phy );
-        }
+            System.out.println( "calculating phylogentic tree..." );
+            System.out.println();
+            final Phylogeny phy2 = calcTree();
+            addSeqs2Tree( _msa, phy2 );
+            displayTree( phy2 );
+        }   
+      
+       
         return msa_props;
     }
 
     public List<MsaProperties> removeViaLength( final int length ) throws IOException, InterruptedException {
-        final GapContribution stats[] = calcGapContribtionsStats( _norm );
+        final GapContribution stats[] = calcGapContribtionsStats( _normalize_for_effective_seq_length );
         final List<String> to_remove_ids = new ArrayList<String>();
         final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
         for( final GapContribution gap_gontribution : stats ) {
@@ -344,6 +374,7 @@ public class MsaCompactor {
             System.out.println( "calculating phylogentic tree..." );
             System.out.println();
             phy = calcTree();
+            addSeqs2Tree( _msa, phy );
         }
         printTableHeader();
         MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy );
@@ -354,7 +385,7 @@ public class MsaCompactor {
         while ( _msa.getLength() > length ) {
             final String id = to_remove_ids.get( i );
             _removed_seq_ids.add( id );
-            final Sequence deleted = _msa.deleteRow( id, true );
+            final MolecularSequence deleted = _msa.deleteRow( id, true );
             _removed_seqs.add( deleted );
             removeGapColumns();
             if ( isPrintMsaStatsWriteOutfileAndRealign( i ) || ( _msa.getLength() <= length ) ) {
@@ -378,13 +409,19 @@ public class MsaCompactor {
         if ( _phylogentic_inference ) {
             decorateTree( phy, msa_props, false );
             displayTree( phy );
-        }
+            System.out.println( "calculating phylogentic tree..." );
+            System.out.println();
+            final Phylogeny phy2 = calcTree();
+            addSeqs2Tree( _msa, phy2 );
+            displayTree( phy2 );
+        }   
+       
         return msa_props;
     }
 
     public final List<MsaProperties> removeWorstOffenders( final int to_remove ) throws IOException,
-            InterruptedException {
-        final GapContribution stats[] = calcGapContribtionsStats( _norm );
+    InterruptedException {
+        final GapContribution stats[] = calcGapContribtionsStats( _normalize_for_effective_seq_length );
         final List<String> to_remove_ids = new ArrayList<String>();
         final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
         for( int j = 0; j < to_remove; ++j ) {
@@ -395,6 +432,7 @@ public class MsaCompactor {
             System.out.println( "calculating phylogentic tree..." );
             System.out.println();
             phy = calcTree();
+            addSeqs2Tree( _msa, phy );
         }
         printTableHeader();
         MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy );
@@ -404,7 +442,7 @@ public class MsaCompactor {
         for( int i = 0; i < to_remove_ids.size(); ++i ) {
             final String id = to_remove_ids.get( i );
             _removed_seq_ids.add( id );
-            final Sequence deleted = _msa.deleteRow( id, true );
+            final MolecularSequence deleted = _msa.deleteRow( id, true );
             _removed_seqs.add( deleted );
             removeGapColumns();
             if ( isPrintMsaStatsWriteOutfileAndRealign( i ) || ( i == ( to_remove_ids.size() - 1 ) ) ) {
@@ -427,7 +465,13 @@ public class MsaCompactor {
         if ( _phylogentic_inference ) {
             decorateTree( phy, msa_props, false );
             displayTree( phy );
-        }
+            System.out.println( "calculating phylogentic tree..." );
+            System.out.println();
+            final Phylogeny phy2 = calcTree();
+            addSeqs2Tree( _msa, phy2 );
+            displayTree( phy2 );
+        }   
+       
         return msa_props;
     }
 
@@ -443,8 +487,8 @@ public class MsaCompactor {
         _maffts_opts = maffts_opts;
     }
 
-    public final void setNorm( final boolean norm ) {
-        _norm = norm;
+    public final void setNorm( final boolean normalize_for_effective_seq_length ) {
+        _normalize_for_effective_seq_length = normalize_for_effective_seq_length;
     }
 
     final public void setOutFileBase( final File out_file_base ) {
@@ -510,7 +554,7 @@ public class MsaCompactor {
         return s;
     }
 
-    final int calcNonGapResidues( final Sequence seq ) {
+    final int calcNonGapResidues( final MolecularSequence seq ) {
         int ng = 0;
         for( int i = 0; i < seq.getLength(); ++i ) {
             if ( !seq.isGapAt( i ) ) {
@@ -540,8 +584,8 @@ public class MsaCompactor {
         return stats;
     }
 
-    final private GapContribution[] calcGapContribtionsStats( final boolean norm ) {
-        final GapContribution stats[] = calcGapContribtions( norm );
+    final private GapContribution[] calcGapContribtionsStats( final boolean normalize_for_effective_seq_length ) {
+        final GapContribution stats[] = calcGapContribtions( normalize_for_effective_seq_length );
         Arrays.sort( stats );
         return stats;
     }
@@ -608,6 +652,8 @@ public class MsaCompactor {
         sb.append( msa_properties.getLength() );
         sb.append( "\t" );
         sb.append( NF_4.format( msa_properties.getGapRatio() ) );
+        sb.append( "\t" );
+        sb.append( NF_1.format( msa_properties.getAvgNumberOfGaps() ) );
         if ( _calculate_shannon_entropy ) {
             sb.append( "\t" );
             sb.append( NF_4.format( msa_properties.getEntropy7() ) );
@@ -675,6 +721,8 @@ public class MsaCompactor {
         System.out.print( "\t" );
         System.out.print( "Length" );
         System.out.print( "\t" );
+        System.out.print( "Gap R" );
+        System.out.print( "\t" );
         System.out.print( "Gaps" );
         System.out.print( "\t" );
         if ( _calculate_shannon_entropy ) {