in progress
author cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 28 Jan 2015 03:15:33 +0000 (03:15 +0000)
committer cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 28 Jan 2015 03:15:33 +0000 (03:15 +0000)
forester/java/src/org/forester/application/msa_compactor.java
forester/java/src/org/forester/msa/MsaMethods.java
forester/java/src/org/forester/msa_compactor/MsaCompactor.java
forester/java/src/org/forester/msa_compactor/MsaProperties.java

index 4b68a89..15c2d4c 100644 (file)
@@ -427,6 +427,10 @@ public class msa_compactor {
         System.out.println( "  Min sequence length                : " + ( ( int ) initial_msa_stats.getMin() ) );
         System.out.println( "  Gap ratio                          : "
                 + NF_4.format( MsaMethods.calcGapRatio( msa ) ) );
+        System.out.println( "  Mean gap count per 100 residues    : "
+                + NF_1.format( MsaMethods.calcNumberOfGapsPer100Stats( msa ).arithmeticMean() ) );
+        System.out.println( "  Normalized Shannon Entropy (entn7) : "
+                + NF_4.format( MsaMethods.calcNormalizedShannonsEntropy( 7, msa ) ) );
         System.out.println( "  Normalized Shannon Entropy (entn21): "
                 + NF_4.format( MsaMethods.calcNormalizedShannonsEntropy( 21, msa ) ) );
     }
index ee5188e..881e622 100644 (file)
@@ -107,6 +107,44 @@ public final class MsaMethods {
         _ignored_seqs_ids = new ArrayList<String>();
     }
 
+    public static final DescriptiveStatistics calcNumberOfGapsPer100Stats( final Msa msa ) { // Collects, for every sequence in msa, its gap count scaled by 100 / msa.getLength() into one statistics object.
+        final int[] gaps = calcNumberOfGapsInMsa( msa ); // one gap count per sequence, in sequence-index order
+        final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
+        final double n = 100.0 / msa.getLength(); // scale factor; NOTE(review): infinite if msa.getLength() is 0 (then n * 0 is NaN) - confirm callers never pass an empty alignment
+        for( final int gap : gaps ) {
+            stats.addValue( n * gap ); // gap count normalized to "per 100" of msa.getLength() (presumably alignment columns - confirm)
+        }
+        return stats; // callers in this commit read arithmeticMean() from the result
+    }
+
+    public static final int[] calcNumberOfGapsInMsa( final Msa msa ) { // Returns an array holding calcNumberOfGaps(...) for each sequence of msa; index i corresponds to msa.getSequence( i ).
+        final int seqs = msa.getNumberOfSequences();
+        final int[]  gaps= new int[ seqs ]; // result array, one slot per sequence
+        for( int i = 0; i < seqs; ++i ) {
+            gaps[ i ] =  calcNumberOfGaps( msa.getSequence( i ) ); // per-sequence count of gap runs
+        }
+        return gaps;
+    }
+    
+    
+
+    public final static int calcNumberOfGaps( final MolecularSequence seq  ) { // Counts gap runs in seq: each maximal stretch of consecutive gap positions counts once, regardless of its length.
+        int gaps = 0; // number of gap runs seen so far
+        boolean was_gap = false; // true while the previous position was a gap, i.e. we are inside a run
+        for( int i = 0; i < seq.getLength(); ++i ) {
+            if ( seq.isGapAt( i ) ) {
+               if ( !was_gap ) { // first gap position of a new run
+                   ++gaps;
+                   was_gap = true;
+               }
+            }
+            else {
+                was_gap = false; // non-gap position ends the current run
+            }
+        }
+        return gaps; // 0 for an empty sequence or one without gap positions
+    }
+
     public static DescriptiveStatistics calcBasicGapinessStatistics( final Msa msa ) {
         final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
         for( int i = 0; i < msa.getLength(); ++i ) {
index da74dec..687951c 100644 (file)
@@ -230,34 +230,32 @@ public class MsaCompactor {
                 it.next().getNodeData().setNodeVisualData( vis );
             }
         }
-      
-                for( int i = 0; i < msa_props.size(); ++i ) {
-                    final MsaProperties msa_prop = msa_props.get( i );
-                    final String id = msa_prop.getRemovedSeq();
-                    if ( !ForesterUtil.isEmpty( id ) ) {
-                        final PhylogenyNode n = phy.getNode( id );
-                        n.setName( n.getName() + " [" + i + "]" );
-                        if ( !chart_only ) {
-                            final NodeVisualData vis = new NodeVisualData();
-                            vis.setFillType( NodeFill.SOLID );
-                            vis.setShape( NodeShape.RECTANGLE );
-                            vis.setNodeColor( ForesterUtil.calcColor( msa_prop.getLength(), min, max, mean_color, max_color ) );
-                            n.getNodeData().setNodeVisualData( vis );
-                        }
-                        else {
-                            n.getNodeData()
-                                    .getNodeVisualData()
-                                    .setNodeColor( ForesterUtil.calcColor( msa_prop.getLength(),
-                                                                           min,
-                                                                           max,
-                                                                           mean,
-                                                                           min_color,
-                                                                           max_color,
-                                                                           mean_color ) );
-                        }
-                    }
-        
+        for( int i = 0; i < msa_props.size(); ++i ) {
+            final MsaProperties msa_prop = msa_props.get( i );
+            final String id = msa_prop.getRemovedSeq();
+            if ( !ForesterUtil.isEmpty( id ) ) {
+                final PhylogenyNode n = phy.getNode( id );
+                n.setName( n.getName() + " [" + i + "]" );
+                if ( !chart_only ) {
+                    final NodeVisualData vis = new NodeVisualData();
+                    vis.setFillType( NodeFill.SOLID );
+                    vis.setShape( NodeShape.RECTANGLE );
+                    vis.setNodeColor( ForesterUtil.calcColor( msa_prop.getLength(), min, max, mean_color, max_color ) );
+                    n.getNodeData().setNodeVisualData( vis );
+                }
+                else {
+                    n.getNodeData()
+                            .getNodeVisualData()
+                            .setNodeColor( ForesterUtil.calcColor( msa_prop.getLength(),
+                                                                   min,
+                                                                   max,
+                                                                   mean,
+                                                                   min_color,
+                                                                   max_color,
+                                                                   mean_color ) );
                 }
+            }
+        }
     }
 
     final public void deleteGapColumns( final double max_allowed_gap_ratio ) {
@@ -304,7 +302,7 @@ public class MsaCompactor {
     }
 
     public final List<MsaProperties> removeViaGapAverage( final double mean_gapiness ) throws IOException,
-    InterruptedException {
+            InterruptedException {
         final GapContribution stats[] = calcGapContribtionsStats( _norm );
         final List<String> to_remove_ids = new ArrayList<String>();
         final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
@@ -349,7 +347,7 @@ public class MsaCompactor {
             System.out.println( msg );
         }
         if ( _phylogentic_inference ) {
-            decorateTree( phy,  msa_props, false );
+            decorateTree( phy, msa_props, false );
             displayTree( phy );
         }
         return msa_props;
@@ -400,14 +398,14 @@ public class MsaCompactor {
             System.out.println( msg );
         }
         if ( _phylogentic_inference ) {
-            decorateTree( phy,  msa_props, false );
+            decorateTree( phy, msa_props, false );
             displayTree( phy );
         }
         return msa_props;
     }
 
     public final List<MsaProperties> removeWorstOffenders( final int to_remove ) throws IOException,
-    InterruptedException {
+            InterruptedException {
         final GapContribution stats[] = calcGapContribtionsStats( _norm );
         final List<String> to_remove_ids = new ArrayList<String>();
         final List<MsaProperties> msa_props = new ArrayList<MsaProperties>();
@@ -450,16 +448,13 @@ public class MsaCompactor {
             System.out.println( msg );
         }
         if ( _phylogentic_inference ) {
-            decorateTree( phy,  msa_props, false );
+            decorateTree( phy, msa_props, false );
             displayTree( phy );
-            
-           
-                System.out.println( "calculating phylogentic tree..." );
-                System.out.println();
-                Phylogeny phy2 = calcTree();
-                addSeqs2Tree( _msa, phy2 );
-                displayTree( phy2 );
-            
+            System.out.println( "calculating phylogentic tree..." );
+            System.out.println();
+            final Phylogeny phy2 = calcTree();
+            addSeqs2Tree( _msa, phy2 );
+            displayTree( phy2 );
         }
         return msa_props;
     }
@@ -641,6 +636,8 @@ public class MsaCompactor {
         sb.append( msa_properties.getLength() );
         sb.append( "\t" );
         sb.append( NF_4.format( msa_properties.getGapRatio() ) );
+        sb.append( "\t" );
+        sb.append( NF_1.format( msa_properties.getAvgNumberOfGapsPer100() ) );
         if ( _calculate_shannon_entropy ) {
             sb.append( "\t" );
             sb.append( NF_4.format( msa_properties.getEntropy7() ) );
@@ -708,6 +705,8 @@ public class MsaCompactor {
         System.out.print( "\t" );
         System.out.print( "Length" );
         System.out.print( "\t" );
+        System.out.print( "Gap R" );
+        System.out.print( "\t" );
         System.out.print( "Gaps" );
         System.out.print( "\t" );
         if ( _calculate_shannon_entropy ) {
index ae575b2..5343aec 100644 (file)
@@ -34,6 +34,7 @@ public final class MsaProperties {
     final private double _gap_ratio;
     final private int    _length;
     final private int    _number_of_sequences;
+    final private double _avg_number_of_gaps_per_100;
     final private String _removed_seq;
 
     public MsaProperties( final int number_of_sequences,
@@ -41,12 +42,14 @@ public final class MsaProperties {
                           final double gap_ratio,
                           final double entropy7,
                           final double entropy21,
+                          final double avg_number_of_gaps_per_100,
                           final String removed_seq ) {
         _number_of_sequences = number_of_sequences;
         _length = length;
         _gap_ratio = gap_ratio;
         _entropy7 = entropy7;
         _entropy21 = entropy21;
+        _avg_number_of_gaps_per_100 = avg_number_of_gaps_per_100;
         _removed_seq = removed_seq;
     }
 
@@ -55,6 +58,7 @@ public final class MsaProperties {
         _length = msa.getLength();
         _gap_ratio = MsaMethods.calcGapRatio( msa );
         _removed_seq = removed_seq;
+        _avg_number_of_gaps_per_100 = MsaMethods.calcNumberOfGapsPer100Stats( msa ).arithmeticMean();
         if ( calculate_normalized_shannon_entropy ) {
             _entropy7 = MsaMethods.calcNormalizedShannonsEntropy( 7, msa );
             _entropy21 = MsaMethods.calcNormalizedShannonsEntropy( 21, msa );
@@ -77,6 +81,10 @@ public final class MsaProperties {
         return _gap_ratio;
     }
 
+    public final double getAvgNumberOfGapsPer100() { // Returns the final field _avg_number_of_gaps_per_100, which is assigned in the constructors.
+        return _avg_number_of_gaps_per_100;
+    }
+    
     public final int getLength() {
         return _length;
     }