inprogress
authorcmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Thu, 27 Feb 2014 02:27:58 +0000 (02:27 +0000)
committercmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Thu, 27 Feb 2014 02:27:58 +0000 (02:27 +0000)
forester/java/src/org/forester/application/msa_compactor.java
forester/java/src/org/forester/io/parsers/nhx/NHXParser.java
forester/java/src/org/forester/msa/BasicMsa.java
forester/java/src/org/forester/msa/Msa.java
forester/java/src/org/forester/msa/MsaMethods.java
forester/java/src/org/forester/msa_compactor/GapContribution.java [new file with mode: 0644]
forester/java/src/org/forester/msa_compactor/MsaCompactor.java [moved from forester/java/src/org/forester/msa/MsaCompactor.java with 73% similarity]
forester/java/src/org/forester/sequence/BasicSequence.java
forester/java/src/org/forester/sequence/Sequence.java
forester/java/src/org/forester/test/Test.java

index 9ed5e20..c385ca9 100644 (file)
@@ -10,26 +10,27 @@ import org.forester.io.parsers.FastaParser;
 import org.forester.io.parsers.GeneralMsaParser;
 import org.forester.msa.Msa;
 import org.forester.msa.Msa.MSA_FORMAT;
-import org.forester.msa.MsaCompactor;
+import org.forester.msa_compactor.MsaCompactor;
 import org.forester.msa.MsaMethods;
 import org.forester.util.CommandLineArguments;
 import org.forester.util.ForesterUtil;
 
 public class msa_compactor {
 
-    final static private String HELP_OPTION_1                 = "help";
-    final static private String HELP_OPTION_2                 = "h";
-    final static private String REMOVE_WORST_OFFENDERS_OPTION = "w";
-    final static private String AV_GAPINESS_OPTION            = "a";
-    final static private String STEP_OPTION                   = "s";
-    final static private String LENGTH_OPTION                 = "l";
-    final static private String REALIGN_OPTION                = "r";
-    final static private String PRG_NAME                      = "msa_compactor";
-    final static private String PRG_DESC                      = "multiple sequnce aligment compactor";
-    final static private String PRG_VERSION                   = "0.01";
-    final static private String PRG_DATE                      = "140221";
-    final static private String E_MAIL                        = "phylosoft@gmail.com";
-    final static private String WWW                           = "https://sites.google.com/site/cmzmasek/home/software/forester";
+    final static private String HELP_OPTION_1                          = "help";
+    final static private String HELP_OPTION_2                          = "h";
+    final static private String REMOVE_WORST_OFFENDERS_OPTION          = "w";
+    final static private String AV_GAPINESS_OPTION                     = "a";
+    final static private String STEP_OPTION                            = "s";
+    final static private String LENGTH_OPTION                          = "l";
+    final static private String REALIGN_OPTION                         = "r";
+    final static private String DO_NOT_NORMALIZE_FOR_EFF_LENGTH_OPTION = "nn";
+    final static private String PRG_NAME                               = "msa_compactor";
+    final static private String PRG_DESC                               = "multiple sequnce aligment compactor";
+    final static private String PRG_VERSION                            = "0.01";
+    final static private String PRG_DATE                               = "140221";
+    final static private String E_MAIL                                 = "phylosoft@gmail.com";
+    final static private String WWW                                    = "https://sites.google.com/site/cmzmasek/home/software/forester";
 
     public static void main( final String args[] ) {
         try {
@@ -45,11 +46,13 @@ public class msa_compactor {
             int length = -1;
             int step = 1;
             boolean realign = false;
+            boolean norm = true;
             final List<String> allowed_options = new ArrayList<String>();
             allowed_options.add( REMOVE_WORST_OFFENDERS_OPTION );
             allowed_options.add( AV_GAPINESS_OPTION );
             allowed_options.add( LENGTH_OPTION );
             allowed_options.add( REALIGN_OPTION );
+            allowed_options.add( DO_NOT_NORMALIZE_FOR_EFF_LENGTH_OPTION );
             allowed_options.add( STEP_OPTION );
             final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
             if ( dissallowed_options.length() > 0 ) {
@@ -70,6 +73,9 @@ public class msa_compactor {
             if ( cla.isOptionSet( REALIGN_OPTION ) ) {
                 realign = true;
             }
+            if ( cla.isOptionSet( DO_NOT_NORMALIZE_FOR_EFF_LENGTH_OPTION ) ) {
+                norm = false;
+            }
             //            else if ( cla.isOptionSet( STEP_OPTION ) && cla.isOptionSet( WINDOW_OPTION ) ) {
             //                step = cla.getOptionValueAsInt( STEP_OPTION );
             //                window = cla.getOptionValueAsInt( WINDOW_OPTION );
@@ -88,7 +94,7 @@ public class msa_compactor {
             }
             MsaCompactor mc = null;
             if ( worst_remove > 0 ) {
-                mc = MsaCompactor.removeWorstOffenders( msa, worst_remove, realign );
+                mc = MsaCompactor.removeWorstOffenders( msa, worst_remove, realign, norm );
             }
             else if ( av > 0 ) {
                 mc = MsaCompactor.reduceGapAverage( msa, av, step, realign, out, 50 );
index 72c95d3..55f0285 100644 (file)
@@ -678,7 +678,7 @@ public final class NHXParser implements PhylogenyParser, IteratingPhylogenyParse
                         node_to_annotate.getNodeData().getSequence().setName( s.substring( 3 ) );
                     }
                     else if ( s.indexOf( '=' ) < 0 ) {
-                        if ( node_to_annotate.getDistanceToParent() != PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT
+                        if ( ( node_to_annotate.getDistanceToParent() != PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT )
                                 && !allow_errors_in_distance_to_parent ) {
                             throw new NHXFormatException( "error in NHX formatted data: more than one distance to parent:"
                                     + "\"" + s + "\"" );
index 5fc2590..daeef6b 100644 (file)
@@ -218,4 +218,9 @@ public class BasicMsa implements Msa {
         }
         return column;
     }
+
+    @Override
+    public boolean isGapAt( final int row, final int col ) {
+        return getResidueAt( row, col ) == Sequence.GAP;
+    }
 }
index 1eb9533..648006a 100644 (file)
@@ -48,6 +48,8 @@ public interface Msa {
 
     public char getResidueAt( int row, int col );
 
+    public boolean isGapAt( int row, int col );
+
     public List<Character> getColumnAt( int col );
 
     public Sequence getSequence( final String id );
index ff8342c..b05e837 100644 (file)
@@ -65,7 +65,7 @@ public final class MsaMethods {
     public static int calcGapSumPerColumn( final Msa msa, final int col ) {
         int gap_rows = 0;
         for( int j = 0; j < msa.getNumberOfSequences(); ++j ) {
-            if ( msa.getResidueAt( j, col ) == Sequence.GAP ) {
+            if ( msa.isGapAt( j, col ) ) {
                 gap_rows++;
             }
         }
diff --git a/forester/java/src/org/forester/msa_compactor/GapContribution.java b/forester/java/src/org/forester/msa_compactor/GapContribution.java
new file mode 100644 (file)
index 0000000..b7b3403
--- /dev/null
@@ -0,0 +1,51 @@
+
+package org.forester.msa_compactor;
+
+import org.forester.util.ForesterUtil;
+
+public final class GapContribution implements Comparable<GapContribution> {
+
+    private final String _id;
+    private double       _value;
+
+    GapContribution( final String id ) {
+        if ( ForesterUtil.isEmpty( id ) ) {
+            throw new IllegalArgumentException( "id is empty or null" );
+        }
+        _id = id;
+        _value = 0;
+    }
+
+    final String getId() {
+        return _id;
+    }
+
+    final double getValue() {
+        return _value;
+    }
+
+    final void addToValue( final double v ) {
+        if ( v < 0 ) {
+            throw new IllegalArgumentException( "cannot add negative value" );
+        }
+        _value += v;
+    }
+
+    final void divideValue( final double d ) {
+        if ( d <= 0 ) {
+            throw new IllegalArgumentException( "attempt to divide by non-positive value" );
+        }
+        _value /= d;
+    }
+
+    @Override
+    public int compareTo( final GapContribution o ) {
+        if ( getValue() < o.getValue() ) {
+            return 1;
+        }
+        else if ( getValue() > o.getValue() ) {
+            return -1;
+        }
+        return 0;
+    }
+}
@@ -1,12 +1,11 @@
 
-package org.forester.msa;
+package org.forester.msa_compactor;
 
 import java.io.File;
 import java.io.IOException;
 import java.io.Writer;
 import java.math.RoundingMode;
 import java.text.DecimalFormat;
-import java.text.DecimalFormatSymbols;
 import java.text.NumberFormat;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -15,7 +14,11 @@ import java.util.List;
 import java.util.SortedSet;
 import java.util.TreeSet;
 
+import org.forester.msa.Mafft;
+import org.forester.msa.Msa;
 import org.forester.msa.Msa.MSA_FORMAT;
+import org.forester.msa.MsaInferrer;
+import org.forester.msa.MsaMethods;
 import org.forester.sequence.Sequence;
 import org.forester.util.BasicDescriptiveStatistics;
 import org.forester.util.DescriptiveStatistics;
@@ -24,10 +27,12 @@ import org.forester.util.ForesterUtil;
 public class MsaCompactor {
 
     final private static NumberFormat NF_3    = new DecimalFormat( "#.###" );
+    final private static NumberFormat NF_4    = new DecimalFormat( "#.####" );
     private static final boolean      VERBOSE = true;
     private Msa                       _msa;
     private final SortedSet<String>   _removed_seq_ids;
     static {
+        NF_4.setRoundingMode( RoundingMode.HALF_UP );
         NF_3.setRoundingMode( RoundingMode.HALF_UP );
     }
 
@@ -51,20 +56,76 @@ public class MsaCompactor {
                   format );
     }
 
-    private final DescriptiveStatistics[] calcGapContribtions() {
+    final int calcNonGapResidues( final Sequence seq ) {
+        int ng = 0;
+        for( int i = 0; i < seq.getLength(); ++i ) {
+            if ( !seq.isGapAt( i ) ) {
+                ++ng;
+            }
+        }
+        return ng;
+    }
+
+    private final DescriptiveStatistics[] calcGapContribtionsX( final boolean normalize_for_effective_seq_length ) {
         final double gappiness[] = calcGappiness();
         final DescriptiveStatistics stats[] = new DescriptiveStatistics[ _msa.getNumberOfSequences() ];
         for( int row = 0; row < _msa.getNumberOfSequences(); ++row ) {
             stats[ row ] = new BasicDescriptiveStatistics( _msa.getIdentifier( row ) );
+            final double l = calculateEffectiveLengthRatio( row );
             for( int col = 0; col < _msa.getLength(); ++col ) {
-                if ( _msa.getResidueAt( row, col ) != Sequence.GAP ) {
-                    stats[ row ].addValue( gappiness[ col ] );
+                if ( !_msa.isGapAt( row, col ) ) {
+                    if ( normalize_for_effective_seq_length ) {
+                        stats[ row ].addValue( gappiness[ col ] / l );
+                    }
+                    else {
+                        stats[ row ].addValue( gappiness[ col ] );
+                    }
                 }
             }
         }
         return stats;
     }
 
+    private final GapContribution[] calcGapContribtions( final boolean normalize_for_effective_seq_length ) {
+        final double gappiness[] = calcGappiness();
+        final GapContribution stats[] = new GapContribution[ _msa.getNumberOfSequences() ];
+        for( int row = 0; row < _msa.getNumberOfSequences(); ++row ) {
+            stats[ row ] = new GapContribution( _msa.getIdentifier( row ) );
+            for( int col = 0; col < _msa.getLength(); ++col ) {
+                if ( !_msa.isGapAt( row, col ) ) {
+                    stats[ row ].addToValue( gappiness[ col ] );
+                }
+            }
+            if ( normalize_for_effective_seq_length ) {
+                stats[ row ].divideValue( calculateEffectiveLengthRatio( row ) );
+            }
+            else {
+                // 
+            }
+        }
+        return stats;
+    }
+
+    final private GapContribution[] calcGapContribtionsStats( final boolean norm ) {
+        final GapContribution stats[] = calcGapContribtions( norm );
+        Arrays.sort( stats );
+        for( final GapContribution stat : stats ) {
+            final StringBuilder sb = new StringBuilder();
+            sb.append( stat.getId() );
+            sb.append( "\t" );
+            sb.append( NF_4.format( stat.getValue() ) );
+            sb.append( "\t" );
+            //            sb.append( NF_4.format( stat.median() ) );
+            //            sb.append( "\t" );
+            //            sb.append( NF_4.format( stat.getMin() ) );
+            //            sb.append( "\t" );
+            //            sb.append( NF_4.format( stat.getMax() ) );
+            //sb.append( "\t" );
+            System.out.println( sb );
+        }
+        return stats;
+    }
+
     private final double[] calcGappiness() {
         final int l = _msa.getLength();
         final double gappiness[] = new double[ l ];
@@ -75,37 +136,17 @@ public class MsaCompactor {
         return gappiness;
     }
 
-    final private DescriptiveStatistics[] calcStats() {
-        final DecimalFormatSymbols dfs = new DecimalFormatSymbols();
-        dfs.setDecimalSeparator( '.' );
-        final NumberFormat f = new DecimalFormat( "#.####", dfs );
-        f.setRoundingMode( RoundingMode.HALF_UP );
-        final DescriptiveStatistics stats[] = calcGapContribtions();
-        Arrays.sort( stats, new DescriptiveStatisticsComparator( false, SORT_BY.MEAN ) );
-        for( final DescriptiveStatistics stat : stats ) {
-            final StringBuilder sb = new StringBuilder();
-            sb.append( stat.getDescription() );
-            sb.append( "\t" );
-            sb.append( f.format( stat.arithmeticMean() ) );
-            sb.append( "\t" );
-            sb.append( f.format( stat.median() ) );
-            sb.append( "\t" );
-            sb.append( f.format( stat.getMin() ) );
-            sb.append( "\t" );
-            sb.append( f.format( stat.getMax() ) );
-            sb.append( "\t" );
-            System.out.println( sb );
-        }
-        return stats;
+    private double calculateEffectiveLengthRatio( final int row ) {
+        return ( double ) calcNonGapResidues( _msa.getSequence( row ) ) / _msa.getLength();
     }
 
     final private void mafft() throws IOException, InterruptedException {
         final MsaInferrer mafft = Mafft
                 .createInstance( "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft" );
         final List<String> opts = new ArrayList<String>();
-        opts.add( "--maxiterate" );
-        opts.add( "1000" );
-        opts.add( "--localpair" );
+        // opts.add( "--maxiterate" );
+        // opts.add( "1000" );
+        // opts.add( "--localpair" );
         opts.add( "--quiet" );
         _msa = mafft.infer( _msa.asSequenceList(), opts );
     }
@@ -148,7 +189,7 @@ public class MsaCompactor {
         int counter = step;
         double gr;
         do {
-            removeWorstOffenders( step, 1, false );
+            removeWorstOffenders( step, 1, false, false );
             if ( realign ) {
                 mafft();
             }
@@ -177,7 +218,7 @@ public class MsaCompactor {
         }
         int counter = step;
         while ( _msa.getLength() > length ) {
-            removeWorstOffenders( step, 1, false );
+            removeWorstOffenders( step, 1, false, false );
             if ( realign ) {
                 mafft();
             }
@@ -188,13 +229,15 @@ public class MsaCompactor {
         }
     }
 
-    final private void removeWorstOffenders( final int to_remove, final int step, final boolean realign )
-            throws IOException, InterruptedException {
-        final DescriptiveStatistics stats[] = calcStats();
+    final private void removeWorstOffenders( final int to_remove,
+                                             final int step,
+                                             final boolean realign,
+                                             final boolean norm ) throws IOException, InterruptedException {
+        final GapContribution stats[] = calcGapContribtionsStats( norm );
         final List<String> to_remove_ids = new ArrayList<String>();
         for( int j = 0; j < to_remove; ++j ) {
-            to_remove_ids.add( stats[ j ].getDescription() );
-            _removed_seq_ids.add( stats[ j ].getDescription() );
+            to_remove_ids.add( stats[ j ].getId() );
+            _removed_seq_ids.add( stats[ j ].getId() );
         }
         //TODO if verbose/interactve
         for( final String id : to_remove_ids ) {
@@ -242,10 +285,11 @@ public class MsaCompactor {
 
     public final static MsaCompactor removeWorstOffenders( final Msa msa,
                                                            final int worst_offenders_to_remove,
-                                                           final boolean realign ) throws IOException,
+                                                           final boolean realign,
+                                                           final boolean norm ) throws IOException,
             InterruptedException {
         final MsaCompactor mc = new MsaCompactor( msa );
-        mc.removeWorstOffenders( worst_offenders_to_remove, 1, realign );
+        mc.removeWorstOffenders( worst_offenders_to_remove, 1, realign, norm );
         return mc;
     }
 
index f850205..a172221 100644 (file)
@@ -155,4 +155,9 @@ public class BasicSequence implements Sequence {
     public String getMolecularSequenceAsString() {
         return new String( getMolecularSequence() );
     }
+
+    @Override
+    public boolean isGapAt( final int position ) {
+        return getResidueAt( position ) == GAP;
+    }
 }
index 3e2473a..cac6bcf 100644 (file)
@@ -49,6 +49,8 @@ public interface Sequence {
 
     public abstract char getResidueAt( final int position );
 
+    public abstract boolean isGapAt( final int position );
+
     public abstract TYPE getType();
 
     public enum TYPE {
index ac070ff..1fcf900 100644 (file)
@@ -912,16 +912,13 @@ public final class Test {
             path = "C:\\Program Files\\mafft-win\\mafft.bat";
         }
         else {
-            path = "/home/czmasek/bin/mafft";
-        }
-        if ( !MsaInferrer.isInstalled( path ) ) {
             path = "mafft";
-        }
-        if ( !MsaInferrer.isInstalled( path ) ) {
-            path = "/usr/local/bin/mafft";
-        }
-        if ( !MsaInferrer.isInstalled( path ) ) {
-            path = "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft";
+            if ( !MsaInferrer.isInstalled( path ) ) {
+                path = "/usr/bin/mafft";
+            }
+            if ( !MsaInferrer.isInstalled( path ) ) {
+                path = "/usr/local/bin/mafft";
+            }
         }
         if ( MsaInferrer.isInstalled( path ) ) {
             System.out.print( "MAFFT (external program): " );
@@ -1109,18 +1106,22 @@ public final class Test {
             final URL u = new URL( s );
             final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
             final Phylogeny[] phys = factory.create( u, new NHXParser() );
-            if ( ( phys == null ) || ( phys.length != 1 ) ) {
+            if ( ( phys == null ) || ( phys.length != 5 ) ) {
                 return false;
             }
-            if ( !phys[ 0 ].toNewHampshire().equals( "((a,b),c);" ) ) {
+            if ( !phys[ 0 ].toNewHampshire().equals( "((((A,B),C),D),(E,F));" ) ) {
                 System.out.println( phys[ 0 ].toNewHampshire() );
                 return false;
             }
+            if ( !phys[ 1 ].toNewHampshire().equals( "((1,2,3),(4,5,6),(7,8,9));" ) ) {
+                System.out.println( phys[ 1 ].toNewHampshire() );
+                return false;
+            }
             final Phylogeny[] phys2 = factory.create( u.openStream(), new NHXParser() );
-            if ( ( phys2 == null ) || ( phys2.length != 1 ) ) {
+            if ( ( phys2 == null ) || ( phys2.length != 5 ) ) {
                 return false;
             }
-            if ( !phys2[ 0 ].toNewHampshire().equals( "((a,b),c);" ) ) {
+            if ( !phys2[ 0 ].toNewHampshire().equals( "((((A,B),C),D),(E,F));" ) ) {
                 System.out.println( phys2[ 0 ].toNewHampshire() );
                 return false;
             }
@@ -1131,42 +1132,30 @@ public final class Test {
             if ( !p.hasNext() ) {
                 return false;
             }
-            if ( !p.next().toNewHampshire().equals( "((a,b),c);" ) ) {
+            if ( !p.next().toNewHampshire().equals( "((((A,B),C),D),(E,F));" ) ) {
                 return false;
             }
-            if ( p.hasNext() ) {
-                return false;
-            }
-            if ( p.next() != null ) {
-                return false;
-            }
-            if ( p.hasNext() ) {
-                return false;
-            }
-            if ( p.next() != null ) {
+            if ( !p.hasNext() ) {
                 return false;
             }
             p.reset();
             if ( !p.hasNext() ) {
                 return false;
             }
-            if ( !p.next().toNewHampshire().equals( "((a,b),c);" ) ) {
-                return false;
-            }
-            if ( p.hasNext() ) {
+            if ( !p.next().toNewHampshire().equals( "((((A,B),C),D),(E,F));" ) ) {
                 return false;
             }
-            if ( p.next() != null ) {
+            if ( !p.next().toNewHampshire().equals( "((1,2,3),(4,5,6),(7,8,9));" ) ) {
                 return false;
             }
-            if ( p.hasNext() ) {
+            p.reset();
+            if ( !p.hasNext() ) {
                 return false;
             }
-            if ( p.next() != null ) {
+            if ( !p.next().toNewHampshire().equals( "((((A,B),C),D),(E,F));" ) ) {
                 return false;
             }
-            p.reset();
-            if ( !p.hasNext() ) {
+            if ( !p.next().toNewHampshire().equals( "((1,2,3),(4,5,6),(7,8,9));" ) ) {
                 return false;
             }
         }