From: cmzmasek@gmail.com Date: Thu, 27 Feb 2014 02:27:58 +0000 (+0000) Subject: inprogress X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=d261d45fb4105fb12b0158d05311394b14ec014c;p=jalview.git inprogress --- diff --git a/forester/java/src/org/forester/application/msa_compactor.java b/forester/java/src/org/forester/application/msa_compactor.java index 9ed5e20..c385ca9 100644 --- a/forester/java/src/org/forester/application/msa_compactor.java +++ b/forester/java/src/org/forester/application/msa_compactor.java @@ -10,26 +10,27 @@ import org.forester.io.parsers.FastaParser; import org.forester.io.parsers.GeneralMsaParser; import org.forester.msa.Msa; import org.forester.msa.Msa.MSA_FORMAT; -import org.forester.msa.MsaCompactor; +import org.forester.msa_compactor.MsaCompactor; import org.forester.msa.MsaMethods; import org.forester.util.CommandLineArguments; import org.forester.util.ForesterUtil; public class msa_compactor { - final static private String HELP_OPTION_1 = "help"; - final static private String HELP_OPTION_2 = "h"; - final static private String REMOVE_WORST_OFFENDERS_OPTION = "w"; - final static private String AV_GAPINESS_OPTION = "a"; - final static private String STEP_OPTION = "s"; - final static private String LENGTH_OPTION = "l"; - final static private String REALIGN_OPTION = "r"; - final static private String PRG_NAME = "msa_compactor"; - final static private String PRG_DESC = "multiple sequnce aligment compactor"; - final static private String PRG_VERSION = "0.01"; - final static private String PRG_DATE = "140221"; - final static private String E_MAIL = "phylosoft@gmail.com"; - final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"; + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String REMOVE_WORST_OFFENDERS_OPTION = "w"; + final static private String AV_GAPINESS_OPTION = "a"; + final static private String STEP_OPTION = "s"; + final static private String LENGTH_OPTION = "l"; + final static private String REALIGN_OPTION = "r"; + final static private String DO_NOT_NORMALIZE_FOR_EFF_LENGTH_OPTION = "nn"; + final static private String PRG_NAME = "msa_compactor"; + final static private String PRG_DESC = "multiple sequnce aligment compactor"; + final static private String PRG_VERSION = "0.01"; + final static private String PRG_DATE = "140221"; + final static private String E_MAIL = "phylosoft@gmail.com"; + final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"; public static void main( final String args[] ) { try { @@ -45,11 +46,13 @@ public class msa_compactor { int length = -1; int step = 1; boolean realign = false; + boolean norm = true; final List allowed_options = new ArrayList(); allowed_options.add( REMOVE_WORST_OFFENDERS_OPTION ); allowed_options.add( AV_GAPINESS_OPTION ); allowed_options.add( LENGTH_OPTION ); allowed_options.add( REALIGN_OPTION ); + allowed_options.add( DO_NOT_NORMALIZE_FOR_EFF_LENGTH_OPTION ); allowed_options.add( STEP_OPTION ); final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); if ( dissallowed_options.length() > 0 ) { @@ -70,6 +73,9 @@ public class msa_compactor { if ( cla.isOptionSet( REALIGN_OPTION ) ) { realign = true; } + if ( cla.isOptionSet( DO_NOT_NORMALIZE_FOR_EFF_LENGTH_OPTION ) ) { + norm = false; + } // else if ( cla.isOptionSet( STEP_OPTION ) && cla.isOptionSet( WINDOW_OPTION ) ) { // step = cla.getOptionValueAsInt( STEP_OPTION ); // window = cla.getOptionValueAsInt( WINDOW_OPTION ); @@ -88,7 +94,7 @@ public class msa_compactor { } MsaCompactor mc = null; if ( worst_remove > 0 ) { - mc = MsaCompactor.removeWorstOffenders( msa, worst_remove, realign ); + mc = MsaCompactor.removeWorstOffenders( msa, worst_remove, realign, norm ); } else if ( av > 0 ) { mc = MsaCompactor.reduceGapAverage( msa, av, step, realign, out, 50 ); diff --git a/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java b/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java index 72c95d3..55f0285 100644 --- a/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java +++ b/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java @@ -678,7 +678,7 @@ public final class NHXParser implements PhylogenyParser, IteratingPhylogenyParse node_to_annotate.getNodeData().getSequence().setName( s.substring( 3 ) ); } else if ( s.indexOf( '=' ) < 0 ) { - if ( node_to_annotate.getDistanceToParent() != PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT + if ( ( node_to_annotate.getDistanceToParent() != PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT ) && !allow_errors_in_distance_to_parent ) { throw new NHXFormatException( "error in NHX formatted data: more than one distance to parent:" + "\"" + s + "\"" ); diff --git a/forester/java/src/org/forester/msa/BasicMsa.java b/forester/java/src/org/forester/msa/BasicMsa.java index 5fc2590..daeef6b 100644 --- a/forester/java/src/org/forester/msa/BasicMsa.java +++ b/forester/java/src/org/forester/msa/BasicMsa.java @@ -218,4 +218,9 @@ public class BasicMsa implements Msa { } return column; } + + @Override + public boolean isGapAt( final int row, final int col ) { + return getResidueAt( row, col ) == Sequence.GAP; + } } diff --git a/forester/java/src/org/forester/msa/Msa.java b/forester/java/src/org/forester/msa/Msa.java index 1eb9533..648006a 100644 --- a/forester/java/src/org/forester/msa/Msa.java +++ b/forester/java/src/org/forester/msa/Msa.java @@ -48,6 +48,8 @@ public interface Msa { public char getResidueAt( int row, int col ); + public boolean isGapAt( int row, int col ); + public List getColumnAt( int col ); public Sequence getSequence( final String id ); diff --git a/forester/java/src/org/forester/msa/MsaMethods.java b/forester/java/src/org/forester/msa/MsaMethods.java index ff8342c..b05e837 100644 --- a/forester/java/src/org/forester/msa/MsaMethods.java +++ b/forester/java/src/org/forester/msa/MsaMethods.java @@ -65,7 +65,7 @@ public final class MsaMethods { public static int calcGapSumPerColumn( final Msa msa, final int col ) { int gap_rows = 0; for( int j = 0; j < msa.getNumberOfSequences(); ++j ) { - if ( msa.getResidueAt( j, col ) == Sequence.GAP ) { + if ( msa.isGapAt( j, col ) ) { gap_rows++; } } diff --git a/forester/java/src/org/forester/msa_compactor/GapContribution.java b/forester/java/src/org/forester/msa_compactor/GapContribution.java new file mode 100644 index 0000000..b7b3403 --- /dev/null +++ b/forester/java/src/org/forester/msa_compactor/GapContribution.java @@ -0,0 +1,51 @@ + +package org.forester.msa_compactor; + +import org.forester.util.ForesterUtil; + +public final class GapContribution implements Comparable { + + private final String _id; + private double _value; + + GapContribution( final String id ) { + if ( ForesterUtil.isEmpty( id ) ) { + throw new IllegalArgumentException( "id is empty or null" ); + } + _id = id; + _value = 0; + } + + final String getId() { + return _id; + } + + final double getValue() { + return _value; + } + + final void addToValue( final double v ) { + if ( v < 0 ) { + throw new IllegalArgumentException( "cannot add negative value" ); + } + _value += v; + } + + final void divideValue( final double d ) { + if ( d <= 0 ) { + throw new IllegalArgumentException( "attempt to divide by non-positive value" ); + } + _value /= d; + } + + @Override + public int compareTo( final GapContribution o ) { + if ( getValue() < o.getValue() ) { + return 1; + } + else if ( getValue() > o.getValue() ) { + return -1; + } + return 0; + } +} diff --git a/forester/java/src/org/forester/msa/MsaCompactor.java b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java similarity index 73% rename from forester/java/src/org/forester/msa/MsaCompactor.java rename to forester/java/src/org/forester/msa_compactor/MsaCompactor.java index f6b30b0..0356dec 100644 --- a/forester/java/src/org/forester/msa/MsaCompactor.java +++ b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java @@ -1,12 +1,11 @@ -package org.forester.msa; +package org.forester.msa_compactor; import java.io.File; import java.io.IOException; import java.io.Writer; import java.math.RoundingMode; import java.text.DecimalFormat; -import java.text.DecimalFormatSymbols; import java.text.NumberFormat; import java.util.ArrayList; import java.util.Arrays; @@ -15,7 +14,11 @@ import java.util.List; import java.util.SortedSet; import java.util.TreeSet; +import org.forester.msa.Mafft; +import org.forester.msa.Msa; import org.forester.msa.Msa.MSA_FORMAT; +import org.forester.msa.MsaInferrer; +import org.forester.msa.MsaMethods; import org.forester.sequence.Sequence; import org.forester.util.BasicDescriptiveStatistics; import org.forester.util.DescriptiveStatistics; @@ -24,10 +27,12 @@ import org.forester.util.ForesterUtil; public class MsaCompactor { final private static NumberFormat NF_3 = new DecimalFormat( "#.###" ); + final private static NumberFormat NF_4 = new DecimalFormat( "#.####" ); private static final boolean VERBOSE = true; private Msa _msa; private final SortedSet _removed_seq_ids; static { + NF_4.setRoundingMode( RoundingMode.HALF_UP ); NF_3.setRoundingMode( RoundingMode.HALF_UP ); } @@ -51,20 +56,76 @@ public class MsaCompactor { format ); } - private final DescriptiveStatistics[] calcGapContribtions() { + final int calcNonGapResidues( final Sequence seq ) { + int ng = 0; + for( int i = 0; i < seq.getLength(); ++i ) { + if ( !seq.isGapAt( i ) ) { + ++ng; + } + } + return ng; + } + + private final DescriptiveStatistics[] calcGapContribtionsX( final boolean normalize_for_effective_seq_length ) { final double gappiness[] = calcGappiness(); final DescriptiveStatistics stats[] = new DescriptiveStatistics[ _msa.getNumberOfSequences() ]; for( int row = 0; row < _msa.getNumberOfSequences(); ++row ) { stats[ row ] = new BasicDescriptiveStatistics( _msa.getIdentifier( row ) ); + final double l = calculateEffectiveLengthRatio( row ); for( int col = 0; col < _msa.getLength(); ++col ) { - if ( _msa.getResidueAt( row, col ) != Sequence.GAP ) { - stats[ row ].addValue( gappiness[ col ] ); + if ( !_msa.isGapAt( row, col ) ) { + if ( normalize_for_effective_seq_length ) { + stats[ row ].addValue( gappiness[ col ] / l ); + } + else { + stats[ row ].addValue( gappiness[ col ] ); + } } } } return stats; } + private final GapContribution[] calcGapContribtions( final boolean normalize_for_effective_seq_length ) { + final double gappiness[] = calcGappiness(); + final GapContribution stats[] = new GapContribution[ _msa.getNumberOfSequences() ]; + for( int row = 0; row < _msa.getNumberOfSequences(); ++row ) { + stats[ row ] = new GapContribution( _msa.getIdentifier( row ) ); + for( int col = 0; col < _msa.getLength(); ++col ) { + if ( !_msa.isGapAt( row, col ) ) { + stats[ row ].addToValue( gappiness[ col ] ); + } + } + if ( normalize_for_effective_seq_length ) { + stats[ row ].divideValue( calculateEffectiveLengthRatio( row ) ); + } + else { + // + } + } + return stats; + } + + final private GapContribution[] calcGapContribtionsStats( final boolean norm ) { + final GapContribution stats[] = calcGapContribtions( norm ); + Arrays.sort( stats ); + for( final GapContribution stat : stats ) { + final StringBuilder sb = new StringBuilder(); + sb.append( stat.getId() ); + sb.append( "\t" ); + sb.append( NF_4.format( stat.getValue() ) ); + sb.append( "\t" ); + // sb.append( NF_4.format( stat.median() ) ); + // sb.append( "\t" ); + // sb.append( NF_4.format( stat.getMin() ) ); + // sb.append( "\t" ); + // sb.append( NF_4.format( stat.getMax() ) ); + //sb.append( "\t" ); + System.out.println( sb ); + } + return stats; + } + private final double[] calcGappiness() { final int l = _msa.getLength(); final double gappiness[] = new double[ l ]; @@ -75,37 +136,17 @@ public class MsaCompactor { return gappiness; } - final private DescriptiveStatistics[] calcStats() { - final DecimalFormatSymbols dfs = new DecimalFormatSymbols(); - dfs.setDecimalSeparator( '.' ); - final NumberFormat f = new DecimalFormat( "#.####", dfs ); - f.setRoundingMode( RoundingMode.HALF_UP ); - final DescriptiveStatistics stats[] = calcGapContribtions(); - Arrays.sort( stats, new DescriptiveStatisticsComparator( false, SORT_BY.MEAN ) ); - for( final DescriptiveStatistics stat : stats ) { - final StringBuilder sb = new StringBuilder(); - sb.append( stat.getDescription() ); - sb.append( "\t" ); - sb.append( f.format( stat.arithmeticMean() ) ); - sb.append( "\t" ); - sb.append( f.format( stat.median() ) ); - sb.append( "\t" ); - sb.append( f.format( stat.getMin() ) ); - sb.append( "\t" ); - sb.append( f.format( stat.getMax() ) ); - sb.append( "\t" ); - System.out.println( sb ); - } - return stats; + private double calculateEffectiveLengthRatio( final int row ) { + return ( double ) calcNonGapResidues( _msa.getSequence( row ) ) / _msa.getLength(); } final private void mafft() throws IOException, InterruptedException { final MsaInferrer mafft = Mafft .createInstance( "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft" ); final List opts = new ArrayList(); - opts.add( "--maxiterate" ); - opts.add( "1000" ); - opts.add( "--localpair" ); + // opts.add( "--maxiterate" ); + // opts.add( "1000" ); + // opts.add( "--localpair" ); opts.add( "--quiet" ); _msa = mafft.infer( _msa.asSequenceList(), opts ); } @@ -148,7 +189,7 @@ public class MsaCompactor { int counter = step; double gr; do { - removeWorstOffenders( step, 1, false ); + removeWorstOffenders( step, 1, false, false ); if ( realign ) { mafft(); } @@ -177,7 +218,7 @@ public class MsaCompactor { } int counter = step; while ( _msa.getLength() > length ) { - removeWorstOffenders( step, 1, false ); + removeWorstOffenders( step, 1, false, false ); if ( realign ) { mafft(); } @@ -188,13 +229,15 @@ public class MsaCompactor { } } - final private void removeWorstOffenders( final int to_remove, final int step, final boolean realign ) - throws IOException, InterruptedException { - final DescriptiveStatistics stats[] = calcStats(); + final private void removeWorstOffenders( final int to_remove, + final int step, + final boolean realign, + final boolean norm ) throws IOException, InterruptedException { + final GapContribution stats[] = calcGapContribtionsStats( norm ); final List to_remove_ids = new ArrayList(); for( int j = 0; j < to_remove; ++j ) { - to_remove_ids.add( stats[ j ].getDescription() ); - _removed_seq_ids.add( stats[ j ].getDescription() ); + to_remove_ids.add( stats[ j ].getId() ); + _removed_seq_ids.add( stats[ j ].getId() ); } //TODO if verbose/interactve for( final String id : to_remove_ids ) { @@ -242,10 +285,11 @@ public class MsaCompactor { public final static MsaCompactor removeWorstOffenders( final Msa msa, final int worst_offenders_to_remove, - final boolean realign ) throws IOException, + final boolean realign, + final boolean norm ) throws IOException, InterruptedException { final MsaCompactor mc = new MsaCompactor( msa ); - mc.removeWorstOffenders( worst_offenders_to_remove, 1, realign ); + mc.removeWorstOffenders( worst_offenders_to_remove, 1, realign, norm ); return mc; } diff --git a/forester/java/src/org/forester/sequence/BasicSequence.java b/forester/java/src/org/forester/sequence/BasicSequence.java index f850205..a172221 100644 --- a/forester/java/src/org/forester/sequence/BasicSequence.java +++ b/forester/java/src/org/forester/sequence/BasicSequence.java @@ -155,4 +155,9 @@ public class BasicSequence implements Sequence { public String getMolecularSequenceAsString() { return new String( getMolecularSequence() ); } + + @Override + public boolean isGapAt( final int position ) { + return getResidueAt( position ) == GAP; + } } diff --git a/forester/java/src/org/forester/sequence/Sequence.java b/forester/java/src/org/forester/sequence/Sequence.java index 3e2473a..cac6bcf 100644 --- a/forester/java/src/org/forester/sequence/Sequence.java +++ b/forester/java/src/org/forester/sequence/Sequence.java @@ -49,6 +49,8 @@ public interface Sequence { public abstract char getResidueAt( final int position ); + public abstract boolean isGapAt( final int position ); + public abstract TYPE getType(); public enum TYPE { diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index ac070ff..1fcf900 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -912,16 +912,13 @@ public final class Test { path = "C:\\Program Files\\mafft-win\\mafft.bat"; } else { - path = "/home/czmasek/bin/mafft"; - } - if ( !MsaInferrer.isInstalled( path ) ) { path = "mafft"; - } - if ( !MsaInferrer.isInstalled( path ) ) { - path = "/usr/local/bin/mafft"; - } - if ( !MsaInferrer.isInstalled( path ) ) { - path = "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft"; + if ( !MsaInferrer.isInstalled( path ) ) { + path = "/usr/bin/mafft"; + } + if ( !MsaInferrer.isInstalled( path ) ) { + path = "/usr/local/bin/mafft"; + } } if ( MsaInferrer.isInstalled( path ) ) { System.out.print( "MAFFT (external program): " ); @@ -1109,18 +1106,22 @@ public final class Test { final URL u = new URL( s ); final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); final Phylogeny[] phys = factory.create( u, new NHXParser() ); - if ( ( phys == null ) || ( phys.length != 1 ) ) { + if ( ( phys == null ) || ( phys.length != 5 ) ) { return false; } - if ( !phys[ 0 ].toNewHampshire().equals( "((a,b),c);" ) ) { + if ( !phys[ 0 ].toNewHampshire().equals( "((((A,B),C),D),(E,F));" ) ) { System.out.println( phys[ 0 ].toNewHampshire() ); return false; } + if ( !phys[ 1 ].toNewHampshire().equals( "((1,2,3),(4,5,6),(7,8,9));" ) ) { + System.out.println( phys[ 1 ].toNewHampshire() ); + return false; + } final Phylogeny[] phys2 = factory.create( u.openStream(), new NHXParser() ); - if ( ( phys2 == null ) || ( phys2.length != 1 ) ) { + if ( ( phys2 == null ) || ( phys2.length != 5 ) ) { return false; } - if ( !phys2[ 0 ].toNewHampshire().equals( "((a,b),c);" ) ) { + if ( !phys2[ 0 ].toNewHampshire().equals( "((((A,B),C),D),(E,F));" ) ) { System.out.println( phys2[ 0 ].toNewHampshire() ); return false; } @@ -1131,42 +1132,30 @@ public final class Test { if ( !p.hasNext() ) { return false; } - if ( !p.next().toNewHampshire().equals( "((a,b),c);" ) ) { + if ( !p.next().toNewHampshire().equals( "((((A,B),C),D),(E,F));" ) ) { return false; } - if ( p.hasNext() ) { - return false; - } - if ( p.next() != null ) { - return false; - } - if ( p.hasNext() ) { - return false; - } - if ( p.next() != null ) { + if ( !p.hasNext() ) { return false; } p.reset(); if ( !p.hasNext() ) { return false; } - if ( !p.next().toNewHampshire().equals( "((a,b),c);" ) ) { - return false; - } - if ( p.hasNext() ) { + if ( !p.next().toNewHampshire().equals( "((((A,B),C),D),(E,F));" ) ) { return false; } - if ( p.next() != null ) { + if ( !p.next().toNewHampshire().equals( "((1,2,3),(4,5,6),(7,8,9));" ) ) { return false; } - if ( p.hasNext() ) { + p.reset(); + if ( !p.hasNext() ) { return false; } - if ( p.next() != null ) { + if ( !p.next().toNewHampshire().equals( "((((A,B),C),D),(E,F));" ) ) { return false; } - p.reset(); - if ( !p.hasNext() ) { + if ( !p.next().toNewHampshire().equals( "((1,2,3),(4,5,6),(7,8,9));" ) ) { return false; } }