import org.forester.io.parsers.GeneralMsaParser;
import org.forester.msa.Msa;
import org.forester.msa.Msa.MSA_FORMAT;
-import org.forester.msa.MsaCompactor;
+import org.forester.msa_compactor.MsaCompactor;
import org.forester.msa.MsaMethods;
import org.forester.util.CommandLineArguments;
import org.forester.util.ForesterUtil;
public class msa_compactor {
- final static private String HELP_OPTION_1 = "help";
- final static private String HELP_OPTION_2 = "h";
- final static private String REMOVE_WORST_OFFENDERS_OPTION = "w";
- final static private String AV_GAPINESS_OPTION = "a";
- final static private String STEP_OPTION = "s";
- final static private String LENGTH_OPTION = "l";
- final static private String REALIGN_OPTION = "r";
- final static private String PRG_NAME = "msa_compactor";
- final static private String PRG_DESC = "multiple sequnce aligment compactor";
- final static private String PRG_VERSION = "0.01";
- final static private String PRG_DATE = "140221";
- final static private String E_MAIL = "phylosoft@gmail.com";
- final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester";
+ final static private String HELP_OPTION_1 = "help";
+ final static private String HELP_OPTION_2 = "h";
+ final static private String REMOVE_WORST_OFFENDERS_OPTION = "w";
+ final static private String AV_GAPINESS_OPTION = "a";
+ final static private String STEP_OPTION = "s";
+ final static private String LENGTH_OPTION = "l";
+ final static private String REALIGN_OPTION = "r";
+ final static private String DO_NOT_NORMALIZE_FOR_EFF_LENGTH_OPTION = "nn";
+ final static private String PRG_NAME = "msa_compactor";
+ final static private String PRG_DESC = "multiple sequnce aligment compactor";
+ final static private String PRG_VERSION = "0.01";
+ final static private String PRG_DATE = "140221";
+ final static private String E_MAIL = "phylosoft@gmail.com";
+ final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester";
public static void main( final String args[] ) {
try {
int length = -1;
int step = 1;
boolean realign = false;
+ boolean norm = true;
final List<String> allowed_options = new ArrayList<String>();
allowed_options.add( REMOVE_WORST_OFFENDERS_OPTION );
allowed_options.add( AV_GAPINESS_OPTION );
allowed_options.add( LENGTH_OPTION );
allowed_options.add( REALIGN_OPTION );
+ allowed_options.add( DO_NOT_NORMALIZE_FOR_EFF_LENGTH_OPTION );
allowed_options.add( STEP_OPTION );
final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
if ( dissallowed_options.length() > 0 ) {
if ( cla.isOptionSet( REALIGN_OPTION ) ) {
realign = true;
}
+ if ( cla.isOptionSet( DO_NOT_NORMALIZE_FOR_EFF_LENGTH_OPTION ) ) {
+ norm = false;
+ }
// else if ( cla.isOptionSet( STEP_OPTION ) && cla.isOptionSet( WINDOW_OPTION ) ) {
// step = cla.getOptionValueAsInt( STEP_OPTION );
// window = cla.getOptionValueAsInt( WINDOW_OPTION );
}
MsaCompactor mc = null;
if ( worst_remove > 0 ) {
- mc = MsaCompactor.removeWorstOffenders( msa, worst_remove, realign );
+ mc = MsaCompactor.removeWorstOffenders( msa, worst_remove, realign, norm );
}
else if ( av > 0 ) {
mc = MsaCompactor.reduceGapAverage( msa, av, step, realign, out, 50 );
node_to_annotate.getNodeData().getSequence().setName( s.substring( 3 ) );
}
else if ( s.indexOf( '=' ) < 0 ) {
- if ( node_to_annotate.getDistanceToParent() != PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT
+ if ( ( node_to_annotate.getDistanceToParent() != PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT )
&& !allow_errors_in_distance_to_parent ) {
throw new NHXFormatException( "error in NHX formatted data: more than one distance to parent:"
+ "\"" + s + "\"" );
}
return column;
}
+
+ @Override
+ public boolean isGapAt( final int row, final int col ) {
+ return getResidueAt( row, col ) == Sequence.GAP;
+ }
}
public char getResidueAt( int row, int col );
+ public boolean isGapAt( int row, int col );
+
public List<Character> getColumnAt( int col );
public Sequence getSequence( final String id );
public static int calcGapSumPerColumn( final Msa msa, final int col ) {
int gap_rows = 0;
for( int j = 0; j < msa.getNumberOfSequences(); ++j ) {
- if ( msa.getResidueAt( j, col ) == Sequence.GAP ) {
+ if ( msa.isGapAt( j, col ) ) {
gap_rows++;
}
}
--- /dev/null
+
+package org.forester.msa_compactor;
+
+import org.forester.util.ForesterUtil;
+
+public final class GapContribution implements Comparable<GapContribution> {
+
+ private final String _id;
+ private double _value;
+
+ GapContribution( final String id ) {
+ if ( ForesterUtil.isEmpty( id ) ) {
+ throw new IllegalArgumentException( "id is empty or null" );
+ }
+ _id = id;
+ _value = 0;
+ }
+
+ final String getId() {
+ return _id;
+ }
+
+ final double getValue() {
+ return _value;
+ }
+
+ final void addToValue( final double v ) {
+ if ( v < 0 ) {
+ throw new IllegalArgumentException( "cannot add negative value" );
+ }
+ _value += v;
+ }
+
+ final void divideValue( final double d ) {
+ if ( d <= 0 ) {
+ throw new IllegalArgumentException( "attempt to divide by non-positive value" );
+ }
+ _value /= d;
+ }
+
+ @Override
+ public int compareTo( final GapContribution o ) {
+ if ( getValue() < o.getValue() ) {
+ return 1;
+ }
+ else if ( getValue() > o.getValue() ) {
+ return -1;
+ }
+ return 0;
+ }
+}
-package org.forester.msa;
+package org.forester.msa_compactor;
import java.io.File;
import java.io.IOException;
import java.io.Writer;
import java.math.RoundingMode;
import java.text.DecimalFormat;
-import java.text.DecimalFormatSymbols;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.SortedSet;
import java.util.TreeSet;
+import org.forester.msa.Mafft;
+import org.forester.msa.Msa;
import org.forester.msa.Msa.MSA_FORMAT;
+import org.forester.msa.MsaInferrer;
+import org.forester.msa.MsaMethods;
import org.forester.sequence.Sequence;
import org.forester.util.BasicDescriptiveStatistics;
import org.forester.util.DescriptiveStatistics;
public class MsaCompactor {
final private static NumberFormat NF_3 = new DecimalFormat( "#.###" );
+ final private static NumberFormat NF_4 = new DecimalFormat( "#.####" );
private static final boolean VERBOSE = true;
private Msa _msa;
private final SortedSet<String> _removed_seq_ids;
static {
+ NF_4.setRoundingMode( RoundingMode.HALF_UP );
NF_3.setRoundingMode( RoundingMode.HALF_UP );
}
format );
}
- private final DescriptiveStatistics[] calcGapContribtions() {
+ final int calcNonGapResidues( final Sequence seq ) {
+ int ng = 0;
+ for( int i = 0; i < seq.getLength(); ++i ) {
+ if ( !seq.isGapAt( i ) ) {
+ ++ng;
+ }
+ }
+ return ng;
+ }
+
+ private final DescriptiveStatistics[] calcGapContribtionsX( final boolean normalize_for_effective_seq_length ) {
final double gappiness[] = calcGappiness();
final DescriptiveStatistics stats[] = new DescriptiveStatistics[ _msa.getNumberOfSequences() ];
for( int row = 0; row < _msa.getNumberOfSequences(); ++row ) {
stats[ row ] = new BasicDescriptiveStatistics( _msa.getIdentifier( row ) );
+ final double l = calculateEffectiveLengthRatio( row );
for( int col = 0; col < _msa.getLength(); ++col ) {
- if ( _msa.getResidueAt( row, col ) != Sequence.GAP ) {
- stats[ row ].addValue( gappiness[ col ] );
+ if ( !_msa.isGapAt( row, col ) ) {
+ if ( normalize_for_effective_seq_length ) {
+ stats[ row ].addValue( gappiness[ col ] / l );
+ }
+ else {
+ stats[ row ].addValue( gappiness[ col ] );
+ }
}
}
}
return stats;
}
+ private final GapContribution[] calcGapContribtions( final boolean normalize_for_effective_seq_length ) {
+ final double gappiness[] = calcGappiness();
+ final GapContribution stats[] = new GapContribution[ _msa.getNumberOfSequences() ];
+ for( int row = 0; row < _msa.getNumberOfSequences(); ++row ) {
+ stats[ row ] = new GapContribution( _msa.getIdentifier( row ) );
+ for( int col = 0; col < _msa.getLength(); ++col ) {
+ if ( !_msa.isGapAt( row, col ) ) {
+ stats[ row ].addToValue( gappiness[ col ] );
+ }
+ }
+ if ( normalize_for_effective_seq_length ) {
+ stats[ row ].divideValue( calculateEffectiveLengthRatio( row ) );
+ }
+ else {
+ //
+ }
+ }
+ return stats;
+ }
+
+ final private GapContribution[] calcGapContribtionsStats( final boolean norm ) {
+ final GapContribution stats[] = calcGapContribtions( norm );
+ Arrays.sort( stats );
+ for( final GapContribution stat : stats ) {
+ final StringBuilder sb = new StringBuilder();
+ sb.append( stat.getId() );
+ sb.append( "\t" );
+ sb.append( NF_4.format( stat.getValue() ) );
+ sb.append( "\t" );
+ // sb.append( NF_4.format( stat.median() ) );
+ // sb.append( "\t" );
+ // sb.append( NF_4.format( stat.getMin() ) );
+ // sb.append( "\t" );
+ // sb.append( NF_4.format( stat.getMax() ) );
+ //sb.append( "\t" );
+ System.out.println( sb );
+ }
+ return stats;
+ }
+
private final double[] calcGappiness() {
final int l = _msa.getLength();
final double gappiness[] = new double[ l ];
return gappiness;
}
- final private DescriptiveStatistics[] calcStats() {
- final DecimalFormatSymbols dfs = new DecimalFormatSymbols();
- dfs.setDecimalSeparator( '.' );
- final NumberFormat f = new DecimalFormat( "#.####", dfs );
- f.setRoundingMode( RoundingMode.HALF_UP );
- final DescriptiveStatistics stats[] = calcGapContribtions();
- Arrays.sort( stats, new DescriptiveStatisticsComparator( false, SORT_BY.MEAN ) );
- for( final DescriptiveStatistics stat : stats ) {
- final StringBuilder sb = new StringBuilder();
- sb.append( stat.getDescription() );
- sb.append( "\t" );
- sb.append( f.format( stat.arithmeticMean() ) );
- sb.append( "\t" );
- sb.append( f.format( stat.median() ) );
- sb.append( "\t" );
- sb.append( f.format( stat.getMin() ) );
- sb.append( "\t" );
- sb.append( f.format( stat.getMax() ) );
- sb.append( "\t" );
- System.out.println( sb );
- }
- return stats;
+ private double calculateEffectiveLengthRatio( final int row ) {
+ return ( double ) calcNonGapResidues( _msa.getSequence( row ) ) / _msa.getLength();
}
final private void mafft() throws IOException, InterruptedException {
final MsaInferrer mafft = Mafft
.createInstance( "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft" );
final List<String> opts = new ArrayList<String>();
- opts.add( "--maxiterate" );
- opts.add( "1000" );
- opts.add( "--localpair" );
+ // opts.add( "--maxiterate" );
+ // opts.add( "1000" );
+ // opts.add( "--localpair" );
opts.add( "--quiet" );
_msa = mafft.infer( _msa.asSequenceList(), opts );
}
int counter = step;
double gr;
do {
- removeWorstOffenders( step, 1, false );
+ removeWorstOffenders( step, 1, false, false );
if ( realign ) {
mafft();
}
}
int counter = step;
while ( _msa.getLength() > length ) {
- removeWorstOffenders( step, 1, false );
+ removeWorstOffenders( step, 1, false, false );
if ( realign ) {
mafft();
}
}
}
- final private void removeWorstOffenders( final int to_remove, final int step, final boolean realign )
- throws IOException, InterruptedException {
- final DescriptiveStatistics stats[] = calcStats();
+ final private void removeWorstOffenders( final int to_remove,
+ final int step,
+ final boolean realign,
+ final boolean norm ) throws IOException, InterruptedException {
+ final GapContribution stats[] = calcGapContribtionsStats( norm );
final List<String> to_remove_ids = new ArrayList<String>();
for( int j = 0; j < to_remove; ++j ) {
- to_remove_ids.add( stats[ j ].getDescription() );
- _removed_seq_ids.add( stats[ j ].getDescription() );
+ to_remove_ids.add( stats[ j ].getId() );
+ _removed_seq_ids.add( stats[ j ].getId() );
}
//TODO if verbose/interactve
for( final String id : to_remove_ids ) {
public final static MsaCompactor removeWorstOffenders( final Msa msa,
final int worst_offenders_to_remove,
- final boolean realign ) throws IOException,
+ final boolean realign,
+ final boolean norm ) throws IOException,
InterruptedException {
final MsaCompactor mc = new MsaCompactor( msa );
- mc.removeWorstOffenders( worst_offenders_to_remove, 1, realign );
+ mc.removeWorstOffenders( worst_offenders_to_remove, 1, realign, norm );
return mc;
}
public String getMolecularSequenceAsString() {
return new String( getMolecularSequence() );
}
+
+ @Override
+ public boolean isGapAt( final int position ) {
+ return getResidueAt( position ) == GAP;
+ }
}
public abstract char getResidueAt( final int position );
+ public abstract boolean isGapAt( final int position );
+
public abstract TYPE getType();
public enum TYPE {
path = "C:\\Program Files\\mafft-win\\mafft.bat";
}
else {
- path = "/home/czmasek/bin/mafft";
- }
- if ( !MsaInferrer.isInstalled( path ) ) {
path = "mafft";
- }
- if ( !MsaInferrer.isInstalled( path ) ) {
- path = "/usr/local/bin/mafft";
- }
- if ( !MsaInferrer.isInstalled( path ) ) {
- path = "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft";
+ if ( !MsaInferrer.isInstalled( path ) ) {
+ path = "/usr/bin/mafft";
+ }
+ if ( !MsaInferrer.isInstalled( path ) ) {
+ path = "/usr/local/bin/mafft";
+ }
}
if ( MsaInferrer.isInstalled( path ) ) {
System.out.print( "MAFFT (external program): " );
final URL u = new URL( s );
final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
final Phylogeny[] phys = factory.create( u, new NHXParser() );
- if ( ( phys == null ) || ( phys.length != 1 ) ) {
+ if ( ( phys == null ) || ( phys.length != 5 ) ) {
return false;
}
- if ( !phys[ 0 ].toNewHampshire().equals( "((a,b),c);" ) ) {
+ if ( !phys[ 0 ].toNewHampshire().equals( "((((A,B),C),D),(E,F));" ) ) {
System.out.println( phys[ 0 ].toNewHampshire() );
return false;
}
+ if ( !phys[ 1 ].toNewHampshire().equals( "((1,2,3),(4,5,6),(7,8,9));" ) ) {
+ System.out.println( phys[ 1 ].toNewHampshire() );
+ return false;
+ }
final Phylogeny[] phys2 = factory.create( u.openStream(), new NHXParser() );
- if ( ( phys2 == null ) || ( phys2.length != 1 ) ) {
+ if ( ( phys2 == null ) || ( phys2.length != 5 ) ) {
return false;
}
- if ( !phys2[ 0 ].toNewHampshire().equals( "((a,b),c);" ) ) {
+ if ( !phys2[ 0 ].toNewHampshire().equals( "((((A,B),C),D),(E,F));" ) ) {
System.out.println( phys2[ 0 ].toNewHampshire() );
return false;
}
if ( !p.hasNext() ) {
return false;
}
- if ( !p.next().toNewHampshire().equals( "((a,b),c);" ) ) {
+ if ( !p.next().toNewHampshire().equals( "((((A,B),C),D),(E,F));" ) ) {
return false;
}
- if ( p.hasNext() ) {
- return false;
- }
- if ( p.next() != null ) {
- return false;
- }
- if ( p.hasNext() ) {
- return false;
- }
- if ( p.next() != null ) {
+ if ( !p.hasNext() ) {
return false;
}
p.reset();
if ( !p.hasNext() ) {
return false;
}
- if ( !p.next().toNewHampshire().equals( "((a,b),c);" ) ) {
- return false;
- }
- if ( p.hasNext() ) {
+ if ( !p.next().toNewHampshire().equals( "((((A,B),C),D),(E,F));" ) ) {
return false;
}
- if ( p.next() != null ) {
+ if ( !p.next().toNewHampshire().equals( "((1,2,3),(4,5,6),(7,8,9));" ) ) {
return false;
}
- if ( p.hasNext() ) {
+ p.reset();
+ if ( !p.hasNext() ) {
return false;
}
- if ( p.next() != null ) {
+ if ( !p.next().toNewHampshire().equals( "((((A,B),C),D),(E,F));" ) ) {
return false;
}
- p.reset();
- if ( !p.hasNext() ) {
+ if ( !p.next().toNewHampshire().equals( "((1,2,3),(4,5,6),(7,8,9));" ) ) {
return false;
}
}