From 0f62196182c853236023fce71ca24db42b591a36 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Fri, 13 Jul 2012 03:01:15 +0000 Subject: [PATCH] in progress --- .../org/forester/application/msa_compactor.java | 20 ++-- .../archaeopteryx/tools/PhylogeneticInferrer.java | 7 +- forester/java/src/org/forester/msa/BasicMsa.java | 21 +++- forester/java/src/org/forester/msa/Msa.java | 6 +- .../java/src/org/forester/msa/MsaCompactor.java | 112 ++++++++++++++++---- forester/java/src/org/forester/msa/MsaMethods.java | 41 +++++++ 6 files changed, 169 insertions(+), 38 deletions(-) diff --git a/forester/java/src/org/forester/application/msa_compactor.java b/forester/java/src/org/forester/application/msa_compactor.java index 0f1b961..7457b3e 100644 --- a/forester/java/src/org/forester/application/msa_compactor.java +++ b/forester/java/src/org/forester/application/msa_compactor.java @@ -20,6 +20,7 @@ public class msa_compactor { final static private String HELP_OPTION_2 = "h"; final static private String REMOVE_WORST_OFFENDERS_OPTION = "w"; final static private String AV_GAPINESS_OPTION = "a"; + final static private String STEP_OPTION = "s"; final static private String LENGTH_OPTION = "l"; final static private String REALIGN_OPTION = "r"; final static private String PRG_NAME = "msa_compactor"; @@ -32,24 +33,23 @@ public class msa_compactor { public static void main( final String args[] ) { try { final CommandLineArguments cla = new CommandLineArguments( args ); - if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) ) { + if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( cla.getNumberOfNames() != 2 ) ) { printHelp(); System.exit( 0 ); } final File in = cla.getFile( 0 ); + final File out = cla.getFile( 1 ); int worst_remove = -1; double av = -1; int length = -1; - final int step = 5; + int step = 1; boolean realign = false; - // int to = 0; - // int window = 0; - // int step = 0; final List allowed_options = new ArrayList(); allowed_options.add( REMOVE_WORST_OFFENDERS_OPTION ); allowed_options.add( AV_GAPINESS_OPTION ); allowed_options.add( LENGTH_OPTION ); allowed_options.add( REALIGN_OPTION ); + allowed_options.add( STEP_OPTION ); final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); if ( dissallowed_options.length() > 0 ) { ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options ); @@ -63,6 +63,9 @@ public class msa_compactor { if ( cla.isOptionSet( LENGTH_OPTION ) ) { length = cla.getOptionValueAsInt( LENGTH_OPTION ); } + if ( cla.isOptionSet( STEP_OPTION ) ) { + step = cla.getOptionValueAsInt( STEP_OPTION ); + } if ( cla.isOptionSet( REALIGN_OPTION ) ) { realign = true; } @@ -82,20 +85,17 @@ public class msa_compactor { else { msa = GeneralMsaParser.parse( is ); } - System.out.println( msa.toString() ); - System.out.println( MsaMethods.calcBasicGapinessStatistics( msa ).arithmeticMean() ); MsaCompactor mc = null; if ( worst_remove > 0 ) { mc = MsaCompactor.removeWorstOffenders( msa, worst_remove, realign ); } else if ( av > 0 ) { - mc = MsaCompactor.reduceGapAverage( msa, av, step, realign ); + mc = MsaCompactor.reduceGapAverage( msa, av, step, realign, out, 50 ); } else if ( length > 0 ) { mc = MsaCompactor.reduceLength( msa, length, step, realign ); } - System.out.println( mc.getMsa().toString() ); - System.out.println( MsaMethods.calcBasicGapinessStatistics( mc.getMsa() ).arithmeticMean() ); + System.out.println( MsaMethods.calcGapRatio( mc.getMsa() ) ); for( final String id : mc.getRemovedSeqIds() ) { System.out.println( id ); } diff --git a/forester/java/src/org/forester/archaeopteryx/tools/PhylogeneticInferrer.java b/forester/java/src/org/forester/archaeopteryx/tools/PhylogeneticInferrer.java index 0f07850..c1aefc5 100644 --- a/forester/java/src/org/forester/archaeopteryx/tools/PhylogeneticInferrer.java +++ b/forester/java/src/org/forester/archaeopteryx/tools/PhylogeneticInferrer.java @@ -46,6 +46,7 @@ import org.forester.msa.BasicMsa; import org.forester.msa.ClustalOmega; import org.forester.msa.Mafft; import org.forester.msa.Msa; +import org.forester.msa.Msa.MSA_FORMAT; import org.forester.msa.MsaInferrer; import org.forester.msa.MsaMethods; import org.forester.msa.ResampleableMsa; @@ -201,7 +202,7 @@ public class PhylogeneticInferrer extends RunnableProcess { } if ( DEBUG ) { System.out.println( msa.toString() ); - System.out.println( MsaMethods.calcBasicGapinessStatistics( msa ).toString() ); + System.out.println( MsaMethods.calcGapRatio( msa ) ); } final MsaMethods msa_tools = MsaMethods.createInstance(); if ( _options.isExecuteMsaProcessing() ) { @@ -222,7 +223,7 @@ public class PhylogeneticInferrer extends RunnableProcess { if ( DEBUG ) { System.out.println( msa_tools.getIgnoredSequenceIds() ); System.out.println( msa.toString() ); - System.out.println( MsaMethods.calcBasicGapinessStatistics( msa ).toString() ); + System.out.println( MsaMethods.calcGapRatio( msa ) ); } _msa = msa; } @@ -293,7 +294,7 @@ public class PhylogeneticInferrer extends RunnableProcess { try { final BufferedWriter msa_writer = new BufferedWriter( new FileWriter( _options.getIntermediateFilesBase() + MSA_FILE_SUFFIX ) ); - _msa.write( msa_writer ); + _msa.write( msa_writer, MSA_FORMAT.PHYLIP ); msa_writer.close(); final BufferedWriter pwd_writer = new BufferedWriter( new FileWriter( _options.getIntermediateFilesBase() + PWD_FILE_SUFFIX ) ); diff --git a/forester/java/src/org/forester/msa/BasicMsa.java b/forester/java/src/org/forester/msa/BasicMsa.java index 1fdef10..907d212 100644 --- a/forester/java/src/org/forester/msa/BasicMsa.java +++ b/forester/java/src/org/forester/msa/BasicMsa.java @@ -32,6 +32,8 @@ import java.util.HashSet; import java.util.List; import java.util.Set; +import org.forester.io.writers.SequenceWriter; +import org.forester.io.writers.SequenceWriter.SEQ_FORMAT; import org.forester.sequence.BasicSequence; import org.forester.sequence.Sequence; import org.forester.sequence.Sequence.TYPE; @@ -152,7 +154,24 @@ public class BasicMsa implements Msa { } @Override - public void write( final Writer w ) throws IOException { + public void write( final Writer w, final MSA_FORMAT format ) throws IOException { + switch ( format ) { + case PHYLIP: + writeToPhylip( w ); + break; + case FASTA: + writeToFasta( w ); + break; + default: + throw new RuntimeException( "unknown format " + format ); + } + } + + private void writeToFasta( final Writer w ) throws IOException { + SequenceWriter.writeSeqs( asSequenceList(), w, SEQ_FORMAT.FASTA, 100 ); + } + + private void writeToPhylip( final Writer w ) throws IOException { final int max = determineMaxIdLength() + 1; for( int row = 0; row < _data.length; ++row ) { w.write( ForesterUtil.pad( _identifiers[ row ].toString(), max, ' ', false ).toString() ); diff --git a/forester/java/src/org/forester/msa/Msa.java b/forester/java/src/org/forester/msa/Msa.java index 6ffa1ea..790b674 100644 --- a/forester/java/src/org/forester/msa/Msa.java +++ b/forester/java/src/org/forester/msa/Msa.java @@ -34,6 +34,10 @@ import org.forester.sequence.Sequence.TYPE; public interface Msa { + public static enum MSA_FORMAT { + FASTA, PHYLIP; + } + public String getIdentifier( int row ); public void setIdentifier( int row, String identifier ); @@ -58,5 +62,5 @@ public interface Msa { public void setResidueAt( final int row, final int col, final char residue ); - public void write( Writer w ) throws IOException; + public void write( Writer w, MSA_FORMAT format ) throws IOException; } diff --git a/forester/java/src/org/forester/msa/MsaCompactor.java b/forester/java/src/org/forester/msa/MsaCompactor.java index a66c09b..469fa31 100644 --- a/forester/java/src/org/forester/msa/MsaCompactor.java +++ b/forester/java/src/org/forester/msa/MsaCompactor.java @@ -1,7 +1,9 @@ package org.forester.msa; +import java.io.File; import java.io.IOException; +import java.io.Writer; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; @@ -9,6 +11,7 @@ import java.util.List; import java.util.SortedSet; import java.util.TreeSet; +import org.forester.msa.Msa.MSA_FORMAT; import org.forester.sequence.Sequence; import org.forester.util.BasicDescriptiveStatistics; import org.forester.util.DescriptiveStatistics; @@ -16,7 +19,11 @@ import org.forester.util.ForesterUtil; public class MsaCompactor { - private static final boolean VERBOSE = true; + private static final boolean VERBOSE = true; + + public static enum SORT_BY { + MAX, MEAN, MEDIAN; + } private Msa _msa; private final SortedSet _removed_seq_ids; @@ -45,9 +52,12 @@ public class MsaCompactor { public final static MsaCompactor reduceGapAverage( final Msa msa, final double max_gap_average, final int step, - final boolean realign ) throws IOException, InterruptedException { + final boolean realign, + final File out, + final int minimal_effective_length ) throws IOException, + InterruptedException { final MsaCompactor mc = new MsaCompactor( msa ); - mc.removeViaGapAverage( max_gap_average, step, realign ); + mc.removeViaGapAverage( max_gap_average, step, realign, out, minimal_effective_length ); return mc; } @@ -89,8 +99,12 @@ public class MsaCompactor { _msa = mafft.infer( _msa.asSequenceList(), opts ); } - final private void removeViaGapAverage( final double mean_gapiness, final int step, final boolean realign ) - throws IOException, InterruptedException { + final private void removeViaGapAverage( final double mean_gapiness, + final int step, + final boolean realign, + final File outfile, + final int minimal_effective_length ) throws IOException, + InterruptedException { if ( step < 1 ) { throw new IllegalArgumentException( "step cannot be less than 1" ); } @@ -98,23 +112,54 @@ public class MsaCompactor { throw new IllegalArgumentException( "target average gap ratio cannot be less than 0" ); } if ( VERBOSE ) { - System.out.println( "start: " + _msa.getLength() + " " - + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) ); + System.out.println( "orig: " + msaStatsAsSB() ); + } + if ( minimal_effective_length > 1 ) { + _msa = MsaMethods.removeSequencesByMinimalLength( _msa, minimal_effective_length ); + if ( VERBOSE ) { + System.out.println( "short seq removal: " + msaStatsAsSB() ); + } } int counter = step; - while ( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean() > mean_gapiness ) { + double gr; + do { removeWorstOffenders( step, 1, false ); if ( realign ) { mafft(); } + gr = MsaMethods.calcGapRatio( _msa ); if ( VERBOSE ) { - System.out.println( counter + ": " + _msa.getLength() + " " - + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) ); + System.out.println( counter + ": " + msaStatsAsSB() ); } + write( outfile, gr ); counter += step; + } while ( gr > mean_gapiness ); + if ( VERBOSE ) { + System.out.println( "final: " + msaStatsAsSB() ); } } + final private void write( final File outfile, final double gr ) throws IOException { + writeMsa( outfile + "_" + _msa.getNumberOfSequences() + "_" + _msa.getLength() + "_" + gr + ".aln" ); + } + + final private void writeMsa( final String outfile ) throws IOException { + final Writer w = ForesterUtil.createBufferedWriter( outfile ); + _msa.write( w, MSA_FORMAT.PHYLIP ); + w.close(); + } + + final private StringBuilder msaStatsAsSB() { + final StringBuilder sb = new StringBuilder(); + sb.append( _msa.getLength() ); + sb.append( "\t" ); + sb.append( _msa.getNumberOfSequences() ); + sb.append( "\t" ); + sb.append( ForesterUtil.round( MsaMethods.calcGapRatio( _msa ), 4 ) ); + sb.append( "\t" ); + return sb; + } + final private void removeViaLength( final int length, final int step, final boolean realign ) throws IOException, InterruptedException { if ( step < 1 ) { @@ -124,8 +169,7 @@ public class MsaCompactor { throw new IllegalArgumentException( "target length cannot be less than 1" ); } if ( VERBOSE ) { - System.out.println( "start: " + _msa.getLength() + " " - + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) ); + System.out.println( "orig: " + msaStatsAsSB() ); } int counter = step; while ( _msa.getLength() > length ) { @@ -134,8 +178,7 @@ public class MsaCompactor { mafft(); } if ( VERBOSE ) { - System.out.println( counter + ": " + _msa.getLength() + " " - + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) ); + System.out.println( counter + ": " + msaStatsAsSB() ); } counter += step; } @@ -162,7 +205,7 @@ public class MsaCompactor { } private final static void sort( final DescriptiveStatistics stats[] ) { - Arrays.sort( stats, new DescriptiveStatisticsComparator( false ) ); + Arrays.sort( stats, new DescriptiveStatisticsComparator( false, SORT_BY.MAX ) ); } private final DescriptiveStatistics[] calc() { @@ -190,21 +233,44 @@ public class MsaCompactor { final static class DescriptiveStatisticsComparator implements Comparator { - final boolean _ascending; + final private boolean _ascending; + final private SORT_BY _sort_by; - public DescriptiveStatisticsComparator( final boolean ascending ) { + public DescriptiveStatisticsComparator( final boolean ascending, final SORT_BY sort_by ) { _ascending = ascending; + _sort_by = sort_by; } @Override public final int compare( final DescriptiveStatistics s0, final DescriptiveStatistics s1 ) { - if ( s0.arithmeticMean() < s1.arithmeticMean() ) { - return _ascending ? -1 : 1; - } - else if ( s0.arithmeticMean() > s1.arithmeticMean() ) { - return _ascending ? 1 : -1; + switch ( _sort_by ) { + case MAX: + if ( s0.getMax() < s1.getMax() ) { + return _ascending ? -1 : 1; + } + else if ( s0.getMax() > s1.getMax() ) { + return _ascending ? 1 : -1; + } + return 0; + case MEAN: + if ( s0.arithmeticMean() < s1.arithmeticMean() ) { + return _ascending ? -1 : 1; + } + else if ( s0.arithmeticMean() > s1.arithmeticMean() ) { + return _ascending ? 1 : -1; + } + return 0; + case MEDIAN: + if ( s0.median() < s1.median() ) { + return _ascending ? -1 : 1; + } + else if ( s0.median() > s1.median() ) { + return _ascending ? 1 : -1; + } + return 0; + default: + return 0; } - return 0; } } } diff --git a/forester/java/src/org/forester/msa/MsaMethods.java b/forester/java/src/org/forester/msa/MsaMethods.java index ff6d570..ee9338b 100644 --- a/forester/java/src/org/forester/msa/MsaMethods.java +++ b/forester/java/src/org/forester/msa/MsaMethods.java @@ -85,6 +85,19 @@ public final class MsaMethods { return BasicMsa.createInstance( seqs ); } + final public static Msa removeSequencesByRow( final Msa msa, final List to_remove_rows ) { + final List seqs = new ArrayList(); + for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { + if ( !to_remove_rows.contains( row ) ) { + seqs.add( BasicSequence.copySequence( msa.getSequence( row ) ) ); + } + } + if ( seqs.size() < 1 ) { + return null; + } + return BasicMsa.createInstance( seqs ); + } + synchronized final public Msa removeGapColumns( final double max_allowed_gap_ratio, final int min_allowed_length, final Msa msa ) { @@ -166,4 +179,32 @@ public final class MsaMethods { } return stats; } + + public static Msa removeSequencesByMinimalLength( final Msa msa, final int min_effective_length ) { + final List to_remove_rows = new ArrayList(); + for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) { + int eff_length = 0; + for( int i = 0; i < msa.getLength(); ++i ) { + if ( msa.getResidueAt( seq, i ) != Sequence.GAP ) { + eff_length++; + } + } + if ( eff_length < min_effective_length ) { + to_remove_rows.add( seq ); + } + } + return removeSequencesByRow( msa, to_remove_rows ); + } + + public static double calcGapRatio( final Msa msa ) { + int gaps = 0; + for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) { + for( int i = 0; i < msa.getLength(); ++i ) { + if ( msa.getResidueAt( seq, i ) == Sequence.GAP ) { + gaps++; + } + } + } + return ( double ) gaps / ( msa.getLength() * msa.getNumberOfSequences() ); + } } -- 1.7.10.2