From: cmzmasek@gmail.com Date: Thu, 12 Jul 2012 05:55:27 +0000 (+0000) Subject: in progress X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=b1a74bd076c917d428002284a53a2c6f390226fb;p=jalview.git in progress --- diff --git a/forester/java/src/org/forester/application/mcc.java b/forester/java/src/org/forester/application/mcc.java index 3ee440f..c0c72b0 100644 --- a/forester/java/src/org/forester/application/mcc.java +++ b/forester/java/src/org/forester/application/mcc.java @@ -27,7 +27,9 @@ package org.forester.application; import java.io.File; import java.io.FileInputStream; +import java.io.InputStream; +import org.forester.io.parsers.FastaParser; import org.forester.io.parsers.GeneralMsaParser; import org.forester.msa.Msa; import org.forester.msa.MsaMethods; @@ -76,7 +78,13 @@ public class mcc { System.exit( 0 ); } Msa msa = null; - msa = GeneralMsaParser.parse( new FileInputStream( in ) ); + final InputStream is = new FileInputStream( in ); + if ( FastaParser.isLikelyFasta( in ) ) { + msa = FastaParser.parseMsa( is ); + } + else { + msa = GeneralMsaParser.parse( is ); + } if ( cla.isOptionSet( FROM_OPTION ) ) { singleCalc( in, from, to, msa ); } diff --git a/forester/java/src/org/forester/application/msa_compactor.java b/forester/java/src/org/forester/application/msa_compactor.java new file mode 100644 index 0000000..0f1b961 --- /dev/null +++ b/forester/java/src/org/forester/application/msa_compactor.java @@ -0,0 +1,132 @@ + +package org.forester.application; + +import java.io.File; +import java.io.FileInputStream; +import java.util.ArrayList; +import java.util.List; + +import org.forester.io.parsers.FastaParser; +import org.forester.io.parsers.GeneralMsaParser; +import org.forester.msa.Msa; +import org.forester.msa.MsaCompactor; +import org.forester.msa.MsaMethods; +import org.forester.util.CommandLineArguments; +import org.forester.util.ForesterUtil; + +public class msa_compactor { + + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String REMOVE_WORST_OFFENDERS_OPTION = "w"; + final static private String AV_GAPINESS_OPTION = "a"; + final static private String LENGTH_OPTION = "l"; + final static private String REALIGN_OPTION = "r"; + final static private String PRG_NAME = "msa_compactor"; + final static private String PRG_DESC = "multiple sequnce aligment compactor"; + final static private String PRG_VERSION = "0.90"; + final static private String PRG_DATE = "2012.07.11"; + final static private String E_MAIL = "phylosoft@gmail.com"; + final static private String WWW = "www.phylosoft.org/forester/"; + + public static void main( final String args[] ) { + try { + final CommandLineArguments cla = new CommandLineArguments( args ); + if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) ) { + printHelp(); + System.exit( 0 ); + } + final File in = cla.getFile( 0 ); + int worst_remove = -1; + double av = -1; + int length = -1; + final int step = 5; + boolean realign = false; + // int to = 0; + // int window = 0; + // int step = 0; + final List allowed_options = new ArrayList(); + allowed_options.add( REMOVE_WORST_OFFENDERS_OPTION ); + allowed_options.add( AV_GAPINESS_OPTION ); + allowed_options.add( LENGTH_OPTION ); + allowed_options.add( REALIGN_OPTION ); + final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); + if ( dissallowed_options.length() > 0 ) { + ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options ); + } + if ( cla.isOptionSet( REMOVE_WORST_OFFENDERS_OPTION ) ) { + worst_remove = cla.getOptionValueAsInt( REMOVE_WORST_OFFENDERS_OPTION ); + } + if ( cla.isOptionSet( AV_GAPINESS_OPTION ) ) { + av = cla.getOptionValueAsDouble( AV_GAPINESS_OPTION ); + } + if ( cla.isOptionSet( LENGTH_OPTION ) ) { + length = cla.getOptionValueAsInt( LENGTH_OPTION ); + } + if ( cla.isOptionSet( REALIGN_OPTION ) ) { + realign = true; + } + // else if ( cla.isOptionSet( STEP_OPTION ) && cla.isOptionSet( WINDOW_OPTION ) ) { + // step = cla.getOptionValueAsInt( STEP_OPTION ); + // window = cla.getOptionValueAsInt( WINDOW_OPTION ); + // } + // else { + // printHelp(); + // System.exit( 0 ); + // } + Msa msa = null; + final FileInputStream is = new FileInputStream( in ); + if ( FastaParser.isLikelyFasta( in ) ) { + msa = FastaParser.parseMsa( is ); + } + else { + msa = GeneralMsaParser.parse( is ); + } + System.out.println( msa.toString() ); + System.out.println( MsaMethods.calcBasicGapinessStatistics( msa ).arithmeticMean() ); + MsaCompactor mc = null; + if ( worst_remove > 0 ) { + mc = MsaCompactor.removeWorstOffenders( msa, worst_remove, realign ); + } + else if ( av > 0 ) { + mc = MsaCompactor.reduceGapAverage( msa, av, step, realign ); + } + else if ( length > 0 ) { + mc = MsaCompactor.reduceLength( msa, length, step, realign ); + } + System.out.println( mc.getMsa().toString() ); + System.out.println( MsaMethods.calcBasicGapinessStatistics( mc.getMsa() ).arithmeticMean() ); + for( final String id : mc.getRemovedSeqIds() ) { + System.out.println( id ); + } + } + catch ( final Exception e ) { + e.printStackTrace(); + ForesterUtil.fatalError( PRG_NAME, e.getMessage() ); + } + } + + private static void printHelp() { + ForesterUtil.printProgramInformation( PRG_NAME, + PRG_DESC, + PRG_VERSION, + PRG_DATE, + E_MAIL, + WWW, + ForesterUtil.getForesterLibraryInformation() ); + System.out.println( "Usage:" ); + System.out.println(); + System.out.println( PRG_NAME + " " ); + System.out.println(); + System.out.println( " options: " ); + System.out.println(); + // System.out.println( " -" + FROM_OPTION + "=: from (msa column)" ); + // System.out.println( " -" + TO_OPTION + "=: to (msa column)" ); + // System.out.println( " or" ); + // System.out.println( " -" + WINDOW_OPTION + "=: window size (msa columns)" ); + System.out.println( " -" + REMOVE_WORST_OFFENDERS_OPTION + "=: step size (msa columns)" ); + System.out.println(); + System.out.println(); + System.out.println(); + } +} diff --git a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java index f4dc785..d772d3e 100644 --- a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java +++ b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java @@ -36,6 +36,7 @@ import java.awt.event.WindowEvent; import java.io.File; import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStream; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; @@ -1976,12 +1977,12 @@ public final class MainFrameApplication extends MainFrame { setMsa( null ); Msa msa = null; try { - if ( FastaParser.isLikelyFasta( new FileInputStream( file ) ) ) { - msa = FastaParser.parseMsa( new FileInputStream( file ) ); - System.out.println( msa.toString() ); + final InputStream is = new FileInputStream( file ); + if ( FastaParser.isLikelyFasta( file ) ) { + msa = FastaParser.parseMsa( is ); } else { - msa = GeneralMsaParser.parse( new FileInputStream( file ) ); + msa = GeneralMsaParser.parse( is ); } } catch ( final MsaFormatException e ) { diff --git a/forester/java/src/org/forester/archaeopteryx/tools/PhylogeneticInferrer.java b/forester/java/src/org/forester/archaeopteryx/tools/PhylogeneticInferrer.java index bf78f01..0f07850 100644 --- a/forester/java/src/org/forester/archaeopteryx/tools/PhylogeneticInferrer.java +++ b/forester/java/src/org/forester/archaeopteryx/tools/PhylogeneticInferrer.java @@ -42,8 +42,6 @@ import org.forester.evoinference.distance.PairwiseDistanceCalculator; import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix; import org.forester.evoinference.tools.BootstrapResampler; import org.forester.io.parsers.FastaParser; -import org.forester.io.writers.SequenceWriter; -import org.forester.io.writers.SequenceWriter.SEQ_FORMAT; import org.forester.msa.BasicMsa; import org.forester.msa.ClustalOmega; import org.forester.msa.Mafft; @@ -88,18 +86,18 @@ public class PhylogeneticInferrer extends RunnableProcess { } private Msa inferMsa() throws IOException, InterruptedException { - final File temp_seqs_file = File.createTempFile( "__msa__temp__", ".fasta" ); - if ( DEBUG ) { - System.out.println(); - System.out.println( "temp file: " + temp_seqs_file ); - System.out.println(); - } - //final File temp_seqs_file = new File( _options.getTempDir() + ForesterUtil.FILE_SEPARATOR + "s.fasta" ); - final BufferedWriter writer = new BufferedWriter( new FileWriter( temp_seqs_file ) ); - SequenceWriter.writeSeqs( _seqs, writer, SEQ_FORMAT.FASTA, 100 ); - writer.close(); + // final File temp_seqs_file = File.createTempFile( "__msa__temp__", ".fasta" ); + // if ( DEBUG ) { + // System.out.println(); + // System.out.println( "temp file: " + temp_seqs_file ); + // System.out.println(); + // } + // //final File temp_seqs_file = new File( _options.getTempDir() + ForesterUtil.FILE_SEPARATOR + "s.fasta" ); + // final BufferedWriter writer = new BufferedWriter( new FileWriter( temp_seqs_file ) ); + // SequenceWriter.writeSeqs( _seqs, writer, SEQ_FORMAT.FASTA, 100 ); + // writer.close(); final List opts = processMafftOptions(); - return runMAFFT( temp_seqs_file, opts ); + return runMAFFT( _seqs, opts ); } private List processMafftOptions() { @@ -263,12 +261,12 @@ public class PhylogeneticInferrer extends RunnableProcess { } } - private Msa runMAFFT( final File input_seqs, final List opts ) throws IOException, InterruptedException { + private Msa runMAFFT( final List seqs, final List opts ) throws IOException, InterruptedException { Msa msa = null; final MsaInferrer mafft = Mafft.createInstance( _mf.getInferenceManager().getPathToLocalMafft() .getCanonicalPath() ); try { - msa = mafft.infer( input_seqs, opts ); + msa = mafft.infer( seqs, opts ); } catch ( final IOException e ) { System.out.println( mafft.getErrorDescription() ); diff --git a/forester/java/src/org/forester/evoinference/distance/PairwiseDistanceCalculator.java b/forester/java/src/org/forester/evoinference/distance/PairwiseDistanceCalculator.java index 961c13a..f2b99a1 100644 --- a/forester/java/src/org/forester/evoinference/distance/PairwiseDistanceCalculator.java +++ b/forester/java/src/org/forester/evoinference/distance/PairwiseDistanceCalculator.java @@ -150,7 +150,7 @@ public final class PairwiseDistanceCalculator { private void copyIdentifiers( final int s, final BasicSymmetricalDistanceMatrix d ) { for( int i = 0; i < s; i++ ) { - d.setIdentifier( i, ( String ) _msa.getIdentifier( i ) ); + d.setIdentifier( i, _msa.getIdentifier( i ) ); } } diff --git a/forester/java/src/org/forester/io/parsers/FastaParser.java b/forester/java/src/org/forester/io/parsers/FastaParser.java index 5f42df8..6a652f5 100644 --- a/forester/java/src/org/forester/io/parsers/FastaParser.java +++ b/forester/java/src/org/forester/io/parsers/FastaParser.java @@ -68,6 +68,10 @@ public class FastaParser { } } + static public boolean isLikelyFasta( final File f ) throws IOException { + return isLikelyFasta( new FileInputStream( f ) ); + } + static public boolean isLikelyFasta( final InputStream is ) throws IOException { final BufferedReader reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) ); String line = null; diff --git a/forester/java/src/org/forester/io/writers/SequenceWriter.java b/forester/java/src/org/forester/io/writers/SequenceWriter.java index b4135d1..327f955 100644 --- a/forester/java/src/org/forester/io/writers/SequenceWriter.java +++ b/forester/java/src/org/forester/io/writers/SequenceWriter.java @@ -5,7 +5,6 @@ import java.io.IOException; import java.io.Writer; import java.util.List; -import org.forester.sequence.BasicSequence; import org.forester.sequence.Sequence; import org.forester.util.ForesterUtil; @@ -15,21 +14,6 @@ public class SequenceWriter { FASTA; } - public static void main( final String[] args ) { - final Sequence s = BasicSequence.createAaSequence( "name", "abcdefghiiklmnap" ); - System.out.println( s.toString() ); - System.out.println( SequenceWriter.toFasta( s, 0 ).toString() ); - System.out.println( SequenceWriter.toFasta( s, 5 ).toString() ); - System.out.println( SequenceWriter.toFasta( s, 8 ).toString() ); - System.out.println( SequenceWriter.toFasta( s, 4 ).toString() ); - System.out.println( SequenceWriter.toFasta( s, 3 ).toString() ); - System.out.println( SequenceWriter.toFasta( s, 2 ).toString() ); - System.out.println( SequenceWriter.toFasta( s, 1 ).toString() ); - System.out.println( SequenceWriter.toFasta( s, 100 ).toString() ); - System.out.println( SequenceWriter.toFasta( s, 15 ).toString() ); - System.out.println( SequenceWriter.toFasta( s, 16 ).toString() ); - } - public static StringBuilder toFasta( final Sequence seq, final int width ) { final StringBuilder sb = new StringBuilder(); sb.append( ">" ); diff --git a/forester/java/src/org/forester/msa/BasicMsa.java b/forester/java/src/org/forester/msa/BasicMsa.java index d806c90..1fdef10 100644 --- a/forester/java/src/org/forester/msa/BasicMsa.java +++ b/forester/java/src/org/forester/msa/BasicMsa.java @@ -28,8 +28,11 @@ package org.forester.msa; import java.io.IOException; import java.io.Writer; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Set; +import org.forester.sequence.BasicSequence; import org.forester.sequence.Sequence; import org.forester.sequence.Sequence.TYPE; import org.forester.util.ForesterUtil; @@ -37,7 +40,7 @@ import org.forester.util.ForesterUtil; public class BasicMsa implements Msa { private final char[][] _data; - private final Object[] _identifiers; + private final String[] _identifiers; private final TYPE _type; public BasicMsa( final int rows, final int columns, final TYPE type ) { @@ -45,7 +48,7 @@ public class BasicMsa implements Msa { throw new IllegalArgumentException( "basic msa of size zero are illegal" ); } _data = new char[ rows ][ columns ]; - _identifiers = new Object[ rows ]; + _identifiers = new String[ rows ]; _type = type; } @@ -55,6 +58,15 @@ public class BasicMsa implements Msa { _type = msa._type; } + @Override + public List asSequenceList() { + final List seqs = new ArrayList(); + for( int i = 0; i < getNumberOfSequences(); ++i ) { + seqs.add( getSequence( i ) ); + } + return seqs; + } + private int determineMaxIdLength() { int max = 0; for( int row = 0; row < _data.length; ++row ) { @@ -67,7 +79,7 @@ public class BasicMsa implements Msa { } @Override - public Object getIdentifier( final int row ) { + public String getIdentifier( final int row ) { return _identifiers[ row ]; } @@ -87,6 +99,21 @@ public class BasicMsa implements Msa { } @Override + public Sequence getSequence( final String id ) { + for( int i = 0; i < getNumberOfSequences(); ++i ) { + if ( getIdentifier( i ).equals( id ) ) { + return getSequence( i ); + } + } + return null; + } + + @Override + public Sequence getSequence( final int row ) { + return new BasicSequence( getIdentifier( row ), _data[ row ], getType() ); + } + + @Override public StringBuffer getSequenceAsString( final int row ) { final StringBuffer sb = new StringBuffer( _data[ 0 ].length ); for( int col = 0; col < _data[ 0 ].length; ++col ) { @@ -101,7 +128,7 @@ public class BasicMsa implements Msa { } @Override - public void setIdentifier( final int row, final Object id ) { + public void setIdentifier( final int row, final String id ) { _identifiers[ row ] = id; } @@ -140,6 +167,7 @@ public class BasicMsa implements Msa { if ( seqs.size() < 1 ) { throw new IllegalArgumentException( "cannot create basic msa from less than one sequence" ); } + final Set ids = new HashSet(); final int length = seqs.get( 0 ).getLength(); final BasicMsa msa = new BasicMsa( seqs.size(), length, seqs.get( 0 ).getType() ); for( int row = 0; row < seqs.size(); ++row ) { @@ -150,6 +178,11 @@ public class BasicMsa implements Msa { if ( seq.getType() != msa.getType() ) { throw new IllegalArgumentException( "illegal attempt to build msa from sequences of different type" ); } + if ( ids.contains( seq.getIdentifier() ) ) { + throw new IllegalArgumentException( "illegal attempt to create msa with non-unique identifiers [" + + seq.getIdentifier() + "]" ); + } + ids.add( seq.getIdentifier() ); msa.setIdentifier( row, seq.getIdentifier() ); for( int col = 0; col < length; ++col ) { msa._data[ row ][ col ] = seq.getResidueAt( col ); diff --git a/forester/java/src/org/forester/msa/ClustalOmega.java b/forester/java/src/org/forester/msa/ClustalOmega.java index 627eb64..72db9d3 100644 --- a/forester/java/src/org/forester/msa/ClustalOmega.java +++ b/forester/java/src/org/forester/msa/ClustalOmega.java @@ -31,6 +31,7 @@ import java.util.ArrayList; import java.util.List; import org.forester.io.parsers.FastaParser; +import org.forester.sequence.Sequence; import org.forester.util.SystemCommandExecutor; public final class ClustalOmega extends MsaInferrer { @@ -96,4 +97,10 @@ public final class ClustalOmega extends MsaInferrer { _error = null; _exit_code = -100; } + + @Override + public Msa infer( final List seqs, final List opts ) throws IOException, InterruptedException { + // TODO Auto-generated method stub + return null; + } } diff --git a/forester/java/src/org/forester/msa/Mafft.java b/forester/java/src/org/forester/msa/Mafft.java index e175f0d..ec72c0d 100644 --- a/forester/java/src/org/forester/msa/Mafft.java +++ b/forester/java/src/org/forester/msa/Mafft.java @@ -25,12 +25,17 @@ package org.forester.msa; +import java.io.BufferedWriter; import java.io.File; +import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.forester.io.parsers.FastaParser; +import org.forester.io.writers.SequenceWriter; +import org.forester.io.writers.SequenceWriter.SEQ_FORMAT; +import org.forester.sequence.Sequence; import org.forester.util.SystemCommandExecutor; public final class Mafft extends MsaInferrer { @@ -67,6 +72,18 @@ public final class Mafft extends MsaInferrer { } @Override + public Msa infer( final List seqs, final List opts ) throws IOException, InterruptedException { + final File file = File.createTempFile( "__mafft_input_", ".fasta" ); + file.deleteOnExit(); + final BufferedWriter writer = new BufferedWriter( new FileWriter( file ) ); + SequenceWriter.writeSeqs( seqs, writer, SEQ_FORMAT.FASTA, 100 ); + writer.close(); + final Msa msa = infer( file, opts ); + file.delete(); + return msa; + } + + @Override public Msa infer( final File path_to_input_seqs, final List opts ) throws IOException, InterruptedException { init(); final List my_opts = new ArrayList(); diff --git a/forester/java/src/org/forester/msa/Msa.java b/forester/java/src/org/forester/msa/Msa.java index 50fcee9..6ffa1ea 100644 --- a/forester/java/src/org/forester/msa/Msa.java +++ b/forester/java/src/org/forester/msa/Msa.java @@ -29,13 +29,14 @@ import java.io.IOException; import java.io.Writer; import java.util.List; +import org.forester.sequence.Sequence; import org.forester.sequence.Sequence.TYPE; public interface Msa { - public Object getIdentifier( int row ); + public String getIdentifier( int row ); - public void setIdentifier( int row, Object identifier ); + public void setIdentifier( int row, String identifier ); public int getLength(); @@ -45,6 +46,12 @@ public interface Msa { public List getColumnAt( int col ); + public Sequence getSequence( final String id ); + + public Sequence getSequence( final int row ); + + public List asSequenceList(); + public StringBuffer getSequenceAsString( int row ); public abstract TYPE getType(); diff --git a/forester/java/src/org/forester/msa/MsaCompactor.java b/forester/java/src/org/forester/msa/MsaCompactor.java index a268a24..9172696 100644 --- a/forester/java/src/org/forester/msa/MsaCompactor.java +++ b/forester/java/src/org/forester/msa/MsaCompactor.java @@ -1,45 +1,198 @@ + package org.forester.msa; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.SortedSet; +import java.util.TreeSet; + import org.forester.sequence.Sequence; import org.forester.util.BasicDescriptiveStatistics; import org.forester.util.DescriptiveStatistics; - +import org.forester.util.ForesterUtil; public class MsaCompactor { - - final private Msa _msa; - - public MsaCompactor( Msa msa ) { - + + private static final boolean VERBOSE = true; + private Msa _msa; + private final SortedSet _removed_seq_ids; + + private MsaCompactor( final Msa msa ) { _msa = msa; + _removed_seq_ids = new TreeSet(); + } + + final public SortedSet getRemovedSeqIds() { + return _removed_seq_ids; + } + + final public Msa getMsa() { + return _msa; + } + + public final static MsaCompactor removeWorstOffenders( final Msa msa, + final int worst_offenders_to_remove, + final boolean realign ) throws IOException, + InterruptedException { + final MsaCompactor mc = new MsaCompactor( msa ); + mc.removeWorstOffenders( worst_offenders_to_remove, 1, realign ); + return mc; + } + + public final static MsaCompactor reduceGapAverage( final Msa msa, + final double max_gap_average, + final int step, + final boolean realign ) throws IOException, InterruptedException { + final MsaCompactor mc = new MsaCompactor( msa ); + mc.removeViaGapAverage( max_gap_average, step, realign ); + return mc; + } + + public final static MsaCompactor reduceLength( final Msa msa, + final int length, + final int step, + final boolean realign ) throws IOException, InterruptedException { + final MsaCompactor mc = new MsaCompactor( msa ); + mc.removeViaLength( length, step, realign ); + return mc; + } + + final private void removeGapColumns() { + _msa = MsaMethods.createInstance().removeGapColumns( 1, 0, _msa ); + } + + final private void removeWorstOffenders( final int to_remove, final int step, final boolean realign ) + throws IOException, InterruptedException { + final DescriptiveStatistics stats[] = calcStats(); + final List to_remove_ids = new ArrayList(); + for( int j = 0; j < to_remove; ++j ) { + to_remove_ids.add( stats[ j ].getDescription() ); + _removed_seq_ids.add( stats[ j ].getDescription() ); + } + _msa = MsaMethods.removeSequences( _msa, to_remove_ids ); + removeGapColumns(); + if ( realign ) { + mafft(); + } + } + + final private void mafft() throws IOException, InterruptedException { + final MsaInferrer mafft = Mafft.createInstance( "/home/czmasek/bin/mafft" ); + final List opts = new ArrayList(); + opts.add( "--maxiterate" ); + opts.add( "1000" ); + opts.add( "--localpair" ); + opts.add( "--quiet" ); + _msa = mafft.infer( _msa.asSequenceList(), opts ); + } + + final private void removeViaGapAverage( final double mean_gapiness, final int step, final boolean realign ) + throws IOException, InterruptedException { + if ( VERBOSE ) { + System.out.println( "start: " + _msa.getLength() + " " + + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) ); + } + int counter = 0; + while ( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean() > mean_gapiness ) { + removeWorstOffenders( step, 1, false ); + if ( realign ) { + mafft(); + } + if ( VERBOSE ) { + System.out.println( counter + ": " + _msa.getLength() + " " + + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) ); + } + counter += step; + } + } + + final private void removeViaLength( final int length, final int step, final boolean realign ) throws IOException, + InterruptedException { + if ( VERBOSE ) { + System.out.println( "start: " + _msa.getLength() + " " + + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) ); + } + int counter = 0; + while ( _msa.getLength() > length ) { + removeWorstOffenders( step, 1, false ); + if ( realign ) { + mafft(); + } + if ( VERBOSE ) { + System.out.println( counter + ": " + _msa.getLength() + " " + + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) ); + } + counter += step; + } + } + + final private DescriptiveStatistics[] calcStats() { + final DescriptiveStatistics stats[] = calc(); + sort( stats ); + for( int i = 0; i < stats.length; ++i ) { + final DescriptiveStatistics s = stats[ i ]; + // System.out.print( s.getDescription() ); + // System.out.print( "\t" ); + // System.out.print( s.arithmeticMean() ); + // System.out.print( "\t(" ); + // System.out.print( s.arithmeticMean() ); + // System.out.print( ")" ); + // System.out.print( "\t" ); + // System.out.print( s.getMin() ); + // System.out.print( "\t" ); + // System.out.print( s.getMax() ); + // System.out.println(); + } + return stats; } - - - - private DescriptiveStatistics[] calc() { + + private final static void sort( final DescriptiveStatistics stats[] ) { + Arrays.sort( stats, new DescriptiveStatisticsComparator( false ) ); + } + + private final DescriptiveStatistics[] calc() { final double gappiness[] = calcGappiness(); final DescriptiveStatistics stats[] = new DescriptiveStatistics[ _msa.getNumberOfSequences() ]; - for ( int row = 0; row < _msa.getNumberOfSequences(); ++row ) { - stats[ row ] = new BasicDescriptiveStatistics(); + for( int row = 0; row < _msa.getNumberOfSequences(); ++row ) { + stats[ row ] = new BasicDescriptiveStatistics( _msa.getIdentifier( row ) ); for( int col = 0; col < _msa.getLength(); ++col ) { if ( _msa.getResidueAt( row, col ) != Sequence.GAP ) { stats[ row ].addValue( gappiness[ col ] ); - } } } return stats; } - - private double[] calcGappiness() { + + private final double[] calcGappiness() { final double gappiness[] = new double[ _msa.getLength() ]; - final int seqs = _msa.getNumberOfSequences(); + final int seqs = _msa.getNumberOfSequences(); for( int i = 0; i < gappiness.length; ++i ) { - gappiness[ i ] = ( double ) MsaMethods.calcGapSumPerColumn( _msa, i ) / _msa.getNumberOfSequences(); - + gappiness[ i ] = ( double ) MsaMethods.calcGapSumPerColumn( _msa, i ) / seqs; } return gappiness; - } - + + final static class DescriptiveStatisticsComparator implements Comparator { + + final boolean _ascending; + + public DescriptiveStatisticsComparator( final boolean ascending ) { + _ascending = ascending; + } + + @Override + public final int compare( final DescriptiveStatistics s0, final DescriptiveStatistics s1 ) { + if ( s0.arithmeticMean() < s1.arithmeticMean() ) { + return _ascending ? -1 : 1; + } + else if ( s0.arithmeticMean() > s1.arithmeticMean() ) { + return _ascending ? 1 : -1; + } + return 0; + } + } } diff --git a/forester/java/src/org/forester/msa/MsaInferrer.java b/forester/java/src/org/forester/msa/MsaInferrer.java index 8266648..8c839f9 100644 --- a/forester/java/src/org/forester/msa/MsaInferrer.java +++ b/forester/java/src/org/forester/msa/MsaInferrer.java @@ -29,6 +29,7 @@ import java.io.File; import java.io.IOException; import java.util.List; +import org.forester.sequence.Sequence; import org.forester.util.SystemCommandExecutor; public abstract class MsaInferrer { @@ -47,4 +48,7 @@ public abstract class MsaInferrer { } public abstract Msa infer( File path_to_input_seqs, List opts ) throws IOException, InterruptedException; + + public abstract Msa infer( final List seqs, final List opts ) throws IOException, + InterruptedException; } diff --git a/forester/java/src/org/forester/msa/MsaMethods.java b/forester/java/src/org/forester/msa/MsaMethods.java index 716c843..ff6d570 100644 --- a/forester/java/src/org/forester/msa/MsaMethods.java +++ b/forester/java/src/org/forester/msa/MsaMethods.java @@ -72,9 +72,22 @@ public final class MsaMethods { return gap_rows; } - synchronized public Msa removeGapColumns( final double max_allowed_gap_ratio, - final int min_allowed_length, - final Msa msa ) { + final public static Msa removeSequences( final Msa msa, final List to_remove_ids ) { + final List seqs = new ArrayList(); + for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { + if ( !to_remove_ids.contains( msa.getIdentifier( row ) ) ) { + seqs.add( BasicSequence.copySequence( msa.getSequence( row ) ) ); + } + } + if ( seqs.size() < 1 ) { + return null; + } + return BasicMsa.createInstance( seqs ); + } + + synchronized final public Msa removeGapColumns( final double max_allowed_gap_ratio, + final int min_allowed_length, + final Msa msa ) { init(); if ( ( max_allowed_gap_ratio < 0 ) || ( max_allowed_gap_ratio > 1 ) ) { throw new IllegalArgumentException( "max allowed gap ration is out of range: " + max_allowed_gap_ratio ); @@ -83,7 +96,7 @@ public final class MsaMethods { final boolean[] delete_cols = new boolean[ msa.getLength() ]; int new_length = 0; for( int col = 0; col < msa.getLength(); ++col ) { - delete_cols[ col ] = ( ( double ) calcGapSumPerColumn( msa, col ) / msa.getNumberOfSequences() ) > max_allowed_gap_ratio; + delete_cols[ col ] = ( ( double ) calcGapSumPerColumn( msa, col ) / msa.getNumberOfSequences() ) >= max_allowed_gap_ratio; if ( !delete_cols[ col ] ) { ++new_length; } diff --git a/forester/java/src/org/forester/sequence/BasicSequence.java b/forester/java/src/org/forester/sequence/BasicSequence.java index 786cc03..c14277c 100644 --- a/forester/java/src/org/forester/sequence/BasicSequence.java +++ b/forester/java/src/org/forester/sequence/BasicSequence.java @@ -29,24 +29,24 @@ package org.forester.sequence; public class BasicSequence implements Sequence { private final char[] _mol_sequence; - private final Object _identifier; + private final String _identifier; private final TYPE _type; - private BasicSequence( final Object identifier, final String mol_sequence, final TYPE type ) { + private BasicSequence( final String identifier, final String mol_sequence, final TYPE type ) { _mol_sequence = mol_sequence.toCharArray(); _identifier = identifier; _type = type; } // Only use if you know what you are doing! - public BasicSequence( final Object identifier, final char[] mol_sequence, final TYPE type ) { + public BasicSequence( final String identifier, final char[] mol_sequence, final TYPE type ) { _mol_sequence = mol_sequence; _identifier = identifier; _type = type; } @Override - public Object getIdentifier() { + public String getIdentifier() { return _identifier; } @@ -90,17 +90,25 @@ public class BasicSequence implements Sequence { return sb.toString(); } - public static Sequence createAaSequence( final Object identifier, final String mol_sequence ) { + public static Sequence copySequence( final Sequence seq ) { + final char[] s = new char[ seq.getMolecularSequence().length ]; + for( int i = 0; i < seq.getMolecularSequence().length; i++ ) { + s[ i ] = seq.getMolecularSequence()[ i ]; + } + return new BasicSequence( new String( seq.getIdentifier() ), s, seq.getType() ); + } + + public static Sequence createAaSequence( final String identifier, final String mol_sequence ) { return new BasicSequence( identifier, mol_sequence.toUpperCase().replaceAll( "\\.", GAP_STR ) .replaceAll( AA_REGEXP, Character.toString( UNSPECIFIED_AA ) ), TYPE.AA ); } - public static Sequence createDnaSequence( final Object identifier, final String mol_sequence ) { + public static Sequence createDnaSequence( final String identifier, final String mol_sequence ) { return new BasicSequence( identifier, mol_sequence.toUpperCase().replaceAll( "\\.", GAP_STR ) .replaceAll( DNA_REGEXP, Character.toString( UNSPECIFIED_NUC ) ), TYPE.DNA ); } - public static Sequence createRnaSequence( final Object identifier, final String mol_sequence ) { + public static Sequence createRnaSequence( final String identifier, final String mol_sequence ) { return new BasicSequence( identifier, mol_sequence.toUpperCase().replaceAll( "\\.", GAP_STR ) .replaceAll( RNA_REGEXP, Character.toString( UNSPECIFIED_NUC ) ), TYPE.RNA ); } diff --git a/forester/java/src/org/forester/sequence/Sequence.java b/forester/java/src/org/forester/sequence/Sequence.java index fd13abf..9a92fbe 100644 --- a/forester/java/src/org/forester/sequence/Sequence.java +++ b/forester/java/src/org/forester/sequence/Sequence.java @@ -37,7 +37,7 @@ public interface Sequence { static final String DNA_REGEXP = "[^ACGTRYMKWSN\\-\\*]"; static final String RNA_REGEXP = "[^ACGURYMKWSN\\-\\*]"; - public abstract Object getIdentifier(); + public abstract String getIdentifier(); public abstract int getLength(); diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index d6a70ee..b7eff50 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -7970,8 +7970,6 @@ public final class Test { opts.add( "1000" ); opts.add( "--localpair" ); opts.add( "--quiet" ); - - Msa msa = null; final MsaInferrer mafft = Mafft.createInstance( path ); msa = mafft.infer( new File( PATH_TO_TEST_DATA + "ncbi_sn.fasta" ), opts ); @@ -8553,9 +8551,9 @@ public final class Test { private static boolean testMsaQualityMethod() { try { final Sequence s0 = BasicSequence.createAaSequence( "a", "ABAXEFGHIJ" ); - final Sequence s1 = BasicSequence.createAaSequence( "a", "ABBXEFGHIJ" ); - final Sequence s2 = BasicSequence.createAaSequence( "a", "AXCXEFGHIJ" ); - final Sequence s3 = BasicSequence.createAaSequence( "a", "AXDDEFGHIJ" ); + final Sequence s1 = BasicSequence.createAaSequence( "b", "ABBXEFGHIJ" ); + final Sequence s2 = BasicSequence.createAaSequence( "c", "AXCXEFGHIJ" ); + final Sequence s3 = BasicSequence.createAaSequence( "d", "AXDDEFGHIJ" ); final List l = new ArrayList(); l.add( s0 ); l.add( s1 ); diff --git a/forester/java/src/org/forester/util/BasicDescriptiveStatistics.java b/forester/java/src/org/forester/util/BasicDescriptiveStatistics.java index 4d73ffb..55aa23b 100644 --- a/forester/java/src/org/forester/util/BasicDescriptiveStatistics.java +++ b/forester/java/src/org/forester/util/BasicDescriptiveStatistics.java @@ -45,6 +45,11 @@ public class BasicDescriptiveStatistics implements DescriptiveStatistics { init(); } + public BasicDescriptiveStatistics( final String desc ) { + init(); + setDescription( desc ); + } + /* (non-Javadoc) * @see org.forester.util.DescriptiveStatisticsI#addValue(double) */