import java.io.File;
import java.io.FileInputStream;
+import java.io.InputStream;
+import org.forester.io.parsers.FastaParser;
import org.forester.io.parsers.GeneralMsaParser;
import org.forester.msa.Msa;
import org.forester.msa.MsaMethods;
System.exit( 0 );
}
Msa msa = null;
- msa = GeneralMsaParser.parse( new FileInputStream( in ) );
+ final InputStream is = new FileInputStream( in );
+ if ( FastaParser.isLikelyFasta( in ) ) {
+ msa = FastaParser.parseMsa( is );
+ }
+ else {
+ msa = GeneralMsaParser.parse( is );
+ }
if ( cla.isOptionSet( FROM_OPTION ) ) {
singleCalc( in, from, to, msa );
}
--- /dev/null
+
+package org.forester.application;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.forester.io.parsers.FastaParser;
+import org.forester.io.parsers.GeneralMsaParser;
+import org.forester.msa.Msa;
+import org.forester.msa.MsaCompactor;
+import org.forester.msa.MsaMethods;
+import org.forester.util.CommandLineArguments;
+import org.forester.util.ForesterUtil;
+
+/**
+ * Command line front end for MsaCompactor: reads a multiple sequence
+ * alignment (FASTA or general MSA format), removes sequences according to
+ * exactly one of three criteria and prints the resulting alignment, its mean
+ * gap ratio and the identifiers of the removed sequences.
+ *
+ * Criteria, in order of precedence: -w (remove a fixed number of worst
+ * offenders), -a (reduce until the mean gap ratio is reached), -l (reduce
+ * until the alignment length is reached). -r requests realignment.
+ */
+public class msa_compactor {
+
+    final static private String HELP_OPTION_1                 = "help";
+    final static private String HELP_OPTION_2                 = "h";
+    final static private String REMOVE_WORST_OFFENDERS_OPTION = "w";
+    final static private String AV_GAPINESS_OPTION            = "a";
+    final static private String LENGTH_OPTION                 = "l";
+    final static private String REALIGN_OPTION                = "r";
+    final static private String PRG_NAME                      = "msa_compactor";
+    final static private String PRG_DESC                      = "multiple sequence alignment compactor";
+    final static private String PRG_VERSION                   = "0.90";
+    final static private String PRG_DATE                      = "2012.07.11";
+    final static private String E_MAIL                        = "phylosoft@gmail.com";
+    final static private String WWW                           = "www.phylosoft.org/forester/";
+
+    /**
+     * Program entry point.
+     *
+     * @param args command line arguments: options followed by the msa input file
+     */
+    public static void main( final String args[] ) {
+        try {
+            final CommandLineArguments cla = new CommandLineArguments( args );
+            if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) ) {
+                printHelp();
+                System.exit( 0 );
+            }
+            final File in = cla.getFile( 0 );
+            int worst_remove = -1;
+            double av = -1;
+            int length = -1;
+            // Number of sequences removed per iteration by the -a and -l modes.
+            final int step = 5;
+            final List<String> allowed_options = new ArrayList<String>();
+            allowed_options.add( REMOVE_WORST_OFFENDERS_OPTION );
+            allowed_options.add( AV_GAPINESS_OPTION );
+            allowed_options.add( LENGTH_OPTION );
+            allowed_options.add( REALIGN_OPTION );
+            final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
+            if ( dissallowed_options.length() > 0 ) {
+                ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options );
+            }
+            if ( cla.isOptionSet( REMOVE_WORST_OFFENDERS_OPTION ) ) {
+                worst_remove = cla.getOptionValueAsInt( REMOVE_WORST_OFFENDERS_OPTION );
+            }
+            if ( cla.isOptionSet( AV_GAPINESS_OPTION ) ) {
+                av = cla.getOptionValueAsDouble( AV_GAPINESS_OPTION );
+            }
+            if ( cla.isOptionSet( LENGTH_OPTION ) ) {
+                length = cla.getOptionValueAsInt( LENGTH_OPTION );
+            }
+            final boolean realign = cla.isOptionSet( REALIGN_OPTION );
+            // Without a positive -w, -a or -l value there is nothing to do; bail out
+            // before touching the input instead of failing later on a null compactor.
+            if ( ( worst_remove <= 0 ) && ( av <= 0 ) && ( length <= 0 ) ) {
+                ForesterUtil.fatalError( PRG_NAME, "need to indicate one of -" + REMOVE_WORST_OFFENDERS_OPTION
+                        + ", -" + AV_GAPINESS_OPTION + ", -" + LENGTH_OPTION + " with a value greater than zero" );
+                return;
+            }
+            Msa msa = null;
+            final FileInputStream is = new FileInputStream( in );
+            try {
+                if ( FastaParser.isLikelyFasta( in ) ) {
+                    msa = FastaParser.parseMsa( is );
+                }
+                else {
+                    msa = GeneralMsaParser.parse( is );
+                }
+            }
+            finally {
+                // Release the file handle whether or not parsing succeeded.
+                is.close();
+            }
+            System.out.println( msa.toString() );
+            System.out.println( MsaMethods.calcBasicGapinessStatistics( msa ).arithmeticMean() );
+            final MsaCompactor mc;
+            if ( worst_remove > 0 ) {
+                mc = MsaCompactor.removeWorstOffenders( msa, worst_remove, realign );
+            }
+            else if ( av > 0 ) {
+                mc = MsaCompactor.reduceGapAverage( msa, av, step, realign );
+            }
+            else {
+                // length > 0 is guaranteed by the option check above.
+                mc = MsaCompactor.reduceLength( msa, length, step, realign );
+            }
+            System.out.println( mc.getMsa().toString() );
+            System.out.println( MsaMethods.calcBasicGapinessStatistics( mc.getMsa() ).arithmeticMean() );
+            for( final String id : mc.getRemovedSeqIds() ) {
+                System.out.println( id );
+            }
+        }
+        catch ( final Exception e ) {
+            e.printStackTrace();
+            ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
+        }
+    }
+
+    /**
+     * Prints program information and a usage summary covering every
+     * option accepted by main.
+     */
+    private static void printHelp() {
+        ForesterUtil.printProgramInformation( PRG_NAME,
+                                              PRG_DESC,
+                                              PRG_VERSION,
+                                              PRG_DATE,
+                                              E_MAIL,
+                                              WWW,
+                                              ForesterUtil.getForesterLibraryInformation() );
+        System.out.println( "Usage:" );
+        System.out.println();
+        System.out.println( PRG_NAME + " <options> <msa input file>" );
+        System.out.println();
+        System.out.println( " options: " );
+        System.out.println();
+        System.out.println( " -" + REMOVE_WORST_OFFENDERS_OPTION
+                + "=<integer>: number of worst offending sequences to remove" );
+        System.out.println( " -" + AV_GAPINESS_OPTION
+                + "=<decimal>: remove sequences until the mean gap ratio is not above this value" );
+        System.out.println( " -" + LENGTH_OPTION
+                + "=<integer>: remove sequences until the msa length (columns) is not above this value" );
+        System.out.println( " -" + REALIGN_OPTION + ": realign after removing sequences" );
+        System.out.println();
+        System.out.println();
+        System.out.println();
+    }
+}
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
setMsa( null );
Msa msa = null;
try {
- if ( FastaParser.isLikelyFasta( new FileInputStream( file ) ) ) {
- msa = FastaParser.parseMsa( new FileInputStream( file ) );
- System.out.println( msa.toString() );
+ final InputStream is = new FileInputStream( file );
+ if ( FastaParser.isLikelyFasta( file ) ) {
+ msa = FastaParser.parseMsa( is );
}
else {
- msa = GeneralMsaParser.parse( new FileInputStream( file ) );
+ msa = GeneralMsaParser.parse( is );
}
}
catch ( final MsaFormatException e ) {
import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
import org.forester.evoinference.tools.BootstrapResampler;
import org.forester.io.parsers.FastaParser;
-import org.forester.io.writers.SequenceWriter;
-import org.forester.io.writers.SequenceWriter.SEQ_FORMAT;
import org.forester.msa.BasicMsa;
import org.forester.msa.ClustalOmega;
import org.forester.msa.Mafft;
}
private Msa inferMsa() throws IOException, InterruptedException {
- final File temp_seqs_file = File.createTempFile( "__msa__temp__", ".fasta" );
- if ( DEBUG ) {
- System.out.println();
- System.out.println( "temp file: " + temp_seqs_file );
- System.out.println();
- }
- //final File temp_seqs_file = new File( _options.getTempDir() + ForesterUtil.FILE_SEPARATOR + "s.fasta" );
- final BufferedWriter writer = new BufferedWriter( new FileWriter( temp_seqs_file ) );
- SequenceWriter.writeSeqs( _seqs, writer, SEQ_FORMAT.FASTA, 100 );
- writer.close();
+ // final File temp_seqs_file = File.createTempFile( "__msa__temp__", ".fasta" );
+ // if ( DEBUG ) {
+ // System.out.println();
+ // System.out.println( "temp file: " + temp_seqs_file );
+ // System.out.println();
+ // }
+ // //final File temp_seqs_file = new File( _options.getTempDir() + ForesterUtil.FILE_SEPARATOR + "s.fasta" );
+ // final BufferedWriter writer = new BufferedWriter( new FileWriter( temp_seqs_file ) );
+ // SequenceWriter.writeSeqs( _seqs, writer, SEQ_FORMAT.FASTA, 100 );
+ // writer.close();
final List<String> opts = processMafftOptions();
- return runMAFFT( temp_seqs_file, opts );
+ return runMAFFT( _seqs, opts );
}
private List<String> processMafftOptions() {
}
}
- private Msa runMAFFT( final File input_seqs, final List<String> opts ) throws IOException, InterruptedException {
+ private Msa runMAFFT( final List<Sequence> seqs, final List<String> opts ) throws IOException, InterruptedException {
Msa msa = null;
final MsaInferrer mafft = Mafft.createInstance( _mf.getInferenceManager().getPathToLocalMafft()
.getCanonicalPath() );
try {
- msa = mafft.infer( input_seqs, opts );
+ msa = mafft.infer( seqs, opts );
}
catch ( final IOException e ) {
System.out.println( mafft.getErrorDescription() );
private void copyIdentifiers( final int s, final BasicSymmetricalDistanceMatrix d ) {
for( int i = 0; i < s; i++ ) {
- d.setIdentifier( i, ( String ) _msa.getIdentifier( i ) );
+ d.setIdentifier( i, _msa.getIdentifier( i ) );
}
}
}
}
+ static public boolean isLikelyFasta( final File f ) throws IOException {
+ return isLikelyFasta( new FileInputStream( f ) );
+ }
+
static public boolean isLikelyFasta( final InputStream is ) throws IOException {
final BufferedReader reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) );
String line = null;
import java.io.Writer;
import java.util.List;
-import org.forester.sequence.BasicSequence;
import org.forester.sequence.Sequence;
import org.forester.util.ForesterUtil;
FASTA;
}
- public static void main( final String[] args ) {
- final Sequence s = BasicSequence.createAaSequence( "name", "abcdefghiiklmnap" );
- System.out.println( s.toString() );
- System.out.println( SequenceWriter.toFasta( s, 0 ).toString() );
- System.out.println( SequenceWriter.toFasta( s, 5 ).toString() );
- System.out.println( SequenceWriter.toFasta( s, 8 ).toString() );
- System.out.println( SequenceWriter.toFasta( s, 4 ).toString() );
- System.out.println( SequenceWriter.toFasta( s, 3 ).toString() );
- System.out.println( SequenceWriter.toFasta( s, 2 ).toString() );
- System.out.println( SequenceWriter.toFasta( s, 1 ).toString() );
- System.out.println( SequenceWriter.toFasta( s, 100 ).toString() );
- System.out.println( SequenceWriter.toFasta( s, 15 ).toString() );
- System.out.println( SequenceWriter.toFasta( s, 16 ).toString() );
- }
-
public static StringBuilder toFasta( final Sequence seq, final int width ) {
final StringBuilder sb = new StringBuilder();
sb.append( ">" );
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
+import java.util.HashSet;
import java.util.List;
+import java.util.Set;
+import org.forester.sequence.BasicSequence;
import org.forester.sequence.Sequence;
import org.forester.sequence.Sequence.TYPE;
import org.forester.util.ForesterUtil;
public class BasicMsa implements Msa {
private final char[][] _data;
- private final Object[] _identifiers;
+ private final String[] _identifiers;
private final TYPE _type;
public BasicMsa( final int rows, final int columns, final TYPE type ) {
throw new IllegalArgumentException( "basic msa of size zero are illegal" );
}
_data = new char[ rows ][ columns ];
- _identifiers = new Object[ rows ];
+ _identifiers = new String[ rows ];
_type = type;
}
_type = msa._type;
}
+ @Override
+ public List<Sequence> asSequenceList() {
+ final List<Sequence> seqs = new ArrayList<Sequence>();
+ for( int i = 0; i < getNumberOfSequences(); ++i ) {
+ seqs.add( getSequence( i ) );
+ }
+ return seqs;
+ }
+
private int determineMaxIdLength() {
int max = 0;
for( int row = 0; row < _data.length; ++row ) {
}
@Override
- public Object getIdentifier( final int row ) {
+ public String getIdentifier( final int row ) {
return _identifiers[ row ];
}
}
@Override
+ // Returns the first row whose identifier equals id, or null if no row matches
+ // (a null id never matches). The comparison is made from the id side so
+ // that rows whose identifier has not been set yet cannot cause a
+ // NullPointerException.
+ public Sequence getSequence( final String id ) {
+     if ( id == null ) {
+         return null;
+     }
+     for( int i = 0; i < getNumberOfSequences(); ++i ) {
+         if ( id.equals( getIdentifier( i ) ) ) {
+             return getSequence( i );
+         }
+     }
+     return null;
+ }
+
+ /**
+  * Wraps the given row in a new BasicSequence carrying the row's
+  * identifier and this msa's type. The row's character array is handed to
+  * the sequence as is, not copied.
+  */
+ @Override
+ public Sequence getSequence( final int row ) {
+     final String row_id = getIdentifier( row );
+     final char[] row_residues = _data[ row ];
+     return new BasicSequence( row_id, row_residues, getType() );
+ }
+
+ @Override
public StringBuffer getSequenceAsString( final int row ) {
final StringBuffer sb = new StringBuffer( _data[ 0 ].length );
for( int col = 0; col < _data[ 0 ].length; ++col ) {
}
@Override
- public void setIdentifier( final int row, final Object id ) {
+ public void setIdentifier( final int row, final String id ) {
_identifiers[ row ] = id;
}
if ( seqs.size() < 1 ) {
throw new IllegalArgumentException( "cannot create basic msa from less than one sequence" );
}
+ final Set<String> ids = new HashSet<String>();
final int length = seqs.get( 0 ).getLength();
final BasicMsa msa = new BasicMsa( seqs.size(), length, seqs.get( 0 ).getType() );
for( int row = 0; row < seqs.size(); ++row ) {
if ( seq.getType() != msa.getType() ) {
throw new IllegalArgumentException( "illegal attempt to build msa from sequences of different type" );
}
+ if ( ids.contains( seq.getIdentifier() ) ) {
+ throw new IllegalArgumentException( "illegal attempt to create msa with non-unique identifiers ["
+ + seq.getIdentifier() + "]" );
+ }
+ ids.add( seq.getIdentifier() );
msa.setIdentifier( row, seq.getIdentifier() );
for( int col = 0; col < length; ++col ) {
msa._data[ row ][ col ] = seq.getResidueAt( col );
import java.util.List;
import org.forester.io.parsers.FastaParser;
+import org.forester.sequence.Sequence;
import org.forester.util.SystemCommandExecutor;
public final class ClustalOmega extends MsaInferrer {
_error = null;
_exit_code = -100;
}
+
+ /**
+  * Inference from an in-memory list of sequences is not implemented for
+  * Clustal Omega. Fails loudly instead of handing callers a null alignment.
+  *
+  * @throws UnsupportedOperationException always
+  */
+ @Override
+ public Msa infer( final List<Sequence> seqs, final List<String> opts ) throws IOException, InterruptedException {
+     throw new UnsupportedOperationException( "inference from a list of sequences is not implemented for Clustal Omega" );
+ }
}
package org.forester.msa;
+import java.io.BufferedWriter;
import java.io.File;
+import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.forester.io.parsers.FastaParser;
+import org.forester.io.writers.SequenceWriter;
+import org.forester.io.writers.SequenceWriter.SEQ_FORMAT;
+import org.forester.sequence.Sequence;
import org.forester.util.SystemCommandExecutor;
public final class Mafft extends MsaInferrer {
}
@Override
+ public Msa infer( final List<Sequence> seqs, final List<String> opts ) throws IOException, InterruptedException {
+ final File file = File.createTempFile( "__mafft_input_", ".fasta" );
+ file.deleteOnExit();
+ final BufferedWriter writer = new BufferedWriter( new FileWriter( file ) );
+ SequenceWriter.writeSeqs( seqs, writer, SEQ_FORMAT.FASTA, 100 );
+ writer.close();
+ final Msa msa = infer( file, opts );
+ file.delete();
+ return msa;
+ }
+
+ @Override
public Msa infer( final File path_to_input_seqs, final List<String> opts ) throws IOException, InterruptedException {
init();
final List<String> my_opts = new ArrayList<String>();
import java.io.Writer;
import java.util.List;
+import org.forester.sequence.Sequence;
import org.forester.sequence.Sequence.TYPE;
public interface Msa {
- public Object getIdentifier( int row );
+ public String getIdentifier( int row );
- public void setIdentifier( int row, Object identifier );
+ public void setIdentifier( int row, String identifier );
public int getLength();
public List<Character> getColumnAt( int col );
+ public Sequence getSequence( final String id );
+
+ public Sequence getSequence( final int row );
+
+ public List<Sequence> asSequenceList();
+
public StringBuffer getSequenceAsString( int row );
public abstract TYPE getType();
+
package org.forester.msa;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
import org.forester.sequence.Sequence;
import org.forester.util.BasicDescriptiveStatistics;
import org.forester.util.DescriptiveStatistics;
-
+import org.forester.util.ForesterUtil;
public class MsaCompactor {
-
- final private Msa _msa;
-
- public MsaCompactor( Msa msa ) {
-
+
+ private static final boolean VERBOSE = true;
+ private Msa _msa;
+ private final SortedSet<String> _removed_seq_ids;
+
+ private MsaCompactor( final Msa msa ) {
_msa = msa;
+ _removed_seq_ids = new TreeSet<String>();
+ }
+
+ /**
+  * Returns the identifiers of all sequences removed so far, in sorted
+  * order. This is the compactor's own set, not a copy.
+  */
+ final public SortedSet<String> getRemovedSeqIds() {
+     return this._removed_seq_ids;
+ }
+
+ /**
+  * Returns the msa in its current state, i.e. after whatever removal and
+  * realignment steps have been applied.
+  */
+ final public Msa getMsa() {
+     return this._msa;
+ }
+
+ /**
+  * Creates a compactor for msa and removes the given number of sequences,
+  * taking those ranked worst (highest mean gappiness of the columns they
+  * occupy) first.
+  *
+  * @param msa the alignment to compact
+  * @param worst_offenders_to_remove number of sequences to remove
+  * @param realign whether to realign after the removal
+  * @return the compactor holding the resulting msa and the removed ids
+  */
+ public final static MsaCompactor removeWorstOffenders( final Msa msa,
+                                                        final int worst_offenders_to_remove,
+                                                        final boolean realign ) throws IOException,
+         InterruptedException {
+     final MsaCompactor compactor = new MsaCompactor( msa );
+     final int unused_step = 1;
+     compactor.removeWorstOffenders( worst_offenders_to_remove, unused_step, realign );
+     return compactor;
+ }
+
+ /**
+  * Creates a compactor for msa and removes sequences, step at a time,
+  * for as long as the mean gap ratio is above max_gap_average.
+  *
+  * @param msa the alignment to compact
+  * @param max_gap_average mean gap ratio at or below which removal stops
+  * @param step number of sequences removed per iteration
+  * @param realign whether to realign after each iteration
+  * @return the compactor holding the resulting msa and the removed ids
+  */
+ public final static MsaCompactor reduceGapAverage( final Msa msa,
+                                                    final double max_gap_average,
+                                                    final int step,
+                                                    final boolean realign ) throws IOException, InterruptedException {
+     final MsaCompactor compactor = new MsaCompactor( msa );
+     compactor.removeViaGapAverage( max_gap_average, step, realign );
+     return compactor;
+ }
+
+ /**
+  * Creates a compactor for msa and removes sequences, step at a time,
+  * for as long as the msa is longer than length columns.
+  *
+  * @param msa the alignment to compact
+  * @param length msa length at or below which removal stops
+  * @param step number of sequences removed per iteration
+  * @param realign whether to realign after each iteration
+  * @return the compactor holding the resulting msa and the removed ids
+  */
+ public final static MsaCompactor reduceLength( final Msa msa,
+                                                final int length,
+                                                final int step,
+                                                final boolean realign ) throws IOException, InterruptedException {
+     final MsaCompactor compactor = new MsaCompactor( msa );
+     compactor.removeViaLength( length, step, realign );
+     return compactor;
+ }
+
+ /**
+  * Replaces the current msa by the result of MsaMethods.removeGapColumns
+  * called with a maximal gap ratio of 1 and a minimal length of 0.
+  */
+ final private void removeGapColumns() {
+     final double max_allowed_gap_ratio = 1;
+     final int min_allowed_length = 0;
+     _msa = MsaMethods.createInstance().removeGapColumns( max_allowed_gap_ratio, min_allowed_length, _msa );
+ }
+
+ /**
+  * Removes the to_remove worst ranked sequences from the current msa,
+  * records their identifiers, drops the columns that became gap-only and
+  * optionally realigns.
+  *
+  * @param to_remove number of sequences to remove; must be smaller than the
+  *        number of sequences in the msa
+  * @param step not used by this method
+  * @param realign whether to realign the remaining sequences
+  * @throws IllegalArgumentException if removal would leave no sequence
+  */
+ final private void removeWorstOffenders( final int to_remove, final int step, final boolean realign )
+         throws IOException, InterruptedException {
+     final DescriptiveStatistics stats[] = calcStats();
+     // Removing every sequence (or more than exist) would index past the
+     // ranking or leave a null msa behind; report that clearly instead.
+     if ( to_remove >= stats.length ) {
+         throw new IllegalArgumentException( "attempt to remove " + to_remove + " sequence(s) from msa with only "
+                 + stats.length + " sequence(s)" );
+     }
+     final List<String> to_remove_ids = new ArrayList<String>();
+     for( int j = 0; j < to_remove; ++j ) {
+         final String id = stats[ j ].getDescription();
+         to_remove_ids.add( id );
+         _removed_seq_ids.add( id );
+     }
+     _msa = MsaMethods.removeSequences( _msa, to_remove_ids );
+     removeGapColumns();
+     if ( realign ) {
+         mafft();
+     }
+ }
+
+ /**
+  * Realigns the sequences of the current msa with MAFFT and stores the
+  * result as the new current msa.
+  *
+  * NOTE(review): the MAFFT executable path is hard-coded to a developer's
+  * home directory; this needs to become configurable.
+  */
+ final private void mafft() throws IOException, InterruptedException {
+     final List<String> mafft_options = new ArrayList<String>( Arrays.asList( "--maxiterate",
+                                                                             "1000",
+                                                                             "--localpair",
+                                                                             "--quiet" ) );
+     final MsaInferrer inferrer = Mafft.createInstance( "/home/czmasek/bin/mafft" );
+     _msa = inferrer.infer( _msa.asSequenceList(), mafft_options );
+ }
+
+ /**
+  * Repeatedly removes the step worst ranked sequences for as long as the
+  * mean gap ratio of the msa is above mean_gapiness, optionally realigning
+  * after every round. Progress is printed when VERBOSE is set.
+  *
+  * @param mean_gapiness mean gap ratio at or below which removal stops
+  * @param step number of sequences removed per round
+  * @param realign whether to realign after every round
+  */
+ final private void removeViaGapAverage( final double mean_gapiness, final int step, final boolean realign )
+         throws IOException, InterruptedException {
+     if ( VERBOSE ) {
+         final int start_length = _msa.getLength();
+         System.out.println( "start: " + start_length + " "
+                 + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) );
+     }
+     // "removed" is only used as the label of the progress line of each round.
+     for( int removed = 0; MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean() > mean_gapiness; removed += step ) {
+         removeWorstOffenders( step, 1, false );
+         if ( realign ) {
+             mafft();
+         }
+         if ( VERBOSE ) {
+             final int current_length = _msa.getLength();
+             System.out.println( removed + ": " + current_length + " "
+                     + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) );
+         }
+     }
+ }
+
+ /**
+  * Repeatedly removes the step worst ranked sequences for as long as the
+  * msa has more than length columns, optionally realigning after every
+  * round. Progress is printed when VERBOSE is set.
+  *
+  * @param length msa length at or below which removal stops
+  * @param step number of sequences removed per round
+  * @param realign whether to realign after every round
+  */
+ final private void removeViaLength( final int length, final int step, final boolean realign ) throws IOException,
+         InterruptedException {
+     if ( VERBOSE ) {
+         final int start_length = _msa.getLength();
+         System.out.println( "start: " + start_length + " "
+                 + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) );
+     }
+     // "removed" is only used as the label of the progress line of each round.
+     for( int removed = 0; _msa.getLength() > length; removed += step ) {
+         removeWorstOffenders( step, 1, false );
+         if ( realign ) {
+             mafft();
+         }
+         if ( VERBOSE ) {
+             final int current_length = _msa.getLength();
+             System.out.println( removed + ": " + current_length + " "
+                     + ForesterUtil.round( MsaMethods.calcBasicGapinessStatistics( _msa ).arithmeticMean(), 3 ) );
+         }
+     }
+ }
+
+ /**
+  * Calculates the per-sequence statistics of the current msa and returns
+  * them ranked worst first (sorted by descending arithmetic mean).
+  */
+ final private DescriptiveStatistics[] calcStats() {
+     final DescriptiveStatistics stats[] = calc();
+     sort( stats );
+     return stats;
}
-
-
-
- private DescriptiveStatistics[] calc() {
+
+ /**
+  * Sorts the given statistics in place by descending arithmetic mean.
+  */
+ private final static void sort( final DescriptiveStatistics stats[] ) {
+     final Comparator<DescriptiveStatistics> descending_by_mean = new DescriptiveStatisticsComparator( false );
+     Arrays.sort( stats, descending_by_mean );
+ }
+
+ private final DescriptiveStatistics[] calc() {
final double gappiness[] = calcGappiness();
final DescriptiveStatistics stats[] = new DescriptiveStatistics[ _msa.getNumberOfSequences() ];
- for ( int row = 0; row < _msa.getNumberOfSequences(); ++row ) {
- stats[ row ] = new BasicDescriptiveStatistics();
+ for( int row = 0; row < _msa.getNumberOfSequences(); ++row ) {
+ stats[ row ] = new BasicDescriptiveStatistics( _msa.getIdentifier( row ) );
for( int col = 0; col < _msa.getLength(); ++col ) {
if ( _msa.getResidueAt( row, col ) != Sequence.GAP ) {
stats[ row ].addValue( gappiness[ col ] );
-
}
}
}
return stats;
}
-
- private double[] calcGappiness() {
+
+ private final double[] calcGappiness() {
final double gappiness[] = new double[ _msa.getLength() ];
- final int seqs = _msa.getNumberOfSequences();
+ final int seqs = _msa.getNumberOfSequences();
for( int i = 0; i < gappiness.length; ++i ) {
- gappiness[ i ] = ( double ) MsaMethods.calcGapSumPerColumn( _msa, i ) / _msa.getNumberOfSequences();
-
+ gappiness[ i ] = ( double ) MsaMethods.calcGapSumPerColumn( _msa, i ) / seqs;
}
return gappiness;
-
}
-
+
+ /**
+  * Orders DescriptiveStatistics by their arithmetic mean, ascending or
+  * descending. Uses Double.compare so that the ordering stays total and
+  * consistent even if a mean is NaN, which a hand-rolled comparison with
+  * less-than and greater-than would treat as equal to everything.
+  */
+ final static class DescriptiveStatisticsComparator implements Comparator<DescriptiveStatistics> {
+
+     final boolean _ascending;
+
+     /**
+      * @param ascending true to sort smallest mean first, false for largest mean first
+      */
+     public DescriptiveStatisticsComparator( final boolean ascending ) {
+         _ascending = ascending;
+     }
+
+     @Override
+     public final int compare( final DescriptiveStatistics s0, final DescriptiveStatistics s1 ) {
+         // Each mean is computed once per comparison.
+         final double mean0 = s0.arithmeticMean();
+         final double mean1 = s1.arithmeticMean();
+         return _ascending ? Double.compare( mean0, mean1 ) : Double.compare( mean1, mean0 );
+     }
+ }
}
import java.io.IOException;
import java.util.List;
+import org.forester.sequence.Sequence;
import org.forester.util.SystemCommandExecutor;
public abstract class MsaInferrer {
}
public abstract Msa infer( File path_to_input_seqs, List<String> opts ) throws IOException, InterruptedException;
+
+ public abstract Msa infer( final List<Sequence> seqs, final List<String> opts ) throws IOException,
+ InterruptedException;
}
return gap_rows;
}
- synchronized public Msa removeGapColumns( final double max_allowed_gap_ratio,
- final int min_allowed_length,
- final Msa msa ) {
+ /**
+  * Creates a new msa from copies of all sequences of msa whose identifier
+  * is not contained in to_remove_ids.
+  *
+  * @param msa the source alignment (not modified)
+  * @param to_remove_ids identifiers of the sequences to leave out
+  * @return the reduced msa, or null if no sequence remains
+  */
+ final public static Msa removeSequences( final Msa msa, final List<String> to_remove_ids ) {
+     // Hash based lookup keeps the filter linear in the number of rows,
+     // however many identifiers are to be removed.
+     final java.util.Set<String> ids = new java.util.HashSet<String>( to_remove_ids );
+     final List<Sequence> seqs = new ArrayList<Sequence>();
+     for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+         if ( !ids.contains( msa.getIdentifier( row ) ) ) {
+             seqs.add( BasicSequence.copySequence( msa.getSequence( row ) ) );
+         }
+     }
+     if ( seqs.size() < 1 ) {
+         return null;
+     }
+     return BasicMsa.createInstance( seqs );
+ }
+
+ synchronized final public Msa removeGapColumns( final double max_allowed_gap_ratio,
+ final int min_allowed_length,
+ final Msa msa ) {
init();
if ( ( max_allowed_gap_ratio < 0 ) || ( max_allowed_gap_ratio > 1 ) ) {
throw new IllegalArgumentException( "max allowed gap ration is out of range: " + max_allowed_gap_ratio );
final boolean[] delete_cols = new boolean[ msa.getLength() ];
int new_length = 0;
for( int col = 0; col < msa.getLength(); ++col ) {
- delete_cols[ col ] = ( ( double ) calcGapSumPerColumn( msa, col ) / msa.getNumberOfSequences() ) > max_allowed_gap_ratio;
+ delete_cols[ col ] = ( ( double ) calcGapSumPerColumn( msa, col ) / msa.getNumberOfSequences() ) >= max_allowed_gap_ratio;
if ( !delete_cols[ col ] ) {
++new_length;
}
public class BasicSequence implements Sequence {
private final char[] _mol_sequence;
- private final Object _identifier;
+ private final String _identifier;
private final TYPE _type;
- private BasicSequence( final Object identifier, final String mol_sequence, final TYPE type ) {
+ private BasicSequence( final String identifier, final String mol_sequence, final TYPE type ) {
_mol_sequence = mol_sequence.toCharArray();
_identifier = identifier;
_type = type;
}
// Only use if you know what you are doing!
- public BasicSequence( final Object identifier, final char[] mol_sequence, final TYPE type ) {
+ public BasicSequence( final String identifier, final char[] mol_sequence, final TYPE type ) {
_mol_sequence = mol_sequence;
_identifier = identifier;
_type = type;
}
@Override
- public Object getIdentifier() {
+ public String getIdentifier() {
return _identifier;
}
return sb.toString();
}
- public static Sequence createAaSequence( final Object identifier, final String mol_sequence ) {
+ /**
+  * Creates an independent copy of seq: same identifier and type, with a
+  * newly allocated residue array.
+  *
+  * @param seq the sequence to copy
+  * @return a new BasicSequence that shares no array with seq
+  */
+ public static Sequence copySequence( final Sequence seq ) {
+     // Fetch the source array once instead of on every loop iteration.
+     final char[] source = seq.getMolecularSequence();
+     final char[] copy = new char[ source.length ];
+     System.arraycopy( source, 0, copy, 0, source.length );
+     // Strings are immutable, so the identifier can be shared safely.
+     return new BasicSequence( seq.getIdentifier(), copy, seq.getType() );
+ }
+
+ public static Sequence createAaSequence( final String identifier, final String mol_sequence ) {
return new BasicSequence( identifier, mol_sequence.toUpperCase().replaceAll( "\\.", GAP_STR )
.replaceAll( AA_REGEXP, Character.toString( UNSPECIFIED_AA ) ), TYPE.AA );
}
- public static Sequence createDnaSequence( final Object identifier, final String mol_sequence ) {
+ public static Sequence createDnaSequence( final String identifier, final String mol_sequence ) {
return new BasicSequence( identifier, mol_sequence.toUpperCase().replaceAll( "\\.", GAP_STR )
.replaceAll( DNA_REGEXP, Character.toString( UNSPECIFIED_NUC ) ), TYPE.DNA );
}
- public static Sequence createRnaSequence( final Object identifier, final String mol_sequence ) {
+ public static Sequence createRnaSequence( final String identifier, final String mol_sequence ) {
return new BasicSequence( identifier, mol_sequence.toUpperCase().replaceAll( "\\.", GAP_STR )
.replaceAll( RNA_REGEXP, Character.toString( UNSPECIFIED_NUC ) ), TYPE.RNA );
}
static final String DNA_REGEXP = "[^ACGTRYMKWSN\\-\\*]";
static final String RNA_REGEXP = "[^ACGURYMKWSN\\-\\*]";
- public abstract Object getIdentifier();
+ public abstract String getIdentifier();
public abstract int getLength();
opts.add( "1000" );
opts.add( "--localpair" );
opts.add( "--quiet" );
-
-
Msa msa = null;
final MsaInferrer mafft = Mafft.createInstance( path );
msa = mafft.infer( new File( PATH_TO_TEST_DATA + "ncbi_sn.fasta" ), opts );
private static boolean testMsaQualityMethod() {
try {
final Sequence s0 = BasicSequence.createAaSequence( "a", "ABAXEFGHIJ" );
- final Sequence s1 = BasicSequence.createAaSequence( "a", "ABBXEFGHIJ" );
- final Sequence s2 = BasicSequence.createAaSequence( "a", "AXCXEFGHIJ" );
- final Sequence s3 = BasicSequence.createAaSequence( "a", "AXDDEFGHIJ" );
+ final Sequence s1 = BasicSequence.createAaSequence( "b", "ABBXEFGHIJ" );
+ final Sequence s2 = BasicSequence.createAaSequence( "c", "AXCXEFGHIJ" );
+ final Sequence s3 = BasicSequence.createAaSequence( "d", "AXDDEFGHIJ" );
final List<Sequence> l = new ArrayList<Sequence>();
l.add( s0 );
l.add( s1 );
init();
}
+ /**
+  * Creates initialized statistics carrying the given description.
+  *
+  * @param desc the description to set
+  */
+ public BasicDescriptiveStatistics( final String desc ) {
+     this.init();
+     this.setDescription( desc );
+ }
+
/* (non-Javadoc)
* @see org.forester.util.DescriptiveStatisticsI#addValue(double)
*/