final private static NumberFormat NF_3 = new DecimalFormat( "#.###" );
final private static NumberFormat NF_4 = new DecimalFormat( "#.####" );
- private static final boolean VERBOSE = true;
+ private static final boolean VERBOSE = false;
private Msa _msa;
- private final SortedSet<String> _removed_seq_ids;
+ private File _out_file_base;
private String _path_to_mafft;
+ private final SortedSet<String> _removed_seq_ids;
static {
NF_4.setRoundingMode( RoundingMode.HALF_UP );
NF_3.setRoundingMode( RoundingMode.HALF_UP );
return _removed_seq_ids;
}
- final public void writeMsa( final File outfile, final MSA_FORMAT format, final String suffix ) throws IOException {
+ /**
+  * Sets the base file from which the names of written MSA output files are derived.
+  *
+  * @param out_file_base the output file base; stored as-is, without validation
+  */
+ final public void setOutFileBase( final File out_file_base ) {
+     this._out_file_base = out_file_base;
+ }
+
+ /**
+  * Writes the current MSA to a file whose name is built from outfile, the number of
+  * sequences, the alignment length, the gap ratio times 100 (rounded to an int), and suffix.
+  *
+  * @param outfile base of the output name (concatenated into the name as a string)
+  * @param format  format handed to the String-based writeMsa overload
+  * @param suffix  appended to the generated name to form the name that is actually written
+  * @return the generated name without the suffix
+  * @throws IOException declared by the signature; presumably raised by the underlying write
+  */
+ final public String writeMsa( final File outfile, final MSA_FORMAT format, final String suffix ) throws IOException {
final Double gr = MsaMethods.calcGapRatio( _msa );
- writeMsa( outfile + "_" + _msa.getNumberOfSequences() + "_" + _msa.getLength() + "_"
- + ForesterUtil.roundToInt( gr * 100 ) + suffix,
- format );
+ final String s = outfile + "_" + _msa.getNumberOfSequences() + "_" + _msa.getLength() + "_"
+ + ForesterUtil.roundToInt( gr * 100 );
+ // The suffix is only used for the file on disk; the returned name deliberately omits it.
+ writeMsa( s + suffix, format );
+ return s;
}
final int calcNonGapResidues( final Sequence seq ) {
return ng;
}
+ Phylogeny pi( final String matrix ) {
+ final Phylogeny master_phy = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, _msa, true, matrix );
+ final int seed = 15;
+ final int n = 100;
+ final ResampleableMsa resampleable_msa = new ResampleableMsa( ( BasicMsa ) _msa );
+ final int[][] resampled_column_positions = BootstrapResampler.createResampledColumnPositions( _msa.getLength(),
+ n,
+ seed );
+ final Phylogeny[] eval_phys = new Phylogeny[ n ];
+ for( int i = 0; i < n; ++i ) {
+ resampleable_msa.resample( resampled_column_positions[ i ] );
+ eval_phys[ i ] = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, resampleable_msa, false, null );
+ }
+ ConfidenceAssessor.evaluate( "bootstrap", eval_phys, master_phy, true, 1 );
+ PhylogenyMethods.extractFastaInformation( master_phy );
+ return master_phy;
+ }
+
private final GapContribution[] calcGapContribtions( final boolean normalize_for_effective_seq_length ) {
final double gappiness[] = calcGappiness();
final GapContribution stats[] = new GapContribution[ _msa.getNumberOfSequences() ];
return stats;
}
- private static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) {
- final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
- for( int c = from; c <= to; ++c ) {
- stats.addValue( MsaMethods.calculateIdentityRatio( msa, c ) );
- }
- return stats;
- }
-
private final double[] calcGappiness() {
final int l = _msa.getLength();
final double gappiness[] = new double[ l ];
return gappiness;
}
- // Returns null if not path found.
- final public static String guessPathToMafft() {
- String path;
- if ( ForesterUtil.OS_NAME.toLowerCase().indexOf( "win" ) >= 0 ) {
- path = "C:\\Program Files\\mafft-win\\mafft.bat";
- if ( MsaInferrer.isInstalled( path ) ) {
- return path;
- }
- }
- path = "/usr/local/bin/mafft";
- if ( MsaInferrer.isInstalled( path ) ) {
- return path;
- }
- path = "/usr/bin/mafft";
- if ( MsaInferrer.isInstalled( path ) ) {
- return path;
- }
- path = "/bin/mafft";
- if ( MsaInferrer.isInstalled( path ) ) {
- return path;
+ /**
+  * Computes a pairwise distance matrix for msa with the selected method, optionally writes
+  * that matrix to matrix_name, and runs NeighborJoiningF on it.
+  *
+  * @param pwd_distance_method which PairwiseDistanceCalculator routine to use
+  * @param msa                 the alignment the distances are calculated from
+  * @param write_matrix        if true, the matrix is written to matrix_name before tree inference
+  * @param matrix_name         name of the matrix output; only read when write_matrix is true
+  * @return the phylogeny returned by NeighborJoiningF.execute
+  * @throws IllegalArgumentException if pwd_distance_method is none of the three handled constants
+  */
+ private Phylogeny inferNJphylogeny( final PWD_DISTANCE_METHOD pwd_distance_method,
+ final Msa msa,
+ final boolean write_matrix,
+ final String matrix_name ) {
+ BasicSymmetricalDistanceMatrix m = null;
+ switch ( pwd_distance_method ) {
+ case KIMURA_DISTANCE:
+ m = PairwiseDistanceCalculator.calcKimuraDistances( msa );
+ break;
+ case POISSON_DISTANCE:
+ m = PairwiseDistanceCalculator.calcPoissonDistances( msa );
+ break;
+ case FRACTIONAL_DISSIMILARITY:
+ m = PairwiseDistanceCalculator.calcFractionalDissimilarities( msa );
+ break;
+ default:
+ throw new IllegalArgumentException( "invalid pwd method" );
}
- path = "mafft";
- if ( MsaInferrer.isInstalled( path ) ) {
- return path;
+ if ( write_matrix ) {
+ try {
+ // NOTE(review): the writer created here is not closed in this method; presumably write() closes it -- confirm.
+ m.write( ForesterUtil.createBufferedWriter( matrix_name ) );
+ }
+ catch ( final IOException e ) {
+ // NOTE(review): a failed matrix write is only reported via the stack trace; tree inference still proceeds.
+ e.printStackTrace();
+ }
}
- return null;
- }
-
- final private void mafft() throws IOException, InterruptedException {
- // final MsaInferrer mafft = Mafft
- // .createInstance( "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft" );
- final MsaInferrer mafft = Mafft.createInstance( _path_to_mafft );
- final List<String> opts = new ArrayList<String>();
- opts.add( "--maxiterate" );
- opts.add( "1000" );
- opts.add( "--localpair" );
- opts.add( "--quiet" );
- _msa = mafft.infer( _msa.asSequenceList(), opts );
+ // NOTE(review): the meaning of the createInstance arguments (false, 5) is not visible here -- confirm.
+ final NeighborJoiningF nj = NeighborJoiningF.createInstance( false, 5 );
+ final Phylogeny phy = nj.execute( m );
+ return phy;
}
private StringBuilder msaStatsAsSB() {
return sb;
}
+ /**
+  * Re-infers the alignment by running MAFFT, located via _path_to_mafft, on the sequences
+  * of the current MSA, and replaces _msa with the result.
+  *
+  * @throws IOException          declared; presumably raised when running MAFFT fails
+  * @throws InterruptedException declared; presumably raised when the MAFFT run is interrupted
+  */
+ final private void realignWithMafft() throws IOException, InterruptedException {
+     final List<String> mafft_options = new ArrayList<String>();
+     mafft_options.add( "--maxiterate" );
+     mafft_options.add( "1000" );
+     mafft_options.add( "--localpair" );
+     mafft_options.add( "--quiet" );
+     final MsaInferrer inferrer = Mafft.createInstance( _path_to_mafft );
+     _msa = inferrer.infer( _msa.asSequenceList(), mafft_options );
+ }
+
/**
 * Replaces _msa with the result of MsaMethods.removeGapColumns( 1, 0, _msa ).
 * NOTE(review): presumably 1 is the allowed gap ratio (so only all-gap columns go) and
 * 0 a minimal length -- confirm against MsaMethods.
 */
final private void removeGapColumns() {
    this._msa = MsaMethods.createInstance().removeGapColumns( 1, 0, this._msa );
}
int counter = step;
double gr;
do {
- removeWorstOffenders( step, 1, false, false );
+ removeWorstOffenders( step, 1, false, false, false );
if ( realign ) {
- mafft();
+ realignWithMafft();
}
gr = MsaMethods.calcGapRatio( _msa );
if ( VERBOSE ) {
}
int counter = step;
while ( _msa.getLength() > length ) {
- removeWorstOffenders( step, 1, false, false );
+ removeWorstOffenders( step, 1, false, false, false );
if ( realign ) {
- mafft();
+ realignWithMafft();
}
if ( VERBOSE ) {
System.out.println( counter + ": " + msaStatsAsSB() );
}
}
- Phylogeny pi( final String matrix ) {
- final Phylogeny master_phy = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, _msa, true, matrix );
- final int seed = 15;
- final int n = 100;
- final ResampleableMsa resampleable_msa = new ResampleableMsa( ( BasicMsa ) _msa );
- final int[][] resampled_column_positions = BootstrapResampler.createResampledColumnPositions( _msa.getLength(),
- n,
- seed );
- final Phylogeny[] eval_phys = new Phylogeny[ n ];
- for( int i = 0; i < n; ++i ) {
- resampleable_msa.resample( resampled_column_positions[ i ] );
- eval_phys[ i ] = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, resampleable_msa, false, null );
- }
- ConfidenceAssessor.evaluate( "bootstrap", eval_phys, master_phy, true, 1 );
- PhylogenyMethods.extractFastaInformation( master_phy );
- return master_phy;
- }
-
- private Phylogeny inferNJphylogeny( final PWD_DISTANCE_METHOD pwd_distance_method,
- final Msa msa,
- final boolean write_matrix,
- final String matrix_name ) {
- BasicSymmetricalDistanceMatrix m = null;
- switch ( pwd_distance_method ) {
- case KIMURA_DISTANCE:
- m = PairwiseDistanceCalculator.calcKimuraDistances( msa );
- break;
- case POISSON_DISTANCE:
- m = PairwiseDistanceCalculator.calcPoissonDistances( msa );
- break;
- case FRACTIONAL_DISSIMILARITY:
- m = PairwiseDistanceCalculator.calcFractionalDissimilarities( msa );
- break;
- default:
- throw new IllegalArgumentException( "invalid pwd method" );
- }
- if ( write_matrix ) {
- try {
- m.write( ForesterUtil.createBufferedWriter( matrix_name ) );
- }
- catch ( final IOException e ) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- }
- final NeighborJoiningF nj = NeighborJoiningF.createInstance( false, 5 );
- final Phylogeny phy = nj.execute( m );
- return phy;
- }
-
+ /**
+  * Removes the sequences named by the first to_remove entries of calcGapContribtionsStats( norm )
+  * from the MSA, one at a time, removing gap columns after each removal. The removed ids are
+  * also recorded in _removed_seq_ids. After every step-th removal, and after the last one,
+  * the MSA is optionally realigned with MAFFT and written via writeOutfile().
+  *
+  * @param to_remove number of sequences to remove.
+  *                  NOTE(review): not checked against the length of the stats array.
+  * @param step      realign/write interval, counted in removed sequences.
+  *                  NOTE(review): used as a modulus, so 0 raises an ArithmeticException.
+  * @param realign   if true, realignWithMafft() runs before each outfile is written
+  * @param norm      passed through to calcGapContribtionsStats
+  * @param verbose   if true, prints one line per removed sequence (id, MSA stats, outfile name)
+  * @throws IOException          from realignment or from writing the outfile
+  * @throws InterruptedException from realignment
+  */
final private void removeWorstOffenders( final int to_remove,
final int step,
final boolean realign,
- final boolean norm ) throws IOException, InterruptedException {
- //final Phylogeny a = pi( "a.pwd" );
- //Archaeopteryx.createApplication( a );
+ final boolean norm,
+ final boolean verbose ) throws IOException, InterruptedException {
+ // NOTE(review): presumably stats is ordered worst offender first -- confirm in calcGapContribtionsStats.
final GapContribution stats[] = calcGapContribtionsStats( norm );
final List<String> to_remove_ids = new ArrayList<String>();
for( int j = 0; j < to_remove; ++j ) {
to_remove_ids.add( stats[ j ].getId() );
_removed_seq_ids.add( stats[ j ].getId() );
}
- //TODO if verbose/interactive
- for( final String id : to_remove_ids ) {
+ for( int i = 0; i < to_remove_ids.size(); ++i ) {
+ final String id = to_remove_ids.get( i );
_msa = MsaMethods.removeSequence( _msa, id );
removeGapColumns();
- //System.out.print( id );
- System.out.print( ForesterUtil.pad( id, 20, ' ', false ) );
- System.out.print( "\t" );
- final StringBuilder sb = msaStatsAsSB();
- System.out.println( sb );
- }
- //TODO else:
- //_msa = MsaMethods.removeSequences( _msa, to_remove_ids );
- //removeGapColumns();
- if ( realign ) {
- mafft();
+ if ( verbose ) {
+ System.out.print( ForesterUtil.pad( id, 20, ' ', false ) );
+ System.out.print( "\t" );
+ final StringBuilder sb = msaStatsAsSB();
+ System.out.print( sb );
+ System.out.print( "\t" );
+ }
+ // Checkpoint: every step-th removed sequence, and always after the final one.
+ if ( ( ( ( i + 1 ) % step ) == 0 ) || ( i == ( to_remove_ids.size() - 1 ) ) ) {
+ if ( realign ) {
+ realignWithMafft();
+ }
+ final String s = writeOutfile();
+ if ( verbose ) {
+ System.out.print( "-> " + s );
+ }
+ }
+ if ( verbose ) {
+ System.out.println();
+ }
}
- //final Phylogeny b = pi( "b.pwd" );
- //Archaeopteryx.createApplication( b );
+ }
+
+ /**
+  * Sets the path of the MAFFT executable used for realignment.
+  *
+  * @param path_to_mafft path later handed to Mafft.createInstance; stored without validation
+  */
+ private void setPathToMafft( final String path_to_mafft ) {
+     this._path_to_mafft = path_to_mafft;
}
final private void writeMsa( final String outfile, final MSA_FORMAT format ) throws IOException {
w.close();
}
+ /**
+  * Writes the current MSA in PHYLIP format, naming the file after _out_file_base with
+  * the suffix ".aln".
+  *
+  * @return the name returned by writeMsa, i.e. the generated file name without the suffix
+  * @throws IOException propagated from writeMsa
+  */
+ private String writeOutfile() throws IOException {
+     return writeMsa( _out_file_base, MSA_FORMAT.PHYLIP, ".aln" );
+ }
+
+ /**
+  * Tries to locate a MAFFT executable by probing a fixed list of candidate paths with
+  * MsaInferrer.isInstalled. On an OS whose name contains "win", the Windows batch file
+  * location is probed first; the Unix-style locations and the bare command name "mafft"
+  * are probed afterwards on every OS.
+  *
+  * @return the first candidate path reported as installed, or null if none is
+  */
+ final public static String guessPathToMafft() {
+     // Locale.ROOT keeps the OS-name check independent of the JVM's default locale.
+     if ( ForesterUtil.OS_NAME.toLowerCase( java.util.Locale.ROOT ).indexOf( "win" ) >= 0 ) {
+         final String windows_path = "C:\\Program Files\\mafft-win\\mafft.bat";
+         if ( MsaInferrer.isInstalled( windows_path ) ) {
+             return windows_path;
+         }
+     }
+     // Probed in order; the bare command name comes last.
+     final String[] candidates = { "/usr/local/bin/mafft", "/usr/bin/mafft", "/bin/mafft", "mafft" };
+     for( final String candidate : candidates ) {
+         if ( MsaInferrer.isInstalled( candidate ) ) {
+             return candidate;
+         }
+     }
+     return null;
+ }
+
+ /**
+  * Creates an MsaCompactor for msa, configures it, and runs removeViaGapAverage on it.
+  * The MAFFT path is only set when realign is true; the out file base is always set.
+  *
+  * @param msa                      the alignment to compact
+  * @param max_gap_average          passed to removeViaGapAverage
+  * @param step                     passed to removeViaGapAverage
+  * @param realign                  whether realignment is requested; also gates setPathToMafft
+  * @param minimal_effective_length passed to removeViaGapAverage
+  * @param path_to_mafft            MAFFT location; ignored when realign is false
+  * @param out                      used as out file base and also passed to removeViaGapAverage
+  * @return the compactor after removeViaGapAverage has run
+  * @throws IOException          declared; propagated from removeViaGapAverage
+  * @throws InterruptedException declared; propagated from removeViaGapAverage
+  */
public final static MsaCompactor reduceGapAverage( final Msa msa,
final double max_gap_average,
final int step,
final boolean realign,
- final File out,
final int minimal_effective_length,
- final String path_to_mafft ) throws IOException,
- InterruptedException {
+ final String path_to_mafft,
+ final File out ) throws IOException, InterruptedException {
final MsaCompactor mc = new MsaCompactor( msa );
if ( realign ) {
mc.setPathToMafft( path_to_mafft );
}
+ mc.setOutFileBase( out );
mc.removeViaGapAverage( max_gap_average, step, realign, out, minimal_effective_length );
return mc;
}
final int length,
final int step,
final boolean realign,
- final String path_to_mafft ) throws IOException,
- InterruptedException {
+ final String path_to_mafft,
+ final File out ) throws IOException, InterruptedException {
final MsaCompactor mc = new MsaCompactor( msa );
if ( realign ) {
mc.setPathToMafft( path_to_mafft );
}
+ mc.setOutFileBase( out );
mc.removeViaLength( length, step, realign );
return mc;
}
+ /**
+  * Creates an MsaCompactor for msa, configures it, and runs the instance-level
+  * removeWorstOffenders on it with verbose output switched on.
+  * The MAFFT path is only set when realign is true; the out file base is always set.
+  *
+  * @param msa                       the alignment to compact
+  * @param worst_offenders_to_remove number of sequences to remove
+  * @param step                      realign/write interval handed to the instance method
+  * @param realign                   whether to realign; also gates setPathToMafft
+  * @param norm                      handed through to the instance method
+  * @param path_to_mafft             MAFFT location; ignored when realign is false
+  * @param out                       out file base for the written alignments
+  * @return the compactor after the removal has run
+  * @throws IOException          declared; propagated from the instance method
+  * @throws InterruptedException declared; propagated from the instance method
+  */
public final static MsaCompactor removeWorstOffenders( final Msa msa,
final int worst_offenders_to_remove,
+ final int step,
final boolean realign,
final boolean norm,
- final String path_to_mafft ) throws IOException,
- InterruptedException {
+ final String path_to_mafft,
+ final File out ) throws IOException, InterruptedException {
final MsaCompactor mc = new MsaCompactor( msa );
if ( realign ) {
mc.setPathToMafft( path_to_mafft );
}
- mc.removeWorstOffenders( worst_offenders_to_remove, 1, realign, norm );
+ mc.setOutFileBase( out );
+ mc.removeWorstOffenders( worst_offenders_to_remove, step, realign, norm, true );
return mc;
}
- private void setPathToMafft( final String path_to_mafft ) {
- _path_to_mafft = path_to_mafft;
+ /**
+  * Collects the values of MsaMethods.calculateIdentityRatio for the columns from..to
+  * (both bounds inclusive) of msa into a new BasicDescriptiveStatistics.
+  *
+  * @param from first column index to include
+  * @param to   last column index to include; nothing is added when to is smaller than from
+  * @param msa  the alignment whose columns are evaluated
+  * @return the statistics object holding one value per visited column
+  */
+ private static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) {
+     final DescriptiveStatistics ratios = new BasicDescriptiveStatistics();
+     int column = from;
+     while ( column <= to ) {
+         ratios.addValue( MsaMethods.calculateIdentityRatio( msa, column ) );
+         ++column;
+     }
+     return ratios;
}
}