X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fmsa_compactor%2FMsaCompactor.java;h=5d3a55c6d229e51887e8800aa308f3a0620fe33c;hb=6479c35c4734850f517a6ef8de0fce500fdd6693;hp=5f30b5afc69e21255545e87e82822c1279307897;hpb=ec7d093adc7417a551b42968816ec2e8573239d1;p=jalview.git diff --git a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java index 5f30b5a..5d3a55c 100644 --- a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java +++ b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java @@ -13,8 +13,7 @@ import java.util.List; import java.util.SortedSet; import java.util.TreeSet; -import org.forester.archaeopteryx.Archaeopteryx; -import org.forester.evoinference.distance.NeighborJoining; +import org.forester.evoinference.distance.NeighborJoiningF; import org.forester.evoinference.distance.PairwiseDistanceCalculator; import org.forester.evoinference.distance.PairwiseDistanceCalculator.PWD_DISTANCE_METHOD; import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix; @@ -30,6 +29,8 @@ import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; import org.forester.sequence.Sequence; import org.forester.tools.ConfidenceAssessor; +import org.forester.util.BasicDescriptiveStatistics; +import org.forester.util.DescriptiveStatistics; import org.forester.util.ForesterUtil; public class MsaCompactor { @@ -39,6 +40,7 @@ public class MsaCompactor { private static final boolean VERBOSE = true; private Msa _msa; private final SortedSet _removed_seq_ids; + private String _path_to_mafft; static { NF_4.setRoundingMode( RoundingMode.HALF_UP ); NF_3.setRoundingMode( RoundingMode.HALF_UP ); @@ -97,19 +99,27 @@ public class MsaCompactor { final private GapContribution[] calcGapContribtionsStats( final boolean norm ) { final GapContribution stats[] = calcGapContribtions( norm ); Arrays.sort( 
stats ); - for( final GapContribution stat : stats ) { - final StringBuilder sb = new StringBuilder(); - sb.append( stat.getId() ); - sb.append( "\t" ); - sb.append( NF_4.format( stat.getValue() ) ); - sb.append( "\t" ); - // sb.append( NF_4.format( stat.median() ) ); - // sb.append( "\t" ); - // sb.append( NF_4.format( stat.getMin() ) ); - // sb.append( "\t" ); - // sb.append( NF_4.format( stat.getMax() ) ); - //sb.append( "\t" ); - System.out.println( sb ); + // for( final GapContribution stat : stats ) { + // final StringBuilder sb = new StringBuilder(); + // sb.append( stat.getId() ); + // sb.append( "\t" ); + // sb.append( NF_4.format( stat.getValue() ) ); + // sb.append( "\t" ); + // sb.append( NF_4.format( stat.median() ) ); + // sb.append( "\t" ); + // sb.append( NF_4.format( stat.getMin() ) ); + // sb.append( "\t" ); + // sb.append( NF_4.format( stat.getMax() ) ); + //sb.append( "\t" ); + //System.out.println( sb ); + // } + return stats; + } + + private static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( int c = from; c <= to; ++c ) { + stats.addValue( MsaMethods.calculateIdentityRatio( msa, c ) ); } return stats; } @@ -124,13 +134,42 @@ public class MsaCompactor { return gappiness; } + // Returns null if no path is found. 
+ final public static String guessPathToMafft() { + String path; + if ( ForesterUtil.OS_NAME.toLowerCase().indexOf( "win" ) >= 0 ) { + path = "C:\\Program Files\\mafft-win\\mafft.bat"; + if ( MsaInferrer.isInstalled( path ) ) { + return path; + } + } + path = "/usr/local/bin/mafft"; + if ( MsaInferrer.isInstalled( path ) ) { + return path; + } + path = "/usr/bin/mafft"; + if ( MsaInferrer.isInstalled( path ) ) { + return path; + } + path = "/bin/mafft"; + if ( MsaInferrer.isInstalled( path ) ) { + return path; + } + path = "mafft"; + if ( MsaInferrer.isInstalled( path ) ) { + return path; + } + return null; + } + final private void mafft() throws IOException, InterruptedException { - final MsaInferrer mafft = Mafft - .createInstance( "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft" ); + // final MsaInferrer mafft = Mafft + // .createInstance( "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft" ); + final MsaInferrer mafft = Mafft.createInstance( _path_to_mafft ); final List opts = new ArrayList(); - // opts.add( "--maxiterate" ); - // opts.add( "1000" ); - // opts.add( "--localpair" ); + opts.add( "--maxiterate" ); + opts.add( "1000" ); + opts.add( "--localpair" ); opts.add( "--quiet" ); _msa = mafft.infer( _msa.asSequenceList(), opts ); } @@ -142,6 +181,8 @@ public class MsaCompactor { sb.append( _msa.getLength() ); sb.append( "\t" ); sb.append( NF_3.format( MsaMethods.calcGapRatio( _msa ) ) ); + sb.append( "\t" ); + sb.append( NF_3.format( calculateIdentityRatio( 0, _msa.getLength() - 1, _msa ).arithmeticMean() ) ); return sb; } @@ -213,8 +254,8 @@ public class MsaCompactor { } } - Phylogeny pi() { - final Phylogeny master_phy = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, _msa ); + Phylogeny pi( final String matrix ) { + final Phylogeny master_phy = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, _msa, true, matrix ); final int seed = 15; final int n = 100; final ResampleableMsa 
resampleable_msa = new ResampleableMsa( ( BasicMsa ) _msa ); @@ -224,14 +265,17 @@ public class MsaCompactor { final Phylogeny[] eval_phys = new Phylogeny[ n ]; for( int i = 0; i < n; ++i ) { resampleable_msa.resample( resampled_column_positions[ i ] ); - eval_phys[ i ] = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, resampleable_msa ); + eval_phys[ i ] = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, resampleable_msa, false, null ); } ConfidenceAssessor.evaluate( "bootstrap", eval_phys, master_phy, true, 1 ); PhylogenyMethods.extractFastaInformation( master_phy ); return master_phy; } - private Phylogeny inferNJphylogeny( PWD_DISTANCE_METHOD pwd_distance_method, final Msa msa ) { + private Phylogeny inferNJphylogeny( final PWD_DISTANCE_METHOD pwd_distance_method, + final Msa msa, + final boolean write_matrix, + final String matrix_name ) { BasicSymmetricalDistanceMatrix m = null; switch ( pwd_distance_method ) { case KIMURA_DISTANCE: @@ -246,7 +290,16 @@ public class MsaCompactor { default: throw new IllegalArgumentException( "invalid pwd method" ); } - final NeighborJoining nj = NeighborJoining.createInstance(); + if ( write_matrix ) { + try { + m.write( ForesterUtil.createBufferedWriter( matrix_name ) ); + } + catch ( final IOException e ) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + final NeighborJoiningF nj = NeighborJoiningF.createInstance( false, 5 ); final Phylogeny phy = nj.execute( m ); return phy; } @@ -255,19 +308,20 @@ public class MsaCompactor { final int step, final boolean realign, final boolean norm ) throws IOException, InterruptedException { - final Phylogeny a = pi(); - Archaeopteryx.createApplication( a ); + //final Phylogeny a = pi( "a.pwd" ); + //Archaeopteryx.createApplication( a ); final GapContribution stats[] = calcGapContribtionsStats( norm ); final List to_remove_ids = new ArrayList(); for( int j = 0; j < to_remove; ++j ) { to_remove_ids.add( stats[ j ].getId() ); _removed_seq_ids.add( stats[ 
j ].getId() ); } - //TODO if verbose/interactve + //TODO if verbose/interactive for( final String id : to_remove_ids ) { _msa = MsaMethods.removeSequence( _msa, id ); removeGapColumns(); - System.out.print( id ); + //System.out.print( id ); + System.out.print( ForesterUtil.pad( id, 20, ' ', false ) ); System.out.print( "\t" ); final StringBuilder sb = msaStatsAsSB(); System.out.println( sb ); @@ -278,8 +332,8 @@ public class MsaCompactor { if ( realign ) { mafft(); } - final Phylogeny b = pi(); - Archaeopteryx.createApplication( b ); + //final Phylogeny b = pi( "b.pwd" ); + //Archaeopteryx.createApplication( b ); } final private void writeMsa( final String outfile, final MSA_FORMAT format ) throws IOException { @@ -293,9 +347,13 @@ public class MsaCompactor { final int step, final boolean realign, final File out, - final int minimal_effective_length ) throws IOException, + final int minimal_effective_length, + final String path_to_mafft ) throws IOException, InterruptedException { final MsaCompactor mc = new MsaCompactor( msa ); + if ( realign ) { + mc.setPathToMafft( path_to_mafft ); + } mc.removeViaGapAverage( max_gap_average, step, realign, out, minimal_effective_length ); return mc; } @@ -303,8 +361,13 @@ public class MsaCompactor { public final static MsaCompactor reduceLength( final Msa msa, final int length, final int step, - final boolean realign ) throws IOException, InterruptedException { + final boolean realign, + final String path_to_mafft ) throws IOException, + InterruptedException { final MsaCompactor mc = new MsaCompactor( msa ); + if ( realign ) { + mc.setPathToMafft( path_to_mafft ); + } mc.removeViaLength( length, step, realign ); return mc; } @@ -312,10 +375,18 @@ public class MsaCompactor { public final static MsaCompactor removeWorstOffenders( final Msa msa, final int worst_offenders_to_remove, final boolean realign, - final boolean norm ) throws IOException, + final boolean norm, + final String path_to_mafft ) throws IOException, 
InterruptedException { final MsaCompactor mc = new MsaCompactor( msa ); + if ( realign ) { + mc.setPathToMafft( path_to_mafft ); + } mc.removeWorstOffenders( worst_offenders_to_remove, 1, realign, norm ); return mc; } + + private void setPathToMafft( final String path_to_mafft ) { + _path_to_mafft = path_to_mafft; + } }