From 2eb16fa8fc6fa40a6f9829a5d624ada26b026fcc Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Sat, 1 Mar 2014 01:14:08 +0000 Subject: [PATCH] inprogress --- .../src/org/forester/application/surfacing.java | 3 +- .../evoinference/TestPhylogenyReconstruction.java | 9 +++++- .../evoinference/distance/NeighborJoining.java | 19 +++++++------ .../distance/PairwiseDistanceCalculator.java | 21 +++----------- .../distance/BasicSymmetricalDistanceMatrix.java | 3 ++ .../java/src/org/forester/msa/ResampleableMsa.java | 1 - .../org/forester/msa_compactor/MsaCompactor.java | 30 ++++++++++++++------ .../surfacing/BasicDomainSimilarityCalculator.java | 26 +++++++++++++++-- .../surfacing/PairwiseGenomeComparator.java | 3 +- 9 files changed, 74 insertions(+), 41 deletions(-) diff --git a/forester/java/src/org/forester/application/surfacing.java b/forester/java/src/org/forester/application/surfacing.java index 802b1d4..847a8db 100644 --- a/forester/java/src/org/forester/application/surfacing.java +++ b/forester/java/src/org/forester/application/surfacing.java @@ -1817,7 +1817,8 @@ public class surfacing { final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( domain_similarity_sort_field, sort_by_species_count_first, number_of_genomes == 2, - CALC_SIMILARITY_SCORES ); + CALC_SIMILARITY_SCORES, + true ); switch ( scoring ) { case COMBINATIONS: pw_calc = new CombinationsBasedPairwiseDomainSimilarityCalculator(); diff --git a/forester/java/src/org/forester/evoinference/TestPhylogenyReconstruction.java b/forester/java/src/org/forester/evoinference/TestPhylogenyReconstruction.java index 3529d1e..6073985 100644 --- a/forester/java/src/org/forester/evoinference/TestPhylogenyReconstruction.java +++ b/forester/java/src/org/forester/evoinference/TestPhylogenyReconstruction.java @@ -33,6 +33,7 @@ import java.io.StringWriter; import java.util.Date; import java.util.List; +import org.forester.archaeopteryx.Archaeopteryx; import org.forester.evoinference.distance.NeighborJoining; import org.forester.evoinference.distance.PairwiseDistanceCalculator; import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix; @@ -1941,7 +1942,13 @@ public class TestPhylogenyReconstruction { m.setRow( "1.52430 1.44650 0.59580 0.46310 0.00000 0.34840 0.30830", 4 ); m.setRow( "1.60430 1.43890 0.61790 0.50610 0.34840 0.00000 0.26920", 5 ); m.setRow( "1.59050 1.46290 0.55830 0.47100 0.30830 0.26920 0.00000", 6 ); - nj.execute( m ); + System.out.println( m.toString() ); + final Phylogeny p2 = nj.execute( m ); + p2.reRoot( p2.getNode( "Bovine" ) ); + System.out.println( p2.toString() ); + // from phylip Neighbor-Joining/UPGMA method version 3.69: + // ((((((Chimp:0.15167,Human:0.11753):0.03982,Gorilla:0.15393):0.02696,Orang:0.28469):0.04648,Gibbon:0.35793):0.42027,Mouse:0.76891):0.458845,Bovine:0.458845); + Archaeopteryx.createApplication( p2 ); m = new BasicSymmetricalDistanceMatrix( 4 ); m.setIdentifier( 0, "A" ); m.setIdentifier( 1, "B" ); diff --git a/forester/java/src/org/forester/evoinference/distance/NeighborJoining.java b/forester/java/src/org/forester/evoinference/distance/NeighborJoining.java index c624bff..bcb6df1 100644 --- a/forester/java/src/org/forester/evoinference/distance/NeighborJoining.java +++ b/forester/java/src/org/forester/evoinference/distance/NeighborJoining.java @@ -58,9 +58,9 @@ public final class NeighborJoining { if ( ( i == otu1 ) || ( i == otu2 ) ) { continue; } - // _d_values[ _mappings[ otu1 ] ][ _mappings[ i ] ] = ( getValueFromD( otu1, i ) + getValueFromD( i, otu2 ) - d ) / 2; - i_m = _mappings[ i ]; - _d_values[ otu1_m ][ i_m ] = ( ( _d_values[ otu1_m ][ i_m ] + _d_values[ i_m ][ otu2_m ] ) - 2 ) / 2; + _d_values[ _mappings[ otu1 ] ][ _mappings[ i ] ] = ( getValueFromD( otu1, i ) + getValueFromD( i, otu2 ) - d ) / 2; + //i_m = _mappings[ i ]; + //_d_values[ otu1_m ][ i_m ] = ( ( _d_values[ otu1_m ][ i_m ] + _d_values[ i_m ][ otu2_m ] ) - 2 ) / 2; } } @@ -71,8 +71,8 @@ public final class NeighborJoining { d = 0; i_m = _mappings[ i ]; for( int n = 0; n < _n; ++n ) { - d += _d_values[ i_m ][ _mappings[ n ] ]; - //d += getValueFromD( i, n ); + //d += _d_values[ i_m ][ _mappings[ n ] ]; + d += getValueFromD( i, n ); } _r[ i ] = d; } @@ -101,7 +101,7 @@ public final class NeighborJoining { // It is a condition that otu1 < otu2. if ( DEBUG ) { if ( otu1 > otu2 ) { - throw new RuntimeException( "NJ code is faulty: otu1 > otu2" ); + throw new RuntimeException( "faulty NJ code: otu1 > otu2" ); } } final PhylogenyNode node = new PhylogenyNode(); @@ -194,12 +194,15 @@ public final class NeighborJoining { r_j = _r[ j ]; j_m = _mappings[ j ]; for( int i = 0; i < j; ++i ) { - // _m_values[ i ][ j ] = getValueFromD( i, j ) - ( _r[ i ] + r_j ) / ( _n - 2 ); - _m_values[ i ][ j ] = _d_values[ _mappings[ i ] ][ j_m ] - ( ( _r[ i ] + r_j ) / ( _n_2 ) ); + _m_values[ i ][ j ] = getValueFromD( i, j ) - ( _r[ i ] + r_j ) / ( _n - 2 ); + //_m_values[ i ][ j ] = _d_values[ _mappings[ i ] ][ j_m ] - ( ( _r[ i ] + r_j ) / ( _n_2 ) ); } } } + // private final double getValueFromD( final int otu1, final int otu2 ) { + // return _d_values[ _mappings[ otu1 ] ][ _mappings[ otu2 ] ]; + // } // otu2 will, in effect, be "deleted" from the matrix. private final void updateMappings( final int otu2 ) { for( int i = otu2; i < ( _mappings.length - 1 ); ++i ) { diff --git a/forester/java/src/org/forester/evoinference/distance/PairwiseDistanceCalculator.java b/forester/java/src/org/forester/evoinference/distance/PairwiseDistanceCalculator.java index 0dc6c6e..67d4b53 100644 --- a/forester/java/src/org/forester/evoinference/distance/PairwiseDistanceCalculator.java +++ b/forester/java/src/org/forester/evoinference/distance/PairwiseDistanceCalculator.java @@ -27,12 +27,10 @@ package org.forester.evoinference.distance; import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix; import org.forester.msa.Msa; -import org.forester.sequence.Sequence; public final class PairwiseDistanceCalculator { - public static final double DEFAULT_VALUE_FOR_TOO_LARGE_DISTANCE_FOR_KIMURA_FORMULA = 10; // Felsenstein uses -1 - private static final char GAP = Sequence.GAP; + public static final double DEFAULT_VALUE_FOR_TOO_LARGE_DISTANCE_FOR_KIMURA_FORMULA = 10; // Felsenstein uses -1 private final Msa _msa; private final double _value_for_too_large_distance_for_kimura_formula; @@ -44,23 +42,12 @@ public final class PairwiseDistanceCalculator { private double calcFractionalDissimilarity( final int row_1, final int row_2 ) { final int length = _msa.getLength(); int nd = 0; - int n = 0; - char aa_1; - char aa_2; for( int col = 0; col < length; ++col ) { - aa_1 = _msa.getResidueAt( row_1, col ); - aa_2 = _msa.getResidueAt( row_2, col ); - if ( ( aa_1 != GAP ) && ( aa_2 != GAP ) ) { - if ( aa_1 != aa_2 ) { - nd++; - } - n++; + if ( _msa.getResidueAt( row_1, col ) != _msa.getResidueAt( row_2, col ) ) { + ++nd; } } - if ( n == 0 ) { - return 1; - } - return ( double ) nd / n; + return ( double ) nd / length; } /** diff --git a/forester/java/src/org/forester/evoinference/matrix/distance/BasicSymmetricalDistanceMatrix.java b/forester/java/src/org/forester/evoinference/matrix/distance/BasicSymmetricalDistanceMatrix.java index 616a91c..fdf2ec4 100644 --- a/forester/java/src/org/forester/evoinference/matrix/distance/BasicSymmetricalDistanceMatrix.java +++ b/forester/java/src/org/forester/evoinference/matrix/distance/BasicSymmetricalDistanceMatrix.java @@ -110,6 +110,9 @@ public final class BasicSymmetricalDistanceMatrix implements DistanceMatrix { @Override public final void setValue( final int col, final int row, final double d ) { + if ( d < 0 ) { + throw new IllegalArgumentException( "negative distance value" ); + } if ( ( col == row ) && ( d != 0.0 ) ) { throw new IllegalArgumentException( "attempt to set a non-zero value on the diagonal of a symmetrical distance matrix" ); } diff --git a/forester/java/src/org/forester/msa/ResampleableMsa.java b/forester/java/src/org/forester/msa/ResampleableMsa.java index b7c8a7d..3823241 100644 --- a/forester/java/src/org/forester/msa/ResampleableMsa.java +++ b/forester/java/src/org/forester/msa/ResampleableMsa.java @@ -35,7 +35,6 @@ public final class ResampleableMsa extends BasicMsa { public void resample( final int[] resampled_column_positions ) { if ( resampled_column_positions.length != getLength() ) { - _resampled_column_positions = null; throw new IllegalArgumentException( "illegal attempt to use " + resampled_column_positions.length + " resampled column positions on msa of length " + getLength() ); } diff --git a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java index 5f30b5a..cc0875d 100644 --- a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java +++ b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java @@ -128,9 +128,9 @@ public class MsaCompactor { final MsaInferrer mafft = Mafft .createInstance( "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-7.130-without-extensions/scripts/mafft" ); final List opts = new ArrayList(); - // opts.add( "--maxiterate" ); - // opts.add( "1000" ); - // opts.add( "--localpair" ); + opts.add( "--maxiterate" ); + opts.add( "1000" ); + opts.add( "--localpair" ); opts.add( "--quiet" ); _msa = mafft.infer( _msa.asSequenceList(), opts ); } @@ -213,8 +213,8 @@ public class MsaCompactor { } } - Phylogeny pi() { - final Phylogeny master_phy = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, _msa ); + Phylogeny pi( String matrix ) { + final Phylogeny master_phy = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, _msa, true, matrix ); final int seed = 15; final int n = 100; final ResampleableMsa resampleable_msa = new ResampleableMsa( ( BasicMsa ) _msa ); @@ -224,14 +224,17 @@ public class MsaCompactor { final Phylogeny[] eval_phys = new Phylogeny[ n ]; for( int i = 0; i < n; ++i ) { resampleable_msa.resample( resampled_column_positions[ i ] ); - eval_phys[ i ] = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, resampleable_msa ); + eval_phys[ i ] = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, resampleable_msa, false, null ); } ConfidenceAssessor.evaluate( "bootstrap", eval_phys, master_phy, true, 1 ); PhylogenyMethods.extractFastaInformation( master_phy ); return master_phy; } - private Phylogeny inferNJphylogeny( PWD_DISTANCE_METHOD pwd_distance_method, final Msa msa ) { + private Phylogeny inferNJphylogeny( PWD_DISTANCE_METHOD pwd_distance_method, + final Msa msa, + boolean write_matrix, + String matrix_name ) { BasicSymmetricalDistanceMatrix m = null; switch ( pwd_distance_method ) { case KIMURA_DISTANCE: @@ -246,6 +249,15 @@ public class MsaCompactor { default: throw new IllegalArgumentException( "invalid pwd method" ); } + if ( write_matrix ) { + try { + m.write( ForesterUtil.createBufferedWriter( matrix_name ) ); + } + catch ( IOException e ) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } final NeighborJoining nj = NeighborJoining.createInstance(); final Phylogeny phy = nj.execute( m ); return phy; @@ -255,7 +267,7 @@ public class MsaCompactor { final int step, final boolean realign, final boolean norm ) throws IOException, InterruptedException { - final Phylogeny a = pi(); + final Phylogeny a = pi( "a.pwd" ); Archaeopteryx.createApplication( a ); final GapContribution stats[] = calcGapContribtionsStats( norm ); final List to_remove_ids = new ArrayList(); @@ -278,7 +290,7 @@ public class MsaCompactor { if ( realign ) { mafft(); } - final Phylogeny b = pi(); + final Phylogeny b = pi( "b.pwd" ); Archaeopteryx.createApplication( b ); } diff --git a/forester/java/src/org/forester/surfacing/BasicDomainSimilarityCalculator.java b/forester/java/src/org/forester/surfacing/BasicDomainSimilarityCalculator.java index b245cbc..8682fd1 100644 --- a/forester/java/src/org/forester/surfacing/BasicDomainSimilarityCalculator.java +++ b/forester/java/src/org/forester/surfacing/BasicDomainSimilarityCalculator.java @@ -46,6 +46,19 @@ public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculat private final boolean _calc_similarity_score; private final boolean _sort_by_species_count_first; private final boolean _treat_as_binary_comparison; + private final boolean _verbose; + + public BasicDomainSimilarityCalculator( final DomainSimilarity.DomainSimilaritySortField sort, + final boolean sort_by_species_count_first, + final boolean treat_as_binary_comparison, + final boolean calc_similarity_score, + final boolean verbose ) { + _sort = sort; + _sort_by_species_count_first = sort_by_species_count_first; + _treat_as_binary_comparison = treat_as_binary_comparison; + _calc_similarity_score = calc_similarity_score; + _verbose = verbose; + } public BasicDomainSimilarityCalculator( final DomainSimilarity.DomainSimilaritySortField sort, final boolean sort_by_species_count_first, @@ -55,6 +68,7 @@ public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculat _sort_by_species_count_first = sort_by_species_count_first; _treat_as_binary_comparison = treat_as_binary_comparison; _calc_similarity_score = calc_similarity_score; + _verbose = false; } @Override @@ -72,9 +86,13 @@ public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculat } final DecimalFormat pf = new java.text.DecimalFormat( "000000" ); int counter = 1; - System.out.println( keys.size() ); + if ( _verbose ) { + System.out.println( keys.size() ); + } for( final String key : keys ) { - ForesterUtil.updateProgress( counter, pf ); + if ( _verbose ) { + ForesterUtil.updateProgress( counter, pf ); + } counter++; final List same_id_cd_list = new ArrayList( cdc_list.size() ); final List species_with_key_id_domain = new ArrayList(); @@ -111,7 +129,9 @@ public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculat throw new RuntimeException( "this should not have happened" ); } } - System.out.println(); + if ( _verbose ) { + System.out.println(); + } return similarities; } diff --git a/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java b/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java index a3b0e66..d0d2590 100644 --- a/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java +++ b/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java @@ -139,7 +139,8 @@ public class PairwiseGenomeComparator { final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( domain_similarity_sort_field, sort_by_species_count_first, true, - calc_similarity_scores ); + calc_similarity_scores, + true ); final SortedSet similarities = calc .calculateSimilarities( pw_calc, genome_pair, -- 1.7.10.2