From ec7d093adc7417a551b42968816ec2e8573239d1 Mon Sep 17 00:00:00 2001
From: "cmzmasek@gmail.com"
Date: Fri, 28 Feb 2014 02:24:30 +0000
Subject: [PATCH] inprogress

---
Reviewer notes (text in this area is not part of the commit message):
  * extractFastaInformation( Phylogeny ) is removed from PhylogeneticInferrer
    and added, with the same body, to PhylogenyMethods; the one call site in
    PhylogeneticInferrer is switched to PhylogenyMethods.
  * MsaCompactor gains pi() and inferNJphylogeny(). pi() builds a
    neighbor-joining tree from _msa using KIMURA_DISTANCE, builds n = 100 more
    trees from resampled column positions (seed = 15), hands all of them to
    ConfidenceAssessor.evaluate( "bootstrap", ... ) and returns the master tree.
  * removeWorstOffenders() now calls pi() and Archaeopteryx.createApplication()
    once on entry and once at the end.
  * NOTE(review): this file had lost its line breaks; line structure was
    restored from the hunk headers, but leading indentation is reconstructed
    (4-space convention) and may not match the target tree byte-for-byte.
    If context fails to match, try `git apply --ignore-whitespace`.

 .../archaeopteryx/tools/PhylogeneticInferrer.java  |   40 +--------------
 .../org/forester/msa_compactor/MsaCompactor.java   |   53 ++++++++++++++++++++
 .../org/forester/phylogeny/PhylogenyMethods.java   |   34 +++++++++++++
 3 files changed, 89 insertions(+), 38 deletions(-)

diff --git a/forester/java/src/org/forester/archaeopteryx/tools/PhylogeneticInferrer.java b/forester/java/src/org/forester/archaeopteryx/tools/PhylogeneticInferrer.java
index 1144fcb..e6fe009 100644
--- a/forester/java/src/org/forester/archaeopteryx/tools/PhylogeneticInferrer.java
+++ b/forester/java/src/org/forester/archaeopteryx/tools/PhylogeneticInferrer.java
@@ -30,7 +30,6 @@ import java.io.FileWriter;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.regex.Matcher;
 
 import javax.swing.JOptionPane;
 
@@ -39,7 +38,6 @@ import org.forester.evoinference.distance.NeighborJoining;
 import org.forester.evoinference.distance.PairwiseDistanceCalculator;
 import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
 import org.forester.evoinference.tools.BootstrapResampler;
-import org.forester.io.parsers.FastaParser;
 import org.forester.msa.BasicMsa;
 import org.forester.msa.Mafft;
 import org.forester.msa.Msa;
@@ -48,9 +46,7 @@ import org.forester.msa.MsaInferrer;
 import org.forester.msa.MsaMethods;
 import org.forester.msa.ResampleableMsa;
 import org.forester.phylogeny.Phylogeny;
-import org.forester.phylogeny.PhylogenyNode;
-import org.forester.phylogeny.data.Accession;
-import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
+import org.forester.phylogeny.PhylogenyMethods;
 import org.forester.sequence.Sequence;
 import org.forester.tools.ConfidenceAssessor;
 import org.forester.util.ForesterUtil;
@@ -148,7 +144,7 @@ public class PhylogeneticInferrer extends RunnableProcess {
         }
         final NeighborJoining nj = NeighborJoining.createInstance();
         final Phylogeny phy = nj.execute( m );
-        PhylogeneticInferrer.extractFastaInformation( phy );
+        PhylogenyMethods.extractFastaInformation( phy );
         return phy;
     }
 
@@ -294,38 +290,6 @@ public class PhylogeneticInferrer extends RunnableProcess {
         }
     }
 
-    public static void extractFastaInformation( final Phylogeny phy ) {
-        for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) {
-            final PhylogenyNode node = iter.next();
-            if ( !ForesterUtil.isEmpty( node.getName() ) ) {
-                final Matcher name_m = FastaParser.FASTA_DESC_LINE.matcher( node.getName() );
-                if ( name_m.lookingAt() ) {
-                    System.out.println();
-                    // System.out.println( name_m.group( 1 ) );
-                    // System.out.println( name_m.group( 2 ) );
-                    // System.out.println( name_m.group( 3 ) );
-                    // System.out.println( name_m.group( 4 ) );
-                    final String acc_source = name_m.group( 1 );
-                    final String acc = name_m.group( 2 );
-                    final String seq_name = name_m.group( 3 );
-                    final String tax_sn = name_m.group( 4 );
-                    if ( !ForesterUtil.isEmpty( acc_source ) && !ForesterUtil.isEmpty( acc ) ) {
-                        ForesterUtil.ensurePresenceOfSequence( node );
-                        node.getNodeData().getSequence( 0 ).setAccession( new Accession( acc, acc_source ) );
-                    }
-                    if ( !ForesterUtil.isEmpty( seq_name ) ) {
-                        ForesterUtil.ensurePresenceOfSequence( node );
-                        node.getNodeData().getSequence( 0 ).setName( seq_name );
-                    }
-                    if ( !ForesterUtil.isEmpty( tax_sn ) ) {
-                        ForesterUtil.ensurePresenceOfTaxonomy( node );
-                        node.getNodeData().getTaxonomy( 0 ).setScientificName( tax_sn );
-                    }
-                }
-            }
-        }
-    }
-
     public enum MSA_PRG {
         MAFFT;
     }
diff --git a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java
index eead263..5f30b5a 100644
--- a/forester/java/src/org/forester/msa_compactor/MsaCompactor.java
+++ b/forester/java/src/org/forester/msa_compactor/MsaCompactor.java
@@ -13,12 +13,23 @@ import java.util.List;
 import java.util.SortedSet;
 import java.util.TreeSet;
 
+import org.forester.archaeopteryx.Archaeopteryx;
+import org.forester.evoinference.distance.NeighborJoining;
+import org.forester.evoinference.distance.PairwiseDistanceCalculator;
+import org.forester.evoinference.distance.PairwiseDistanceCalculator.PWD_DISTANCE_METHOD;
+import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
+import org.forester.evoinference.tools.BootstrapResampler;
+import org.forester.msa.BasicMsa;
 import org.forester.msa.Mafft;
 import org.forester.msa.Msa;
 import org.forester.msa.Msa.MSA_FORMAT;
 import org.forester.msa.MsaInferrer;
 import org.forester.msa.MsaMethods;
+import org.forester.msa.ResampleableMsa;
+import org.forester.phylogeny.Phylogeny;
+import org.forester.phylogeny.PhylogenyMethods;
 import org.forester.sequence.Sequence;
+import org.forester.tools.ConfidenceAssessor;
 import org.forester.util.ForesterUtil;
 
 public class MsaCompactor {
@@ -202,10 +213,50 @@ public class MsaCompactor {
         }
     }
 
+    Phylogeny pi() {
+        final Phylogeny master_phy = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, _msa );
+        final int seed = 15;
+        final int n = 100;
+        final ResampleableMsa resampleable_msa = new ResampleableMsa( ( BasicMsa ) _msa );
+        final int[][] resampled_column_positions = BootstrapResampler.createResampledColumnPositions( _msa.getLength(),
+                                                                                                      n,
+                                                                                                      seed );
+        final Phylogeny[] eval_phys = new Phylogeny[ n ];
+        for( int i = 0; i < n; ++i ) {
+            resampleable_msa.resample( resampled_column_positions[ i ] );
+            eval_phys[ i ] = inferNJphylogeny( PWD_DISTANCE_METHOD.KIMURA_DISTANCE, resampleable_msa );
+        }
+        ConfidenceAssessor.evaluate( "bootstrap", eval_phys, master_phy, true, 1 );
+        PhylogenyMethods.extractFastaInformation( master_phy );
+        return master_phy;
+    }
+
+    private Phylogeny inferNJphylogeny( PWD_DISTANCE_METHOD pwd_distance_method, final Msa msa ) {
+        BasicSymmetricalDistanceMatrix m = null;
+        switch ( pwd_distance_method ) {
+            case KIMURA_DISTANCE:
+                m = PairwiseDistanceCalculator.calcKimuraDistances( msa );
+                break;
+            case POISSON_DISTANCE:
+                m = PairwiseDistanceCalculator.calcPoissonDistances( msa );
+                break;
+            case FRACTIONAL_DISSIMILARITY:
+                m = PairwiseDistanceCalculator.calcFractionalDissimilarities( msa );
+                break;
+            default:
+                throw new IllegalArgumentException( "invalid pwd method" );
+        }
+        final NeighborJoining nj = NeighborJoining.createInstance();
+        final Phylogeny phy = nj.execute( m );
+        return phy;
+    }
+
     final private void removeWorstOffenders( final int to_remove,
                                              final int step,
                                              final boolean realign,
                                              final boolean norm ) throws IOException, InterruptedException {
+        final Phylogeny a = pi();
+        Archaeopteryx.createApplication( a );
         final GapContribution stats[] = calcGapContribtionsStats( norm );
         final List to_remove_ids = new ArrayList();
         for( int j = 0; j < to_remove; ++j ) {
@@ -227,6 +278,8 @@ public class MsaCompactor {
         if ( realign ) {
             mafft();
         }
+        final Phylogeny b = pi();
+        Archaeopteryx.createApplication( b );
     }
 
     final private void writeMsa( final String outfile, final MSA_FORMAT format ) throws IOException {
diff --git a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java
index a916d89..806f031 100644
--- a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java
+++ b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java
@@ -38,8 +38,10 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.forester.io.parsers.FastaParser;
 import org.forester.io.parsers.PhylogenyParser;
 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
 import org.forester.io.parsers.phyloxml.PhyloXmlUtil;
@@ -73,6 +75,38 @@ public class PhylogenyMethods {
         throw new CloneNotSupportedException();
     }
 
+    public static void extractFastaInformation( final Phylogeny phy ) {
+        for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) {
+            final PhylogenyNode node = iter.next();
+            if ( !ForesterUtil.isEmpty( node.getName() ) ) {
+                final Matcher name_m = FastaParser.FASTA_DESC_LINE.matcher( node.getName() );
+                if ( name_m.lookingAt() ) {
+                    System.out.println();
+                    // System.out.println( name_m.group( 1 ) );
+                    // System.out.println( name_m.group( 2 ) );
+                    // System.out.println( name_m.group( 3 ) );
+                    // System.out.println( name_m.group( 4 ) );
+                    final String acc_source = name_m.group( 1 );
+                    final String acc = name_m.group( 2 );
+                    final String seq_name = name_m.group( 3 );
+                    final String tax_sn = name_m.group( 4 );
+                    if ( !ForesterUtil.isEmpty( acc_source ) && !ForesterUtil.isEmpty( acc ) ) {
+                        ForesterUtil.ensurePresenceOfSequence( node );
+                        node.getNodeData().getSequence( 0 ).setAccession( new Accession( acc, acc_source ) );
+                    }
+                    if ( !ForesterUtil.isEmpty( seq_name ) ) {
+                        ForesterUtil.ensurePresenceOfSequence( node );
+                        node.getNodeData().getSequence( 0 ).setName( seq_name );
+                    }
+                    if ( !ForesterUtil.isEmpty( tax_sn ) ) {
+                        ForesterUtil.ensurePresenceOfTaxonomy( node );
+                        node.getNodeData().getTaxonomy( 0 ).setScientificName( tax_sn );
+                    }
+                }
+            }
+        }
+    }
+
     public static DescriptiveStatistics calculatBranchLengthStatistics( final Phylogeny phy ) {
         final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
         for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) {
-- 
1.7.10.2