From 8dbaf0f253f5ec2721b540d1f4ad06ffb35b66b6 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Sat, 26 Jan 2013 00:24:59 +0000 Subject: [PATCH] added AGRESSIVE tax extraction ^^ --- .../src/org/forester/archaeopteryx/AptxUtil.java | 103 ++++++++++---------- .../archaeopteryx/simple_node_processor.java | 92 +++++++++++++++++ .../src/org/forester/io/parsers/nhx/NHXParser.java | 1 - .../org/forester/io/parsers/util/ParserUtils.java | 3 +- .../org/forester/phylogeny/PhylogenyMethods.java | 74 -------------- forester/java/src/org/forester/rio/TestRIO.java | 4 +- 6 files changed, 147 insertions(+), 130 deletions(-) create mode 100644 forester/java/src/org/forester/archaeopteryx/simple_node_processor.java diff --git a/forester/java/src/org/forester/archaeopteryx/AptxUtil.java b/forester/java/src/org/forester/archaeopteryx/AptxUtil.java index 0dd82d7..51f6477 100644 --- a/forester/java/src/org/forester/archaeopteryx/AptxUtil.java +++ b/forester/java/src/org/forester/archaeopteryx/AptxUtil.java @@ -186,6 +186,36 @@ public final class AptxUtil { } } + public static Set obtainAllDistinctTaxonomies( final PhylogenyNode node ) { + final List descs = node.getAllExternalDescendants(); + final Set tax_set = new HashSet(); + for( final PhylogenyNode n : descs ) { + if ( n.getNodeData().isHasTaxonomy() && !n.getNodeData().getTaxonomy().isEmpty() ) { + tax_set.add( n.getNodeData().getTaxonomy() ); + } + } + return tax_set; + } + + /** + * Returns the set of distinct taxonomies of + * all external nodes of node. + * If at least one the external nodes has no taxonomy, + * null is returned. + * + */ + public static Set obtainDistinctTaxonomies( final PhylogenyNode node ) { + final List descs = node.getAllExternalDescendants(); + final Set tax_set = new HashSet(); + for( final PhylogenyNode n : descs ) { + if ( !n.getNodeData().isHasTaxonomy() || n.getNodeData().getTaxonomy().isEmpty() ) { + return null; + } + tax_set.add( n.getNodeData().getTaxonomy() ); + } + return tax_set; + } + public final static Accession obtainSequenceAccessionFromName( final String sequence_name ) { final String n = sequence_name.trim(); final Matcher matcher1 = seq_identifier_pattern_1.matcher( n ); @@ -494,40 +524,6 @@ public final class AptxUtil { } } - /** - * Returns the set of distinct taxonomies of - * all external nodes of node. - * If at least one the external nodes has no taxonomy, - * null is returned. - * - */ - public static Set obtainDistinctTaxonomies( final PhylogenyNode node ) { - final List descs = node.getAllExternalDescendants(); - final Set tax_set = new HashSet(); - for( final PhylogenyNode n : descs ) { - if ( !n.getNodeData().isHasTaxonomy() || n.getNodeData().getTaxonomy().isEmpty() ) { - return null; - } - tax_set.add( n.getNodeData().getTaxonomy() ); - } - return tax_set; - } - - public static Set obtainAllDistinctTaxonomies( final PhylogenyNode node ) { - final List descs = node.getAllExternalDescendants(); - final Set tax_set = new HashSet(); - for( final PhylogenyNode n : descs ) { - if ( n.getNodeData().isHasTaxonomy() && !n.getNodeData().getTaxonomy().isEmpty() ) { - tax_set.add( n.getNodeData().getTaxonomy() ); - System.out.println( n.getNodeData().getTaxonomy() ); - } - } - for( final Taxonomy taxonomy : tax_set ) { - System.out.println( taxonomy ); - } - return tax_set; - } - final static void collapseSubtree( final PhylogenyNode node, final boolean collapse ) { node.setCollapse( collapse ); if ( node.isExternal() ) { @@ -695,16 +691,19 @@ public final class AptxUtil { desc.append( "Rerootable: " ); desc.append( phy.isRerootable() ); desc.append( "\n" ); - desc.append( "Node sum: " ); + desc.append( "Nodes: " ); desc.append( phy.getNodeCount() ); desc.append( "\n" ); - desc.append( "External node sum: " ); + desc.append( "External nodes: " ); desc.append( phy.getNumberOfExternalNodes() ); desc.append( "\n" ); - desc.append( "Internal node sum: " ); + desc.append( "Internal nodes: " ); desc.append( phy.getNodeCount() - phy.getNumberOfExternalNodes() ); desc.append( "\n" ); - desc.append( "Branche sum: " ); + desc.append( "Internal nodes with polytomies: " ); + desc.append( PhylogenyMethods.countNumberOfPolytomies( phy ) ); + desc.append( "\n" ); + desc.append( "Branches: " ); desc.append( phy.getNumberOfBranches() ); desc.append( "\n" ); desc.append( "Depth: " ); @@ -950,6 +949,20 @@ public final class AptxUtil { } } + final static void outOfMemoryError( final OutOfMemoryError e ) { + System.err.println(); + System.err.println( "Java memory allocation might be too small, try \"-Xmx2048m\" java command line option" ); + System.err.println(); + e.printStackTrace(); + System.err.println(); + JOptionPane.showMessageDialog( null, + "Java memory allocation might be too small, try \"-Xmx2048m\" java command line option" + + "\n\nError: " + e.getLocalizedMessage(), + "Out of Memory Error [" + Constants.PRG_NAME + " " + Constants.VERSION + "]", + JOptionPane.ERROR_MESSAGE ); + System.exit( -1 ); + } + final static void printAppletMessage( final String applet_name, final String message ) { System.out.println( "[" + applet_name + "] > " + message ); } @@ -1015,20 +1028,6 @@ public final class AptxUtil { System.exit( -1 ); } - final static void outOfMemoryError( final OutOfMemoryError e ) { - System.err.println(); - System.err.println( "Java memory allocation might be too small, try \"-Xmx2048m\" java command line option" ); - System.err.println(); - e.printStackTrace(); - System.err.println(); - JOptionPane.showMessageDialog( null, - "Java memory allocation might be too small, try \"-Xmx2048m\" java command line option" - + "\n\nError: " + e.getLocalizedMessage(), - "Out of Memory Error [" + Constants.PRG_NAME + " " + Constants.VERSION + "]", - JOptionPane.ERROR_MESSAGE ); - System.exit( -1 ); - } - final static void unexpectedException( final Exception e ) { System.err.println(); e.printStackTrace( System.err ); diff --git a/forester/java/src/org/forester/archaeopteryx/simple_node_processor.java b/forester/java/src/org/forester/archaeopteryx/simple_node_processor.java new file mode 100644 index 0000000..711cbd0 --- /dev/null +++ b/forester/java/src/org/forester/archaeopteryx/simple_node_processor.java @@ -0,0 +1,92 @@ + +package org.forester.archaeopteryx; + +import java.io.File; + +import org.forester.io.parsers.phyloxml.PhyloXmlParser; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Taxonomy; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.CommandLineArguments; + +public class simple_node_processor { + + private final static String BASE = "b_"; + + public static void main( final String args[] ) { + File in = null; + File out = null; + try { + CommandLineArguments cla = null; + cla = new CommandLineArguments( args ); + in = cla.getFile( 0 ); + out = cla.getFile( 1 ); + if ( out.exists() ) { + System.out.println( out + " already exists" ); + System.exit( -1 ); + } + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final PhyloXmlParser xml_parser = new PhyloXmlParser(); + final Phylogeny[] phylogenies_0 = factory.create( in, xml_parser ); + final Phylogeny phylogeny_0 = phylogenies_0[ 0 ]; + final PhylogenyNodeIterator it = phylogeny_0.iteratorPostorder(); + int i = 0; + while ( it.hasNext() ) { + final PhylogenyNode node = it.next(); + processNode( node, i ); + i++; + } + final PhylogenyWriter writer = new PhylogenyWriter(); + writer.toPhyloXML( out, phylogeny_0, 0 ); + } + catch ( final Exception e ) { + System.out.println( e.getLocalizedMessage() ); + e.printStackTrace(); + System.exit( -1 ); + } + } + + // private static void processNode( final PhylogenyNode node, final int i ) { + // node.setDistanceToParent( PhylogenyNode.DISTANCE_DEFAULT ); + // if ( !node.isExternal() ) { + // if ( ( node.getName() == null ) || node.getName().isEmpty() ) { + // node.setName( BASE + i ); + // } + // } + // } + private static void processNode( final PhylogenyNode node, final int i ) { + //if ( node.isExternal() ) { + // final String c = "" + node.getNodeData().getBinaryCharacters().getPresentCount(); + // final String s = node.getNodeData().getTaxonomy().getScientificName(); + // System.out.println( s + "\t" + c ); + //} + // if ( !node.isExternal() ) { + // if ( !node.getNodeData().isHasTaxonomy() ) { + // if ( !ForesterUtil.isEmpty( node.getName() ) ) { + // if ( ( node.getName().indexOf( "_" ) < 0 ) && ( node.getName().indexOf( "&" ) < 0 ) + // && ( node.getName().indexOf( " " ) < 0 ) ) { + // Taxonomy t = new Taxonomy(); + // t.setScientificName( node.getName() ); + // node.getNodeData().addTaxonomy( t ); + // node.setName( "" ); + // } + // } + // } + // } + if ( node.isExternal() ) { + if ( node.getNodeData().isHasTaxonomy() ) { + final Taxonomy t = node.getNodeData().getTaxonomy(); + t.setIdentifier( null ); + //if ( !ForesterUtil.isEmpty( t.getTaxonomyCode() ) && t.getTaxonomyCode().length() == 5 ) { + // if ( node.getName().equalsIgnoreCase( t.getTaxonomyCode() ) ) { + // node.setName( "" ); + // } + //} + } + } + } +} \ No newline at end of file diff --git a/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java b/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java index 397c14a..8846374 100644 --- a/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java +++ b/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java @@ -58,7 +58,6 @@ import org.forester.util.ForesterUtil; public final class NHXParser implements PhylogenyParser, IteratingPhylogenyParser { public static final TAXONOMY_EXTRACTION TAXONOMY_EXTRACTION_DEFAULT = TAXONOMY_EXTRACTION.NO; - public final static Pattern UC_LETTERS_NUMBERS_PATTERN = Pattern.compile( "^[A-Z0-9]+$" ); public final static Pattern NUMBERS_ONLY_PATTERN = Pattern.compile( "^[0-9\\.]+$" ); public final static Pattern MB_PROB_PATTERN = Pattern.compile( "prob=([^,]+)" ); public final static Pattern MB_PROB_SD_PATTERN = Pattern.compile( "prob_stddev=([^,]+)" ); diff --git a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java index 4ba38fa..d5acd19 100644 --- a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java +++ b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java @@ -250,7 +250,8 @@ public final class ParserUtils { } } } - else if ( taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) { + if ( ( taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) + || ( taxonomy_extraction == TAXONOMY_EXTRACTION.AGRESSIVE ) ) { final Matcher m1 = TAXOMONY_CODE_PATTERN_1.matcher( name ); if ( m1.matches() ) { return name; diff --git a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java index 27f2a33..e8498c5 100644 --- a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java +++ b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java @@ -62,52 +62,15 @@ import org.forester.util.ForesterUtil; public class PhylogenyMethods { - //private static PhylogenyMethods _instance = null; - //private final PhylogenyNode _farthest_1 = null; - //private final PhylogenyNode _farthest_2 = null; private PhylogenyMethods() { // Hidden constructor. } - // public double calculateFurthestDistance( final Phylogeny phylogeny ) { - // if ( phylogeny.getNumberOfExternalNodes() < 2 ) { - // return 0.0; - // } - // _farthest_1 = null; - // _farthest_2 = null; - // PhylogenyNode node_1 = null; - // PhylogenyNode node_2 = null; - // double farthest_d = -Double.MAX_VALUE; - // final PhylogenyMethods methods = PhylogenyMethods.getInstance(); - // final List ext_nodes = phylogeny.getRoot().getAllExternalDescendants(); - // for( int i = 1; i < ext_nodes.size(); ++i ) { - // for( int j = 0; j < i; ++j ) { - // final double d = methods.calculateDistance( ext_nodes.get( i ), ext_nodes.get( j ) ); - // if ( d < 0.0 ) { - // throw new RuntimeException( "distance cannot be negative" ); - // } - // if ( d > farthest_d ) { - // farthest_d = d; - // node_1 = ext_nodes.get( i ); - // node_2 = ext_nodes.get( j ); - // } - // } - // } - // _farthest_1 = node_1; - // _farthest_2 = node_2; - // return farthest_d; - // } @Override public Object clone() throws CloneNotSupportedException { throw new CloneNotSupportedException(); } - // public PhylogenyNode getFarthestNode1() { - // return _farthest_1; - // } - // public PhylogenyNode getFarthestNode2() { - // return _farthest_2; - // } public static DescriptiveStatistics calculatBranchLengthStatistics( final Phylogeny phy ) { final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { @@ -331,10 +294,6 @@ public class PhylogenyMethods { for( final PhylogenyNode n : ext ) { nodes.put( n.getName(), n ); } - // for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) { - // final PhylogenyNode n = iter.next(); - // nodes.put( n.getName(), n ); - // } return nodes; } @@ -704,39 +663,6 @@ public class PhylogenyMethods { phylogeny.recalculateNumberOfExternalDescendants( true ); } - public static void midpointRootOLD( final Phylogeny phylogeny ) { - // if ( phylogeny.getNumberOfExternalNodes() < 2 ) { - // return; - // } - // final PhylogenyMethods methods = getInstance(); - //final double farthest_d = methods.calculateFurthestDistance( phylogeny ); - // final PhylogenyNode f1 = methods.getFarthestNode1(); - // final PhylogenyNode f2 = methods.getFarthestNode2(); - // if ( farthest_d <= 0.0 ) { - // return; - // } - // double x = farthest_d / 2.0; - // PhylogenyNode n = f1; - // if ( PhylogenyMethods.getDistance( f1, phylogeny.getRoot() ) < PhylogenyMethods.getDistance( f2, phylogeny - // .getRoot() ) ) { - // n = f2; - // } - // while ( ( x > n.getDistanceToParent() ) && !n.isRoot() ) { - // x -= ( n.getDistanceToParent() > 0 ? n.getDistanceToParent() : 0 ); - // n = n.getParent(); - // } - // phylogeny.reRoot( n, x ); - // phylogeny.recalculateNumberOfExternalDescendants( true ); - // final PhylogenyNode a = getFurthestDescendant( phylogeny.getRoot().getChildNode1() ); - // final PhylogenyNode b = getFurthestDescendant( phylogeny.getRoot().getChildNode2() ); - // final double da = getDistance( a, phylogeny.getRoot() ); - // final double db = getDistance( b, phylogeny.getRoot() ); - // if ( Math.abs( da - db ) > 0.000001 ) { - // throw new FailedConditionCheckException( "this should not have happened: midpoint rooting failed: da=" - // + da + ", db=" + db + ", diff=" + Math.abs( da - db ) ); - // } - } - public static void normalizeBootstrapValues( final Phylogeny phylogeny, final double max_bootstrap_value, final double max_normalized_value ) { diff --git a/forester/java/src/org/forester/rio/TestRIO.java b/forester/java/src/org/forester/rio/TestRIO.java index 8160446..5d2c34e 100644 --- a/forester/java/src/org/forester/rio/TestRIO.java +++ b/forester/java/src/org/forester/rio/TestRIO.java @@ -48,7 +48,7 @@ public final class TestRIO { final NHXParser nhx = new NHXParser(); nhx.setReplaceUnderscores( false ); nhx.setIgnoreQuotes( true ); - nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); + nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.AGRESSIVE ); final String gene_trees_1_str = "(((((MOUSE,RAT),HUMAN),CAEEL),YEAST),ARATH);" + "((((MOUSE,RAT),HUMAN),(ARATH,YEAST)),CAEEL);" + "((MOUSE,RAT),(((ARATH,YEAST),CAEEL),HUMAN));" + "(((((MOUSE,HUMAN),RAT),CAEEL),YEAST),ARATH);" + "((((HUMAN,MOUSE),RAT),(ARATH,YEAST)),CAEEL);"; @@ -134,7 +134,7 @@ public final class TestRIO { final NHXParser nhx = new NHXParser(); nhx.setReplaceUnderscores( false ); nhx.setIgnoreQuotes( true ); - nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); + nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.AGRESSIVE ); // final String gene_trees_00_str = "(MOUSE,RAT);(MOUSE,RAT);(MOUSE,RAT);(RAT,MOUSE);"; final Phylogeny[] gene_trees_00 = factory.create( gene_trees_00_str, nhx ); -- 1.7.10.2