From 7bbedba7ee0f24dba48d04ee191dae23d71ca533 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Tue, 8 Oct 2013 02:23:09 +0000 Subject: [PATCH] in progress --- .../forester/applications/tax_code_cleaner.java | 214 ++++++++++++++++++++ 1 file changed, 214 insertions(+) create mode 100644 forester_applications/src/org/forester/applications/tax_code_cleaner.java diff --git a/forester_applications/src/org/forester/applications/tax_code_cleaner.java b/forester_applications/src/org/forester/applications/tax_code_cleaner.java new file mode 100644 index 0000000..d0ea276 --- /dev/null +++ b/forester_applications/src/org/forester/applications/tax_code_cleaner.java @@ -0,0 +1,214 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org +// javac -cp ~/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester/java/forester.jar +// ~/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester_applications/src/org/forester/applications/simple_node_processor.java +// java -Xmx2048m -cp +// /home/czmasek/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester_applications/src/:/home/czmasek/SOFTWARE_DEV/ECLIPSE_WORKSPACE/forester/java/forester.jar +// org.forester.applications.simple_node_processor + +package org.forester.applications; + +import java.io.File; + +import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; +import org.forester.io.parsers.phyloxml.PhyloXmlParser; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.data.Taxonomy; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.CommandLineArguments; +import org.forester.util.ForesterUtil; + +public class tax_code_cleaner { + + private final static String BASE = "b_"; + + public static void main( final String args[] ) { + File in = null; + File out = null; + try { + CommandLineArguments cla = null; + cla = new CommandLineArguments( args ); + in = cla.getFile( 0 ); + out = cla.getFile( 1 ); + // if ( out.exists() ) { + // System.out.println( out + " already exists" ); + // System.exit( -1 ); + // } + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final PhyloXmlParser xml_parser = new PhyloXmlParser(); + final Phylogeny[] phylogenies_0 = factory.create( in, xml_parser ); + final Phylogeny phylogeny_0 = phylogenies_0[ 0 ]; + final PhylogenyNodeIterator it = phylogeny_0.iteratorPostorder(); + int i = 0; + while ( it.hasNext() ) { + final PhylogenyNode node = it.next(); + processNode( node, i ); + i++; + } + final PhylogenyWriter writer = new PhylogenyWriter(); + writer.toPhyloXML( out, phylogeny_0, 0 ); + } + catch ( final Exception e ) { + System.out.println( e.getLocalizedMessage() ); + e.printStackTrace(); + System.exit( -1 ); + } + } + + private static void processNode( final PhylogenyNode node, final int i ) throws PhyloXmlDataFormatException { + if ( node.isExternal() ) { + if ( node.getNodeData().isHasTaxonomy() ) { + final Taxonomy t = node.getNodeData().getTaxonomy(); + if ( !ForesterUtil.isEmpty( t.getTaxonomyCode() ) ) { + final String tc = t.getTaxonomyCode(); + if ( tc.equals( "ACRALC" ) ) { + t.setScientificName( "Acremonium alcalophilum" ); + t.setTaxonomyCode( "AALXX" ); + } + else if ( tc.equals( "AMPQU" ) ) { + t.setScientificName( "Amphimedon queenslandica" ); + t.setTaxonomyCode( "AMPQE" ); + } + else if ( tc.equals( "AQUAE" ) ) { + t.setScientificName( "Aquifex aeolicus (strain VF5)" ); + } + else if ( tc.equals( "ASTSPC" ) ) { + t.setScientificName( "Asterochloris sp. Cgr/DA1pho" ); + t.setTaxonomyCode( "ASCXX" ); + } + else if ( tc.equals( "BAUCOM" ) ) { + t.setScientificName( "Baudoinia compniacensis" ); + t.setTaxonomyCode( "BCOXX" ); + } + else if ( tc.equals( "CAP" ) ) { + t.setScientificName( "Capitella sp.1" ); + t.setTaxonomyCode( "CTEXX" ); + } + else if ( tc.equals( "CAPOWC" ) ) { + t.setScientificName( "Capsaspora owczarzaki (strain ATCC 30864)" ); + t.setTaxonomyCode( "CAPO3" ); + } + else if ( tc.equals( "CHLVUL" ) ) { + t.setScientificName( "Chlorella variabilis" ); + t.setTaxonomyCode( "CHLVA" ); + } + else if ( tc.equals( "CITCLE" ) ) { + t.setScientificName( "Citrus clementina" ); + t.setTaxonomyCode( "CCLXX" ); + } + else if ( tc.equals( "CLAGRA" ) ) { + t.setScientificName( "Cladonia grayi" ); + t.setTaxonomyCode( "" ); + } + else if ( tc.equals( "COEREV" ) ) { + t.setScientificName( "Coemansia reversa" ); + t.setTaxonomyCode( "CREXX" ); + } + else if ( tc.equals( "CONPUT" ) ) { + t.setScientificName( "Coniophora puteana" ); + t.setTaxonomyCode( "CPUXX" ); + } + else if ( tc.equals( "DICSQU" ) ) { + t.setScientificName( "Dichomitus squalens" ); + t.setTaxonomyCode( "DICSQ" ); + } + else if ( tc.equals( "FOMPIN" ) ) { + t.setScientificName( "Fomitopsis pinicola" ); + t.setTaxonomyCode( "FPIXX" ); + } + else if ( tc.equals( "GONPRO" ) ) { + t.setScientificName( "Gonapodya prolifera" ); + t.setTaxonomyCode( "GONPR" ); + } + else if ( tc.equals( "GYMLUX" ) ) { + t.setScientificName( "Gymnopus luxurians" ); + t.setTaxonomyCode( "" ); + } + else if ( tc.equals( "HYDPIN" ) ) { + t.setScientificName( "Hydnomerulius pinastri" ); + t.setTaxonomyCode( "" ); + } + else if ( tc.equals( "JAAARG" ) ) { + t.setScientificName( "Jaapia argillacea" ); + t.setTaxonomyCode( "" ); + } + else if ( tc.equals( "MYCPOP" ) ) { + t.setScientificName( "Mycosphaerella populorum" ); + t.setTaxonomyCode( "MYCPS" ); + } + else if ( tc.equals( "MYCTHE" ) ) { + t.setScientificName( "Myceliophthora thermophila" ); + t.setTaxonomyCode( "THIHA" ); + } + else if ( tc.equals( "OIDMAI" ) ) { + t.setScientificName( "Oidiodendron maius" ); + t.setTaxonomyCode( "" ); + } + else if ( tc.equals( "PANVIR" ) ) { + t.setScientificName( "Panicum virgatum" ); + t.setTaxonomyCode( "PANVG" ); + } + else if ( tc.equals( "PIRSPE" ) ) { + t.setScientificName( "Piromyces sp. E2" ); + t.setTaxonomyCode( "PIRSE" ); + } + else if ( tc.equals( "SAICOM" ) ) { + t.setScientificName( "Saitoella complicata" ); + t.setTaxonomyCode( "" ); + } + else if ( tc.equals( "SERLAC" ) ) { + t.setScientificName( "Serpula lacrymans" ); + t.setTaxonomyCode( "SERL9" ); + } + else if ( tc.equals( "SPHARC" ) ) { + t.setScientificName( "Sphaeroforma arctica" ); + t.setTaxonomyCode( "SARXX" ); + } + else if ( tc.equals( "THETRA" ) ) { + t.setScientificName( "Thecamonas trahens" ); + t.setTaxonomyCode( "TTRXX" ); + } + else if ( tc.equals( "THITER" ) ) { + t.setScientificName( "Thielavia terrestris (strain ATCC 38088 / NRRL 8126)" ); + t.setTaxonomyCode( "THITE" ); + } + else if ( tc.equals( "WOLCOC" ) ) { + t.setScientificName( "Wolfiporia cocos MD-104 SS10" ); + t.setTaxonomyCode( "WOLCO" ); + } + else if ( tc.equals( "XANPAR" ) ) { + t.setScientificName( "Xanthoria parietina 46-1" ); + t.setTaxonomyCode( "" ); + } + } + } + } + } +} -- 1.7.10.2