3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 // Contact: phylosoft @ gmail . com
25 // WWW: www.phylosoft.org/forester
27 package org.forester.application;
30 import java.io.IOException;
31 import java.util.ArrayList;
32 import java.util.List;
34 import org.forester.io.parsers.PhylogenyParser;
35 import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
36 import org.forester.io.parsers.nhx.NHXParser;
37 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
38 import org.forester.io.parsers.util.ParserUtils;
39 import org.forester.io.writers.PhylogenyWriter;
40 import org.forester.phylogeny.Phylogeny;
41 import org.forester.phylogeny.PhylogenyMethods;
42 import org.forester.phylogeny.PhylogenyMethods.DESCENDANT_SORT_PRIORITY;
43 import org.forester.phylogeny.PhylogenyNode;
44 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
45 import org.forester.phylogeny.factories.PhylogenyFactory;
46 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
47 import org.forester.util.CommandLineArguments;
48 import org.forester.util.ForesterUtil;
/**
 * Command-line program that reads phylogenies from an input file (the parser
 * is chosen from the file itself), optionally post-processes them, and writes
 * them to an output file in phyloXML format. See main for the processing
 * steps and printHelp for the usage text.
 */
50 public class phyloxml_converter {
// Options that request the usage text.
52 final static private String HELP_OPTION_1 = "help";
53 final static private String HELP_OPTION_2 = "h";
// Mandatory option; its value selects the node field that node names are transferred to.
54 final static private String FIELD_OPTION = "f";
// Values recognized for FIELD_OPTION (each is compared against the option value in main).
55 final static private String FIELD_CLADE_NAME = "nn";
56 final static private String FIELD_TAXONOMY_CODE = "tc";
57 final static private String FIELD_TAXONOMY_SCI_NAME = "sn";
58 final static private String FIELD_TAXONOMY_COMM_NAME = "cn";
59 final static private String FIELD_SEQUENCE_GENE_NAME = "gn";
60 final static private String FIELD_SEQUENCE_SYMBOL = "sy";
61 final static private String FIELD_UNIPROT_TAXONOMY_ID_SPLIT_1 = "i1";
62 final static private String FIELD_UNIPROT_TAXONOMY_ID_SPLIT_2 = "i2";
// Also compared against the field option value in main, but no visible statement
// assigns a field for it; it is not listed by printHelp.
63 final static private String FIELD_DUMMY = "dummy";
// Optional switches; all of them are registered as allowed options in main.
64 final static private String INTERNAL_NAMES_ARE_BOOT_SUPPPORT = "i";
65 final static private String MIDPOINT_REROOT = "m";
66 final static private String EXTRACT_TAXONOMY = "xt";
67 final static private String EXTRACT_TAXONOMY_PF = "xp";
68 final static private String ORDER_SUBTREES = "o";
69 final static private String NO_TREE_LEVEL_INDENDATION = "ni";
70 final static private String REPLACE_UNDER_SCORES = "ru";
// Program identification passed to ForesterUtil.printProgramInformation and used
// as the prefix of console messages.
71 final static private String PRG_NAME = "phyloxml_converter";
72 final static private String PRG_VERSION = "1.30";
73 final static private String PRG_DATE = "2011.03.01";
74 final static private String E_MAIL = "phylosoft@gmail.com";
75 final static private String WWW = "www.phylosoft.org/forester/";
// NOTE(review): no read of this flag is visible in this copy of the file;
// presumably it gates the call to performSpecialProcessing in main -- confirm.
76 final static private boolean SPECIAL = false;
/**
 * Program entry point: validates the command line, reads the phylogenies from
 * the input file, applies the requested transformations and writes the result
 * to the output file as phyloXML.
 * <p>
 * NOTE(review): this copy of the method appears to be missing lines (for
 * example the "try {" that must precede each visible "catch" clause, the
 * final "else" branches and most closing braces). The comments below describe
 * only the statements that are actually present.
 * <p>
 * NOTE(review): the type File is used below, but no import for it is visible
 * in this copy of the file -- confirm that java.io.File is imported.
 *
 * @param args raw command-line arguments; two names (read below as input and
 *             output file) plus options are expected
 * @throws PhyloXmlDataFormatException declared by the signature; the visible
 *             statements do not show which call raises it
 */
78 public static void main( final String args[] ) throws PhyloXmlDataFormatException {
// Print the program banner (name, version, date, e-mail, web address).
79 ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW );
80 CommandLineArguments cla = null;
// Parse the raw arguments; a failure is reported through ForesterUtil.fatalError.
82 cla = new CommandLineArguments( args );
84 catch ( final Exception e ) {
85 ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
// Help was requested explicitly, or no arguments were given at all.
87 if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( args.length == 0 ) ) {
// Fewer than three raw arguments is reported as an incorrect argument count.
91 if ( args.length < 3 ) {
93 System.out.println( "[" + PRG_NAME + "] incorrect number of arguments" );
// Options this program accepts; anything else is rejected further down.
98 final List<String> allowed_options = new ArrayList<String>();
99 allowed_options.add( NO_TREE_LEVEL_INDENDATION );
100 allowed_options.add( FIELD_OPTION );
101 allowed_options.add( MIDPOINT_REROOT );
102 allowed_options.add( ORDER_SUBTREES );
103 allowed_options.add( INTERNAL_NAMES_ARE_BOOT_SUPPPORT );
104 allowed_options.add( REPLACE_UNDER_SCORES );
105 allowed_options.add( EXTRACT_TAXONOMY );
106 allowed_options.add( EXTRACT_TAXONOMY_PF );
// Exactly two names are required; they are read as input and output file below.
107 if ( cla.getNumberOfNames() != 2 ) {
108 System.out.println();
109 System.out.println( "[" + PRG_NAME + "] incorrect number of arguments" );
110 System.out.println();
// A non-empty result lists options that are not in allowed_options.
114 final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
115 if ( dissallowed_options.length() > 0 ) {
116 ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options );
// The field option is the only mandatory option ...
118 final List<String> mandatory_options = new ArrayList<String>();
119 mandatory_options.add( FIELD_OPTION );
120 final String missing_options = cla.validateMandatoryOptionsAsString( mandatory_options );
121 if ( missing_options.length() > 0 ) {
122 ForesterUtil.fatalError( PRG_NAME, "missing option(s): " + missing_options );
// ... and it must also carry a value.
124 if ( !cla.isOptionValueSet( FIELD_OPTION ) ) {
125 System.out.println();
// Map the field option value onto the node field that node names will be
// transferred to; "field" stays null unless one of the branches assigns it.
129 final String field_option_value = cla.getOptionValue( FIELD_OPTION );
130 PhylogenyMethods.PhylogenyNodeField field = null;
131 if ( field_option_value.equals( FIELD_CLADE_NAME ) ) {
132 field = PhylogenyMethods.PhylogenyNodeField.CLADE_NAME;
134 else if ( field_option_value.equals( FIELD_TAXONOMY_CODE ) ) {
135 field = PhylogenyMethods.PhylogenyNodeField.TAXONOMY_CODE;
137 else if ( field_option_value.equals( FIELD_TAXONOMY_SCI_NAME ) ) {
138 field = PhylogenyMethods.PhylogenyNodeField.TAXONOMY_SCIENTIFIC_NAME;
140 else if ( field_option_value.equals( FIELD_TAXONOMY_COMM_NAME ) ) {
141 field = PhylogenyMethods.PhylogenyNodeField.TAXONOMY_COMMON_NAME;
143 else if ( field_option_value.equals( FIELD_SEQUENCE_GENE_NAME ) ) {
144 field = PhylogenyMethods.PhylogenyNodeField.SEQUENCE_NAME;
146 else if ( field_option_value.equals( FIELD_SEQUENCE_SYMBOL ) ) {
147 field = PhylogenyMethods.PhylogenyNodeField.SEQUENCE_SYMBOL;
149 else if ( field_option_value.equals( FIELD_UNIPROT_TAXONOMY_ID_SPLIT_1 ) ) {
150 field = PhylogenyMethods.PhylogenyNodeField.TAXONOMY_ID_UNIPROT_1;
152 else if ( field_option_value.equals( FIELD_UNIPROT_TAXONOMY_ID_SPLIT_2 ) ) {
153 field = PhylogenyMethods.PhylogenyNodeField.TAXONOMY_ID_UNIPROT_2;
// NOTE(review): lines appear to be missing after this test; presumably the
// FIELD_DUMMY branch is empty and the error below is the final "else" -- confirm.
155 else if ( field_option_value.equals( FIELD_DUMMY ) ) {
158 ForesterUtil.fatalError( PRG_NAME, "unknown value for -\"" + FIELD_OPTION + "\" option: \""
159 + field_option_value + "\"" );
// Boolean switches: each starts as false and is set when its option is present.
161 boolean int_values_are_boots = false;
162 if ( cla.isOptionSet( INTERNAL_NAMES_ARE_BOOT_SUPPPORT ) ) {
163 int_values_are_boots = true;
165 boolean midpoint_reroot = false;
166 if ( cla.isOptionSet( MIDPOINT_REROOT ) ) {
167 midpoint_reroot = true;
169 boolean order_subtrees = false;
170 if ( cla.isOptionSet( ORDER_SUBTREES ) ) {
171 order_subtrees = true;
173 boolean replace_underscores = false;
174 if ( cla.isOptionSet( REPLACE_UNDER_SCORES ) ) {
175 replace_underscores = true;
177 boolean no_indendation = false;
178 if ( cla.isOptionSet( NO_TREE_LEVEL_INDENDATION ) ) {
179 no_indendation = true;
181 boolean extr_taxonomy = false;
182 if ( cla.isOptionSet( EXTRACT_TAXONOMY ) ) {
183 extr_taxonomy = true;
185 boolean extr_taxonomy_pf_only = false;
186 if ( cla.isOptionSet( EXTRACT_TAXONOMY_PF ) ) {
187 extr_taxonomy_pf_only = true;
// Name 0 is the input file, name 1 the output file.
189 final File infile = cla.getFile( 0 );
190 final File outfile = cla.getFile( 1 );
// An already existing output file is reported as a fatal error ...
191 if ( outfile.exists() ) {
192 ForesterUtil.fatalError( PRG_NAME, "[" + outfile + "] already exists" );
// ... and so is a missing input file.
194 if ( !infile.exists() ) {
195 ForesterUtil.fatalError( PRG_NAME, "[" + infile + "] does not exist" );
197 Phylogeny[] phys = null;
// Obtain a parser for the input file and configure it before reading.
199 final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
200 final PhylogenyParser parser = ParserUtils.createParserDependingOnFileType( infile, true );
201 if ( parser instanceof NHXParser ) {
// Taxonomy extraction is only considered when the target field is not one of
// the three taxonomy fields; choosing either extraction mode also switches
// underscore replacement off.
202 if ( ( field != PhylogenyMethods.PhylogenyNodeField.TAXONOMY_CODE )
203 && ( field != PhylogenyMethods.PhylogenyNodeField.TAXONOMY_COMMON_NAME )
204 && ( field != PhylogenyMethods.PhylogenyNodeField.TAXONOMY_SCIENTIFIC_NAME ) ) {
205 if ( extr_taxonomy_pf_only ) {
206 ( ( NHXParser ) parser )
207 .setTaxonomyExtraction( PhylogenyMethods.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY );
208 replace_underscores = false;
210 else if ( extr_taxonomy ) {
211 ( ( NHXParser ) parser ).setTaxonomyExtraction( PhylogenyMethods.TAXONOMY_EXTRACTION.YES );
212 replace_underscores = false;
// NOTE(review): presumably the "else" of the taxonomy-field test above (the
// line introducing it is not visible here) -- confirm.
216 ( ( NHXParser ) parser ).setTaxonomyExtraction( PhylogenyMethods.TAXONOMY_EXTRACTION.NO );
218 ( ( NHXParser ) parser ).setReplaceUnderscores( replace_underscores );
219 ( ( NHXParser ) parser ).setIgnoreQuotes( false );
221 else if ( parser instanceof NexusPhylogeniesParser ) {
222 ( ( NexusPhylogeniesParser ) parser ).setReplaceUnderscores( replace_underscores );
223 ( ( NexusPhylogeniesParser ) parser ).setIgnoreQuotes( false );
// Read all phylogenies from the input file with the configured parser.
225 phys = factory.create( infile, parser );
227 catch ( final IOException e ) {
228 ForesterUtil.fatalError( PRG_NAME, "failed to read phylogeny from [" + infile + "]: " + e.getMessage() );
// Custom per-phylogeny hook.
// NOTE(review): no guard is visible around this loop in this copy; presumably
// it is conditional on the SPECIAL constant -- confirm.
231 for( final Phylogeny phy : phys ) {
232 performSpecialProcessing( phy );
// -i: transfer internal node names to bootstrap support values.
235 if ( int_values_are_boots ) {
236 for( final Phylogeny phy : phys ) {
237 PhylogenyMethods.transferInternalNamesToBootstrapSupport( phy );
// Transfer node names to the selected field, if one was selected.
240 if ( field != null ) {
241 for( final Phylogeny phy : phys ) {
242 PhylogenyMethods.transferNodeNameToField( phy, field, false );
// -m: midpoint reroot every phylogeny; an exception here only produces a
// warning message.
245 if ( midpoint_reroot ) {
247 for( final Phylogeny phy : phys ) {
248 PhylogenyMethods.midpointRoot( phy );
251 catch ( final Exception e ) {
252 System.out.println( "" );
253 ForesterUtil.printWarningMessage( PRG_NAME, "midpoint rerooting failed: " + e.getLocalizedMessage() );
// -o: order each phylogeny starting at its root, with TAXONOMY as the
// descendant sort priority, then call the follow-up methods shown below.
256 if ( order_subtrees ) {
257 for( final Phylogeny phy : phys ) {
258 PhylogenyMethods.orderAppearance( phy.getRoot(), true, true, DESCENDANT_SORT_PRIORITY.TAXONOMY );
259 phy.externalNodesHaveChanged();
261 phy.recalculateNumberOfExternalDescendants( true );
// Write all phylogenies to the output file as phyloXML; -ni turns off
// indentation in the writer.
265 final PhylogenyWriter writer = new PhylogenyWriter();
266 if ( no_indendation ) {
267 writer.setIndentPhyloxml( false );
269 writer.toPhyloXML( phys, 0, outfile, ForesterUtil.LINE_SEPARATOR );
271 catch ( final IOException e ) {
272 ForesterUtil.fatalError( PRG_NAME, "failed to write to [" + outfile + "]: " + e.getMessage() );
// Report success on standard output.
274 System.out.println( "[" + PRG_NAME + "] wrote: [" + outfile + "]" );
275 System.out.println( "[" + PRG_NAME + "] OK" );
276 System.out.println();
/**
 * Hook for ad-hoc, per-phylogeny processing, called from main for each
 * phylogeny that was read.
 * <p>
 * In this copy the only live statements iterate over the nodes in post-order
 * and test whether each node name is non-empty; every action that would
 * change a node or the tree is commented out.
 * <p>
 * NOTE(review): closing braces (and possibly other lines) are not visible in
 * this copy of the method.
 *
 * @param phy the phylogeny to process
 */
279 private static void performSpecialProcessing( final Phylogeny phy ) {
280 // Can place some kind of custom processing here.
// Disabled example: collect nodes whose lower-cased name starts with one of
// several prefixes, then delete their subtrees.
281 // final List<PhylogenyNode> remove_us = new ArrayList<PhylogenyNode>();
283 // for( final PhylogenyNodeIterator it = phy.iteratorPostorder(); it.hasNext(); ) {
284 // final PhylogenyNode node = it.next();
285 // final String name = node.getNodeName().toLowerCase();
286 // if ( name.startsWith( "environmental_samples" ) || name.startsWith( "unclassified" )
287 // || name.startsWith( "bacteria" ) || name.startsWith( "other" )
288 // || name.startsWith( "viroids" ) || name.startsWith( "viruses" ) ) {
289 // remove_us.add( node );
290 // System.out.println( counter++ );
294 // for( final PhylogenyNode node : remove_us ) {
295 // if ( phy.getNode( node.getNodeId() ) != null ) {
296 // phy.deleteSubtree( node );
297 // System.out.println( "deleted: " + node );
// Disabled example: clear the taxonomy of every node, then reroot and reorder.
302 // for( final PhylogenyNodeIterator it = phy.iteratorPostorder(); it.hasNext(); ) {
303 // final PhylogenyNode node = it.next();
304 // node.getNodeData().setTaxonomy( null );
306 // phy.reRoot( phy.getFirstExternalNode() );
307 // PhylogenyMethods.midpointRoot( phy );
308 // phy.orderAppearance( true );
// Live code: visit every node in post-order; for nodes with a non-empty name
// the body below consists only of commented-out statements.
309 for( final PhylogenyNodeIterator it = phy.iteratorPostorder(); it.hasNext(); ) {
310 final PhylogenyNode node = it.next();
311 final String name = node.getName();
312 if ( !ForesterUtil.isEmpty( name ) ) {
// Disabled example: build a Taxonomy from the node name (taxonomy code or
// scientific name, or a code derived from substrings of the name).
313 // final Taxonomy taxo = new Taxonomy();
314 // if ( node.isExternal() ) {
315 // taxo.setTaxonomyCode( name );
316 // node.getNodeData().setTaxonomy( taxo );
318 // else if ( name.indexOf( '_' ) == -1 || name.length() > 6 ) {
319 // taxo.setScientificName( name );
320 // node.getNodeData().setTaxonomy( taxo );
322 // node.setName( "" );
323 // if ( name.indexOf( "BF" ) >= 0 ) {
324 // taxo.setTaxonomyCode( "BACFR" );
326 // else if ( name.indexOf( "BT" ) >= 0 ) {
327 // taxo.setTaxonomyCode( "BACTN" );
329 // else if ( name.indexOf( "MXAN" ) >= 0 ) {
330 // taxo.setTaxonomyCode( "MYXXD" );
332 // else if ( name.indexOf( "STIAU" ) >= 0 ) {
333 // taxo.setTaxonomyCode( "STIAU" );
335 // else if ( name.indexOf( "BOVA" ) >= 0 ) {
336 // taxo.setTaxonomyCode( "BACOV" );
338 // else if ( name.indexOf( "BUNI" ) >= 0 ) {
339 // taxo.setTaxonomyCode( "BACUN" );
341 // else if ( name.indexOf( "Pgin" ) >= 0 ) {
342 // taxo.setTaxonomyCode( "PORGI" );
344 // else if ( name.equals( "3CGH" ) || name.equals( "3CK7" ) ) {
345 // taxo.setTaxonomyCode( "BACTN" );
347 // node.getNodeData().setTaxonomy( taxo );
/**
 * Prints the usage text to standard output: the command form, the values
 * accepted for the field option, and the remaining options.
 * <p>
 * NOTE(review): several statements below begin with a "+" continuation; the
 * lines that open those println calls are not visible in this copy of the
 * method, and neither is the closing brace.
 */
352 private static void printHelp() {
353 System.out.println( "Usage:" );
354 System.out.println();
// Command form: field option, further options, input file, output file.
359 + "=<field option> [options] <infile in New Hamphshire, NHX, Nexus, ToL XML, or phyloXML format> <outfile>" );
360 System.out.println();
// One line per value accepted for the field option.
361 System.out.println( " field options: " );
362 System.out.println();
363 System.out.println( " " + FIELD_CLADE_NAME + ": transfer name to node/clade name" );
364 System.out.println( " " + FIELD_TAXONOMY_CODE + ": transfer name to taxonomy code" );
365 System.out.println( " " + FIELD_TAXONOMY_SCI_NAME + ": transfer name to taxonomy scientific name" );
366 System.out.println( " " + FIELD_TAXONOMY_COMM_NAME + ": transfer name to taxonomy common name" );
367 System.out.println( " " + FIELD_SEQUENCE_GENE_NAME + ": transfer name to sequence name" );
368 System.out.println( " " + FIELD_SEQUENCE_SYMBOL + ": transfer name to sequence symbol" );
371 + FIELD_UNIPROT_TAXONOMY_ID_SPLIT_1
372 + ": transfer/split name to taxonomy uniprot identifier\n (split at underscore if \"id_name\" pattern, e.g. \"817_SusD\")" );
375 + FIELD_UNIPROT_TAXONOMY_ID_SPLIT_2
376 + ": transfer/split name to taxonomy uniprot identifier\n (split at underscore if \"name_id\" pattern, e.g. \"SusD_817\")" );
377 System.out.println();
// One line per optional switch.
378 System.out.println( " options: " );
379 System.out.println( " -" + INTERNAL_NAMES_ARE_BOOT_SUPPPORT
380 + " : internal names in NH or NHX tree are bootstrap support values" );
381 System.out.println( " -" + REPLACE_UNDER_SCORES + ": replace all underscores with spaces" );
382 System.out.println( " -" + MIDPOINT_REROOT + " : midpoint reroot" );
383 System.out.println( " -" + ORDER_SUBTREES + " : order subtrees" );
// The two taxonomy-extraction options are documented as unusable together with
// the taxonomy field options (code, common name, scientific name).
387 + ": extract taxonomy to taxonomy code from \"seqname_TAXON\"-style names (cannot be used with the following field options: "
388 + FIELD_TAXONOMY_CODE + ", " + FIELD_TAXONOMY_COMM_NAME + ", " + FIELD_TAXONOMY_SCI_NAME + ")" );
391 + EXTRACT_TAXONOMY_PF
392 + ": extract taxonomy to taxonomy code from Pfam (\"seqname_TAXON/x-y\") style names only (cannot be used with the following field options: "
393 + FIELD_TAXONOMY_CODE + ", " + FIELD_TAXONOMY_COMM_NAME + ", " + FIELD_TAXONOMY_SCI_NAME + ")" );
394 System.out.println( " -" + NO_TREE_LEVEL_INDENDATION + ": no tree level indendation in phyloXML output" );
395 System.out.println();