X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fapplication%2Fphyloxml_converter.java;h=0dac775e0b5edc617b65496e47e93fabe01ee1cb;hb=88718609970e490e94727d12ebbca1270ba2c0a7;hp=9822e1fc40a6e11964037d126c9a55410fc46abb;hpb=493e40b0c936b65da342134da37e8b856b9b80af;p=jalview.git diff --git a/forester/java/src/org/forester/application/phyloxml_converter.java b/forester/java/src/org/forester/application/phyloxml_converter.java index 9822e1f..0dac775 100644 --- a/forester/java/src/org/forester/application/phyloxml_converter.java +++ b/forester/java/src/org/forester/application/phyloxml_converter.java @@ -6,7 +6,7 @@ // Copyright (C) 2008-2009 Christian M. Zmasek // Copyright (C) 2008-2009 Burnham Institute for Medical Research // All rights reserved -// +// // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either @@ -16,13 +16,13 @@ // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. -// +// // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA // // Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester +// WWW: https://sites.google.com/site/cmzmasek/home/software/forester package org.forester.application; @@ -34,44 +34,49 @@ import java.util.List; import org.forester.io.parsers.PhylogenyParser; import org.forester.io.parsers.nexus.NexusPhylogeniesParser; import org.forester.io.parsers.nhx.NHXParser; +import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; +import org.forester.io.parsers.util.ParserUtils; import org.forester.io.writers.PhylogenyWriter; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyMethods.DESCENDANT_SORT_PRIORITY; import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.factories.PhylogenyFactory; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.util.CommandLineArguments; import org.forester.util.ForesterUtil; -import org.forester.util.ForesterUtil.PhylogenyNodeField; public class phyloxml_converter { - final static private String HELP_OPTION_1 = "help"; - final static private String HELP_OPTION_2 = "h"; - final static private String FIELD_OPTION = "f"; - final static private String FIELD_CLADE_NAME = "nn"; - final static private String FIELD_TAXONOMY_CODE = "tc"; - final static private String FIELD_TAXONOMY_SCI_NAME = "sn"; - final static private String FIELD_TAXONOMY_COMM_NAME = "cn"; - final static private String FIELD_SEQUENCE_GENE_NAME = "gn"; - final static private String FIELD_SEQUENCE_SYMBOL = "sy"; - final static private String FIELD_DUMMY = "dummy"; - final static private String INTERNAL_NAMES_ARE_BOOT_SUPPPORT = "i"; - final static private String MIDPOINT_REROOT = "m"; - final static private String EXTRACT_TAXONOMY = "xt"; - final static private String EXTRACT_TAXONOMY_PF = "xp"; - final static private String ORDER_SUBTREES = "o"; - final static private String NO_TREE_LEVEL_INDENDATION = "ni"; - final static private String REPLACE_UNDER_SCORES = "ru"; - final static private String PRG_NAME = "phyloxml_converter"; - final static private String PRG_VERSION = "1.21"; - final static private String PRG_DATE = "2010.10.02"; - final static private String E_MAIL = "czmasek@burnham.org"; - final static private String WWW = "www.phylosoft.org/forester/"; - final static private boolean SPECIAL = false; + final static private String HELP_OPTION_1 = "help"; + final static private String HELP_OPTION_2 = "h"; + final static private String FIELD_OPTION = "f"; + final static private String FIELD_CLADE_NAME = "nn"; + final static private String FIELD_TAXONOMY_CODE = "tc"; + final static private String FIELD_TAXONOMY_SCI_NAME = "sn"; + final static private String FIELD_TAXONOMY_COMM_NAME = "cn"; + final static private String FIELD_SEQUENCE_GENE_NAME = "gn"; + final static private String FIELD_SEQUENCE_SYMBOL = "sy"; + final static private String FIELD_UNIPROT_TAXONOMY_ID_SPLIT_1 = "i1"; + final static private String FIELD_UNIPROT_TAXONOMY_ID_SPLIT_2 = "i2"; + final static private String FIELD_DUMMY = "dummy"; + final static private String INTERNAL_NAMES_ARE_BOOT_SUPPPORT = "i"; + final static private String MIDPOINT_REROOT = "m"; + final static private String EXTRACT_TAXONOMY = "xt"; + final static private String EXTRACT_TAXONOMY_PF = "xp"; + final static private String ORDER_SUBTREES = "o"; + final static private String NO_TREE_LEVEL_INDENDATION = "ni"; + final static private String REPLACE_UNDER_SCORES = "ru"; + final static private String IGNORE_QUOTES = "iqs"; + final static private String PRG_NAME = "phyloxml_converter"; + final static private String PRG_VERSION = "1.302"; + final static private String PRG_DATE = "140516"; + final static private String E_MAIL = "phyloxml@gmail.com"; + final static private String WWW = "sites.google.com/site/cmzmasek/home/software/forester"; + final static private boolean SPECIAL = false; - public static void main( final String args[] ) { + public static void main( final String args[] ) throws PhyloXmlDataFormatException { ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW ); CommandLineArguments cla = null; try { @@ -100,6 +105,7 @@ public class phyloxml_converter { allowed_options.add( REPLACE_UNDER_SCORES ); allowed_options.add( EXTRACT_TAXONOMY ); allowed_options.add( EXTRACT_TAXONOMY_PF ); + allowed_options.add( IGNORE_QUOTES ); if ( cla.getNumberOfNames() != 2 ) { System.out.println(); System.out.println( "[" + PRG_NAME + "] incorrect number of arguments" ); @@ -123,24 +129,30 @@ public class phyloxml_converter { System.exit( -1 ); } final String field_option_value = cla.getOptionValue( FIELD_OPTION ); - PhylogenyNodeField field = null; + PhylogenyMethods.PhylogenyNodeField field = null; if ( field_option_value.equals( FIELD_CLADE_NAME ) ) { - field = PhylogenyNodeField.CLADE_NAME; + field = PhylogenyMethods.PhylogenyNodeField.CLADE_NAME; } else if ( field_option_value.equals( FIELD_TAXONOMY_CODE ) ) { - field = PhylogenyNodeField.TAXONOMY_CODE; + field = PhylogenyMethods.PhylogenyNodeField.TAXONOMY_CODE; } else if ( field_option_value.equals( FIELD_TAXONOMY_SCI_NAME ) ) { - field = PhylogenyNodeField.TAXONOMY_SCIENTIFIC_NAME; + field = PhylogenyMethods.PhylogenyNodeField.TAXONOMY_SCIENTIFIC_NAME; } else if ( field_option_value.equals( FIELD_TAXONOMY_COMM_NAME ) ) { - field = PhylogenyNodeField.TAXONOMY_COMMON_NAME; + field = PhylogenyMethods.PhylogenyNodeField.TAXONOMY_COMMON_NAME; } else if ( field_option_value.equals( FIELD_SEQUENCE_GENE_NAME ) ) { - field = PhylogenyNodeField.SEQUENCE_NAME; + field = PhylogenyMethods.PhylogenyNodeField.SEQUENCE_NAME; } else if ( field_option_value.equals( FIELD_SEQUENCE_SYMBOL ) ) { - field = PhylogenyNodeField.SEQUENCE_SYMBOL; + field = PhylogenyMethods.PhylogenyNodeField.SEQUENCE_SYMBOL; + } + else if ( field_option_value.equals( FIELD_UNIPROT_TAXONOMY_ID_SPLIT_1 ) ) { + field = PhylogenyMethods.PhylogenyNodeField.TAXONOMY_ID_UNIPROT_1; + } + else if ( field_option_value.equals( FIELD_UNIPROT_TAXONOMY_ID_SPLIT_2 ) ) { + field = PhylogenyMethods.PhylogenyNodeField.TAXONOMY_ID_UNIPROT_2; } else if ( field_option_value.equals( FIELD_DUMMY ) ) { } @@ -148,6 +160,10 @@ public class phyloxml_converter { ForesterUtil.fatalError( PRG_NAME, "unknown value for -\"" + FIELD_OPTION + "\" option: \"" + field_option_value + "\"" ); } + boolean ignore_quotes = false; + if ( cla.isOptionSet( IGNORE_QUOTES ) ) { + ignore_quotes = true; + } boolean int_values_are_boots = false; if ( cla.isOptionSet( INTERNAL_NAMES_ARE_BOOT_SUPPPORT ) ) { int_values_are_boots = true; @@ -187,26 +203,27 @@ public class phyloxml_converter { Phylogeny[] phys = null; try { final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); - final PhylogenyParser parser = ForesterUtil.createParserDependingOnFileType( infile, true ); + final PhylogenyParser parser = ParserUtils.createParserDependingOnFileType( infile, true ); if ( parser instanceof NHXParser ) { - if ( ( field != PhylogenyNodeField.TAXONOMY_CODE ) - && ( field != PhylogenyNodeField.TAXONOMY_COMMON_NAME ) - && ( field != PhylogenyNodeField.TAXONOMY_SCIENTIFIC_NAME ) ) { + if ( ( field != PhylogenyMethods.PhylogenyNodeField.TAXONOMY_CODE ) + && ( field != PhylogenyMethods.PhylogenyNodeField.TAXONOMY_COMMON_NAME ) + && ( field != PhylogenyMethods.PhylogenyNodeField.TAXONOMY_SCIENTIFIC_NAME ) ) { if ( extr_taxonomy_pf_only ) { ( ( NHXParser ) parser ) - .setTaxonomyExtraction( ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + .setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_STRICT ); replace_underscores = false; } else if ( extr_taxonomy ) { - ( ( NHXParser ) parser ).setTaxonomyExtraction( ForesterUtil.TAXONOMY_EXTRACTION.YES ); + ( ( NHXParser ) parser ) + .setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); replace_underscores = false; } } else { - ( ( NHXParser ) parser ).setTaxonomyExtraction( ForesterUtil.TAXONOMY_EXTRACTION.NO ); + ( ( NHXParser ) parser ).setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.NO ); } ( ( NHXParser ) parser ).setReplaceUnderscores( replace_underscores ); - ( ( NHXParser ) parser ).setIgnoreQuotes( false ); + ( ( NHXParser ) parser ).setIgnoreQuotes( ignore_quotes ); } else if ( parser instanceof NexusPhylogeniesParser ) { ( ( NexusPhylogeniesParser ) parser ).setReplaceUnderscores( replace_underscores ); @@ -224,12 +241,12 @@ public class phyloxml_converter { } if ( int_values_are_boots ) { for( final Phylogeny phy : phys ) { - ForesterUtil.transferInternalNamesToBootstrapSupport( phy ); + PhylogenyMethods.transferInternalNamesToBootstrapSupport( phy ); } } if ( field != null ) { for( final Phylogeny phy : phys ) { - ForesterUtil.transferNodeNameToField( phy, field ); + PhylogenyMethods.transferNodeNameToField( phy, field, false ); } } if ( midpoint_reroot ) { @@ -245,7 +262,10 @@ public class phyloxml_converter { } if ( order_subtrees ) { for( final Phylogeny phy : phys ) { - phy.orderAppearance( true ); + PhylogenyMethods.orderAppearance( phy.getRoot(), true, true, DESCENDANT_SORT_PRIORITY.TAXONOMY ); + phy.externalNodesHaveChanged(); + phy.clearHashIdToNodeMap(); + phy.recalculateNumberOfExternalDescendants( true ); } } try { @@ -340,37 +360,45 @@ public class phyloxml_converter { System.out.println( "Usage:" ); System.out.println(); System.out - .println( PRG_NAME - + " -" - + FIELD_OPTION - + "= [options] " ); + .println( PRG_NAME + + " -" + + FIELD_OPTION + + "= [options] " ); System.out.println(); System.out.println( " field options: " ); System.out.println(); - System.out.println( " " + FIELD_CLADE_NAME + ": transfer name to node/clade name" ); - System.out.println( " " + FIELD_TAXONOMY_CODE + ": transfer name to taxonomy code" ); - System.out.println( " " + FIELD_TAXONOMY_SCI_NAME + ": transfer name to taxonomy scientific name" ); - System.out.println( " " + FIELD_TAXONOMY_COMM_NAME + ": transfer name to taxonomy common name" ); - System.out.println( " " + FIELD_SEQUENCE_GENE_NAME + ": transfer name to sequence name" ); - System.out.println( " " + FIELD_SEQUENCE_SYMBOL + ": transfer name to sequence symbol" ); + System.out.println( " " + FIELD_CLADE_NAME + ": transfer name to node/clade name" ); + System.out.println( " " + FIELD_TAXONOMY_CODE + ": transfer name to taxonomy code" ); + System.out.println( " " + FIELD_TAXONOMY_SCI_NAME + ": transfer name to taxonomy scientific name" ); + System.out.println( " " + FIELD_TAXONOMY_COMM_NAME + ": transfer name to taxonomy common name" ); + System.out.println( " " + FIELD_SEQUENCE_GENE_NAME + ": transfer name to sequence name" ); + System.out.println( " " + FIELD_SEQUENCE_SYMBOL + ": transfer name to sequence symbol" ); + System.out.println( " " + FIELD_DUMMY + ": to convert NHX formatted trees to phyloXML" ); + System.out.println( " " + FIELD_UNIPROT_TAXONOMY_ID_SPLIT_1 + + ": transfer/split name to taxonomy uniprot identifier" ); + System.out.println( " (split at underscore if \"id_name\" pattern, e.g. \"817_SusD\")" ); + System.out.println( " " + FIELD_UNIPROT_TAXONOMY_ID_SPLIT_2 + + ": transfer/split name to taxonomy uniprot identifier" ); + System.out.println( " (split at underscore if \"name_id\" pattern, e.g. \"SusD_817\")" ); System.out.println(); System.out.println( " options: " ); System.out.println( " -" + INTERNAL_NAMES_ARE_BOOT_SUPPPORT - + " : internal names in NH or NHX tree are bootstrap support values" ); - System.out.println( " -" + REPLACE_UNDER_SCORES + ": replace all underscores with spaces" ); - System.out.println( " -" + MIDPOINT_REROOT + " : midpoint reroot" ); - System.out.println( " -" + ORDER_SUBTREES + " : order subtrees" ); + + " : internal names in NH or NHX tree are bootstrap support values" ); + System.out.println( " -" + REPLACE_UNDER_SCORES + " : replace all underscores with spaces" ); + System.out.println( " -" + MIDPOINT_REROOT + " : midpoint reroot" ); + System.out.println( " -" + ORDER_SUBTREES + " : order subtrees" ); System.out - .println( " -" - + EXTRACT_TAXONOMY - + ": extract taxonomy to taxonomy code from \"seqname_TAXON\"-style names (cannot be used with the following field options: " - + FIELD_TAXONOMY_CODE + ", " + FIELD_TAXONOMY_COMM_NAME + ", " + FIELD_TAXONOMY_SCI_NAME + ")" ); + .println( " -" + + EXTRACT_TAXONOMY + + " : extract taxonomy to taxonomy code from \"seqname_TAXON\"-style names (cannot be used with the following field options: " + + FIELD_TAXONOMY_CODE + ", " + FIELD_TAXONOMY_COMM_NAME + ", " + FIELD_TAXONOMY_SCI_NAME + ")" ); System.out - .println( " -" - + EXTRACT_TAXONOMY_PF - + ": extract taxonomy to taxonomy code from Pfam (\"seqname_TAXON/x-y\") style names only (cannot be used with the following field options: " - + FIELD_TAXONOMY_CODE + ", " + FIELD_TAXONOMY_COMM_NAME + ", " + FIELD_TAXONOMY_SCI_NAME + ")" ); - System.out.println( " -" + NO_TREE_LEVEL_INDENDATION + ": no tree level indendation in phyloXML output" ); + .println( " -" + + EXTRACT_TAXONOMY_PF + + " : extract taxonomy to taxonomy code from Pfam (\"seqname_TAXON/x-y\") style names only (cannot be used with the following field options: " + + FIELD_TAXONOMY_CODE + ", " + FIELD_TAXONOMY_COMM_NAME + ", " + FIELD_TAXONOMY_SCI_NAME + ")" ); + System.out.println( " -" + NO_TREE_LEVEL_INDENDATION + " : no tree level indendation in phyloXML output" ); + System.out.println( " -" + IGNORE_QUOTES + ": ignore quotes and whitespace (e.g. \"a b\" becomes ab)" ); System.out.println(); } }