From e0914db6b0fa3516bc77186eec4d36dd9a753a24 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Fri, 12 Oct 2012 02:36:51 +0000 Subject: [PATCH] in progress --- .../src/org/forester/application/decorator.java | 75 +++++++++++--------- .../src/org/forester/tools/PhylogenyDecorator.java | 47 ++++++------ 2 files changed, 65 insertions(+), 57 deletions(-) diff --git a/forester/java/src/org/forester/application/decorator.java b/forester/java/src/org/forester/application/decorator.java index 88296e9..f1b2fbc 100644 --- a/forester/java/src/org/forester/application/decorator.java +++ b/forester/java/src/org/forester/application/decorator.java @@ -47,32 +47,32 @@ import org.forester.util.ForesterUtil; public final class decorator { - private static final String SEQUENCE_NAME_FIELD = "s"; - private static final String TAXONOMY_CODE_FIELD = "c"; - private static final String TAXONOMY_SCIENTIFIC_NAME_FIELD = "sn"; - private static final String DS_FILED = "d"; - private static final String SEQUENCE_ANNOTATION_DESC = "a"; - private static final String NODE_NAME_FIELD = "n"; - final static private String PICKY_OPTION = "p"; - final static private String FIELD_OPTION = "f"; - final static private String TRIM_AFTER_TILDE_OPTION = "t"; - final static private String MOVE_DOMAIN_NUMBER_OPTION = "mdn"; // Hidden expert option. - final static private String TREE_NAME_OPTION = "pn"; - final static private String TREE_ID_OPTION = "pi"; - final static private String TREE_DESC_OPTION = "pd"; - final static private String EXTRACT_BRACKETED_SCIENTIC_NAME_OPTION = "sn"; - final static private String PROCESS_NAME_INTELLIGENTLY_OPTION = "x"; - final static private String PROCESS_SIMILAR_TO_OPTION = "xs"; - final static private String CUT_NAME_AFTER_FIRST_SPACE_OPTION = "c"; - final static private String ALLOW_REMOVAL_OF_CHARS_OPTION = "r"; - final static private String ADVANCED_TABLE_OPTION = "table"; - final static private String KEY_COLUMN = "k"; - final static private String VALUE_COLUMN = "v"; - final static private String MAPPING_FILE_SEPARATOR_OPTION = "s"; - final static private String MAPPING_FILE_SEPARATOR_DEFAULT = ": "; - final static private String PRG_NAME = "decorator"; - final static private String PRG_VERSION = "1.11"; - final static private String PRG_DATE = "2012.09.15"; + private static final String SEQUENCE_NAME_FIELD = "s"; + private static final String TAXONOMY_CODE_FIELD = "c"; + private static final String TAXONOMY_SCIENTIFIC_NAME_FIELD = "sn"; + private static final String DS_FILED = "d"; + private static final String SEQUENCE_ANNOTATION_DESC = "a"; + private static final String NODE_NAME_FIELD = "n"; + final static private String PICKY_OPTION = "p"; + final static private String FIELD_OPTION = "f"; + final static private String TRIM_AFTER_TILDE_OPTION = "t"; + final static private String TREE_NAME_OPTION = "pn"; + final static private String TREE_ID_OPTION = "pi"; + final static private String TREE_DESC_OPTION = "pd"; + final static private String EXTRACT_BRACKETED_SCIENTIC_NAME_OPTION = "sn"; + final static private String EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION = "tc"; + final static private String PROCESS_NAME_INTELLIGENTLY_OPTION = "x"; + final static private String PROCESS_SIMILAR_TO_OPTION = "xs"; + final static private String CUT_NAME_AFTER_FIRST_SPACE_OPTION = "c"; + final static private String ALLOW_REMOVAL_OF_CHARS_OPTION = "r"; + final static private String ADVANCED_TABLE_OPTION = "table"; + final static private String KEY_COLUMN = "k"; + final static private String VALUE_COLUMN = "v"; + final static private String MAPPING_FILE_SEPARATOR_OPTION = "s"; + final static private String MAPPING_FILE_SEPARATOR_DEFAULT = ": "; + final static private String PRG_NAME = "decorator"; + final static private String PRG_VERSION = "1.11"; + final static private String PRG_DATE = "2012.09.15"; private static void argumentsError() { System.out.println(); @@ -105,7 +105,9 @@ public final class decorator { System.out.println( " -v= : value column in mapping table (0 based)," ); System.out.println( " data which with to decorate - default is 1" ); System.out.println( " -" + EXTRACT_BRACKETED_SCIENTIC_NAME_OPTION - + " : to extract bracketed scientific names" ); + + " : to extract bracketed scientific names, e.g. [Nematostella vectensis]" ); + System.out.println( " -" + EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION + + " : to extract bracketed taxonomic codes, e.g. [NEMVE]" ); System.out.println( " -s= : column separator in mapping file, default is \"" + decorator.MAPPING_FILE_SEPARATOR_DEFAULT + "\"" ); System.out.println( " -x : process name \"intelligently\" (only for -f=n)" ); @@ -154,7 +156,6 @@ public final class decorator { allowed_options.add( decorator.TREE_NAME_OPTION ); allowed_options.add( decorator.TREE_ID_OPTION ); allowed_options.add( decorator.TREE_DESC_OPTION ); - allowed_options.add( decorator.MOVE_DOMAIN_NUMBER_OPTION ); allowed_options.add( decorator.TRIM_AFTER_TILDE_OPTION ); final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); if ( dissallowed_options.length() > 0 ) { @@ -186,7 +187,7 @@ public final class decorator { boolean process_name_intelligently = false; boolean process_similar_to = false; boolean extract_bracketed_scientific_name = false; - boolean move_domain_numbers_at_end_to_middle = false; + boolean extract_bracketed_tax_code = false; boolean trim_after_tilde = false; String tree_name = ""; String tree_id = ""; @@ -207,6 +208,12 @@ public final class decorator { } extract_bracketed_scientific_name = true; } + if ( cla.isOptionSet( decorator.EXTRACT_BRACKETED_TAXONOMIC_CODE_OPTION ) ) { + if ( advanced_table ) { + argumentsError(); + } + extract_bracketed_tax_code = true; + } if ( cla.isOptionSet( decorator.KEY_COLUMN ) ) { if ( advanced_table ) { argumentsError(); @@ -247,9 +254,6 @@ public final class decorator { numbers_of_chars_allowed_to_remove_if_not_found_in_map = cla .getOptionValueAsInt( decorator.ALLOW_REMOVAL_OF_CHARS_OPTION ); } - if ( cla.isOptionSet( decorator.MOVE_DOMAIN_NUMBER_OPTION ) ) { - move_domain_numbers_at_end_to_middle = true; - } if ( cla.isOptionSet( decorator.FIELD_OPTION ) ) { field_str = cla.getOptionValue( decorator.FIELD_OPTION ); if ( field_str.equals( NODE_NAME_FIELD ) ) { @@ -261,6 +265,7 @@ public final class decorator { else if ( field_str.equals( DS_FILED ) ) { field = FIELD.DOMAIN_STRUCTURE; extract_bracketed_scientific_name = false; + extract_bracketed_tax_code = false; } else if ( field_str.equals( TAXONOMY_CODE_FIELD ) ) { field = FIELD.TAXONOMY_CODE; @@ -271,6 +276,7 @@ public final class decorator { else if ( field_str.equals( TAXONOMY_SCIENTIFIC_NAME_FIELD ) ) { field = FIELD.TAXONOMY_SCIENTIFIC_NAME; extract_bracketed_scientific_name = false; + extract_bracketed_tax_code = false; } else { ForesterUtil.fatalError( decorator.PRG_NAME, "unknown value for \"" + decorator.FIELD_OPTION @@ -299,6 +305,9 @@ public final class decorator { ForesterUtil.fatalError( decorator.PRG_NAME, "attempt to use -" + decorator.PROCESS_SIMILAR_TO_OPTION + " and -c option together" ); } + if ( extract_bracketed_scientific_name && extract_bracketed_tax_code ) { + argumentsError(); + } Phylogeny[] phylogenies = null; try { final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); @@ -367,12 +376,12 @@ public final class decorator { map, field, extract_bracketed_scientific_name, + extract_bracketed_tax_code, picky, cut_name_after_space, process_name_intelligently, process_similar_to, numbers_of_chars_allowed_to_remove_if_not_found_in_map, - move_domain_numbers_at_end_to_middle, trim_after_tilde ); } } diff --git a/forester/java/src/org/forester/tools/PhylogenyDecorator.java b/forester/java/src/org/forester/tools/PhylogenyDecorator.java index 3191201..cf3d280 100644 --- a/forester/java/src/org/forester/tools/PhylogenyDecorator.java +++ b/forester/java/src/org/forester/tools/PhylogenyDecorator.java @@ -29,7 +29,6 @@ import java.io.File; import java.io.IOException; import java.util.HashMap; import java.util.Map; -import java.util.regex.Matcher; import java.util.regex.Pattern; import org.forester.archaeopteryx.AptxUtil; @@ -183,25 +182,25 @@ public final class PhylogenyDecorator { final Map map, final FIELD field, final boolean extract_bracketed_scientific_name, + final boolean extract_bracketed_tax_code, final boolean picky, final boolean cut_name_after_space, final boolean process_name_intelligently, final boolean process_similar_to, final int numbers_of_chars_allowed_to_remove_if_not_found_in_map, - final boolean move_domain_numbers_at_end_to_middle, final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException, PhyloXmlDataFormatException { PhylogenyDecorator.decorate( phylogeny, map, field, extract_bracketed_scientific_name, + extract_bracketed_tax_code, picky, null, cut_name_after_space, process_name_intelligently, process_similar_to, numbers_of_chars_allowed_to_remove_if_not_found_in_map, - move_domain_numbers_at_end_to_middle, trim_after_tilde ); } @@ -224,13 +223,13 @@ public final class PhylogenyDecorator { final Map map, final FIELD field, final boolean extract_bracketed_scientific_name, + final boolean extract_bracketed_tax_code, final boolean picky, final Map intermediate_map, final boolean cut_name_after_space, final boolean process_name_intelligently, final boolean process_similar_to, final int numbers_of_chars_allowed_to_remove_if_not_found_in_map, - final boolean move_domain_numbers_at_end_to_middle, final boolean trim_after_tilde ) throws IllegalArgumentException, PhyloXmlDataFormatException { if ( extract_bracketed_scientific_name && ( field == FIELD.TAXONOMY_SCIENTIFIC_NAME ) ) { @@ -278,6 +277,9 @@ public final class PhylogenyDecorator { if ( extract_bracketed_scientific_name && new_value.endsWith( "]" ) ) { new_value = extractBracketedScientificNames( node, new_value ); } + else if ( extract_bracketed_tax_code && new_value.endsWith( "]" ) ) { + new_value = extractBracketedTaxCodes( node, new_value ); + } switch ( field ) { case SEQUENCE_ANNOTATION_DESC: if ( PhylogenyDecorator.VERBOSE ) { @@ -356,9 +358,6 @@ public final class PhylogenyDecorator { default: throw new RuntimeException( "unknown field \"" + field + "\"" ); } - if ( move_domain_numbers_at_end_to_middle && ( field != FIELD.NODE_NAME ) ) { - node.setName( moveDomainNumbersAtEnd( node.getName() ) ); - } } } else if ( picky ) { @@ -385,12 +384,12 @@ public final class PhylogenyDecorator { final Map map, final FIELD field, final boolean extract_bracketed_scientific_name, + final boolean extract_bracketed_tax_code, final boolean picky, final boolean cut_name_after_space, final boolean process_name_intelligently, final boolean process_similar_to, final int numbers_of_chars_allowed_to_remove_if_not_found_in_map, - final boolean move_domain_numbers_at_end_to_middle, final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException, PhyloXmlDataFormatException { for( int i = 0; i < phylogenies.length; ++i ) { @@ -398,12 +397,12 @@ public final class PhylogenyDecorator { map, field, extract_bracketed_scientific_name, + extract_bracketed_tax_code, picky, cut_name_after_space, process_name_intelligently, process_similar_to, numbers_of_chars_allowed_to_remove_if_not_found_in_map, - move_domain_numbers_at_end_to_middle, trim_after_tilde ); } } @@ -412,13 +411,13 @@ public final class PhylogenyDecorator { final Map map, final FIELD field, final boolean extract_bracketed_scientific_name, + final boolean extract_bracketed_tax_code, final boolean picky, final Map intermediate_map, final boolean cut_name_after_space, final boolean process_name_intelligently, final boolean process_similar_to, final int numbers_of_chars_allowed_to_remove_if_not_found_in_map, - final boolean move_domain_numbers_at_end_to_middle, final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException, PhyloXmlDataFormatException { for( int i = 0; i < phylogenies.length; ++i ) { @@ -426,13 +425,13 @@ public final class PhylogenyDecorator { map, field, extract_bracketed_scientific_name, + extract_bracketed_tax_code, picky, intermediate_map, cut_name_after_space, process_name_intelligently, process_similar_to, numbers_of_chars_allowed_to_remove_if_not_found_in_map, - move_domain_numbers_at_end_to_middle, trim_after_tilde ); } } @@ -453,6 +452,19 @@ public final class PhylogenyDecorator { return new_value.substring( 0, i - 1 ).trim(); } + private static String extractBracketedTaxCodes( final PhylogenyNode node, final String new_value ) { + final int i = new_value.lastIndexOf( "[" ); + final String tc = new_value.substring( i + 1, new_value.length() - 1 ); + AptxUtil.ensurePresenceOfTaxonomy( node ); + try { + node.getNodeData().getTaxonomy().setTaxonomyCode( tc ); + } + catch ( final PhyloXmlDataFormatException e ) { + throw new IllegalArgumentException( "illegal format for taxonomy code: " + tc ); + } + return new_value.substring( 0, i - 1 ).trim(); + } + private static String extractIntermediate( final Map intermediate_map, final String name ) { String new_name = null; if ( PhylogenyDecorator.VERBOSE ) { @@ -473,19 +485,6 @@ public final class PhylogenyDecorator { return new_name; } - private static String moveDomainNumbersAtEnd( final String node_name ) { - final Matcher m = NODENAME_SEQNUMBER_TAXDOMAINNUMBER.matcher( node_name ); - if ( m.matches() ) { - final String seq_number = m.group( 1 ); - final String tax = m.group( 2 ); - final String domain_number = m.group( 3 ); - return seq_number + "_[" + domain_number + "]_" + tax; - } - else { - return node_name; - } - } - public static Map> parseMappingTable( final File mapping_table_file ) throws IOException { final Map> map = new HashMap>(); -- 1.7.10.2