X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Ftools%2FPhylogenyDecorator.java;h=74c4fe95cf967e803db392e526dacb2c5d7ce3b6;hb=36cad5d7aee97eedcf1e42889ba4a1384b887777;hp=f01701a0685ea380358078bfeda890751ff4a7cf;hpb=0cbfc79c69ccbfca7ac42a1381d62d449bf1adf6;p=jalview.git diff --git a/forester/java/src/org/forester/tools/PhylogenyDecorator.java b/forester/java/src/org/forester/tools/PhylogenyDecorator.java index f01701a..74c4fe9 100644 --- a/forester/java/src/org/forester/tools/PhylogenyDecorator.java +++ b/forester/java/src/org/forester/tools/PhylogenyDecorator.java @@ -21,7 +21,7 @@ // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA // // Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester +// WWW: https://sites.google.com/site/cmzmasek/home/software/forester package org.forester.tools; @@ -30,11 +30,10 @@ import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.regex.Matcher; -import java.util.regex.Pattern; -import org.forester.archaeopteryx.AptxUtil; import org.forester.io.parsers.nhx.NHXFormatException; import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; +import org.forester.io.parsers.util.ParserUtils; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.data.Accession; @@ -49,25 +48,21 @@ import org.forester.util.ForesterUtil; public final class PhylogenyDecorator { + final private static String TP_NODE_NAME = "NODE_NAME"; + final private static String TP_SEQ_ACCESSION = "SEQ_ACCESSION"; + final private static String TP_SEQ_ACCESSION_SOURCE = "SEQ_ACCESSION_SOURCE"; + final private static String TP_SEQ_ANNOTATION_DESC = "SEQ_ANNOTATION_DESC"; + final private static String TP_SEQ_ANNOTATION_REF = "SEQ_ANNOTATION_REF"; + final private static String TP_SEQ_MOL_SEQ = "SEQ_MOL_SEQ"; + final private static String TP_SEQ_NAME = "SEQ_NAME"; + final private static String TP_SEQ_SYMBOL = "SEQ_SYMBOL"; + final private static String TP_TAXONOMY_CN = "TAXONOMY_CN"; // From evoruby/lib/evo/apps/tseq_taxonomy_processor.rb: - final private static String TP_TAXONOMY_CODE = "TAXONOMY_CODE"; - final private static String TP_TAXONOMY_ID = "TAXONOMY_ID"; - final private static String TP_TAXONOMY_ID_PROVIDER = "TAXONOMY_ID_PROVIDER"; - final private static String TP_TAXONOMY_SN = "TAXONOMY_SN"; - final private static String TP_TAXONOMY_CN = "TAXONOMY_CN"; - final private static String TP_TAXONOMY_SYN = "TAXONOMY_SYN"; - final private static String TP_SEQ_SYMBOL = "SEQ_SYMBOL"; - final private static String TP_SEQ_ACCESSION = "SEQ_ACCESSION"; - final private static String TP_SEQ_ACCESSION_SOURCE = "SEQ_ACCESSION_SOURCE"; - final private static String TP_SEQ_ANNOTATION_DESC = "SEQ_ANNOTATION_DESC"; - final private static String TP_SEQ_ANNOTATION_REF = "SEQ_ANNOTATION_REF"; - final private static String TP_SEQ_MOL_SEQ = "SEQ_MOL_SEQ"; - final private static String TP_SEQ_NAME = "SEQ_NAME"; - final private static String TP_NODE_NAME = "NODE_NAME"; - final private static Pattern NODENAME_SEQNUMBER_TAXDOMAINNUMBER = Pattern - .compile( "^([a-fA-Z0-9]{1,5})_([A-Z0-9]{2,4}[A-Z])(\\d{1,4})$" ); - public final static boolean SANITIZE = false; - public final static boolean VERBOSE = true; + final private static String TP_TAXONOMY_CODE = "TAXONOMY_CODE"; + final private static String TP_TAXONOMY_ID = "TAXONOMY_ID"; + final private static String TP_TAXONOMY_ID_PROVIDER = "TAXONOMY_ID_PROVIDER"; + final private static String TP_TAXONOMY_SN = "TAXONOMY_SN"; + final private static String TP_TAXONOMY_SYN = "TAXONOMY_SYN"; private PhylogenyDecorator() { // Not needed. @@ -75,80 +70,72 @@ public final class PhylogenyDecorator { public static void decorate( final Phylogeny phylogeny, final Map> map, - final boolean picky, - final int numbers_of_chars_allowed_to_remove_if_not_found_in_map ) - throws IllegalArgumentException, PhyloXmlDataFormatException { + final boolean picky ) throws IllegalArgumentException, PhyloXmlDataFormatException { for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) { final PhylogenyNode node = iter.next(); final String name = node.getName(); if ( !ForesterUtil.isEmpty( name ) ) { - if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) { - Map new_values = map.get( name ); - int x = 0; - while ( ( new_values == null ) && ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) - && ( x <= numbers_of_chars_allowed_to_remove_if_not_found_in_map ) ) { - new_values = map.get( name.substring( 0, name.length() - x ) ); - ++x; - } + if ( map.containsKey( name ) ) { + final Map new_values = map.get( name ); if ( new_values != null ) { if ( new_values.containsKey( TP_TAXONOMY_CODE ) ) { - AptxUtil.ensurePresenceOfTaxonomy( node ); + ForesterUtil.ensurePresenceOfTaxonomy( node ); node.getNodeData().getTaxonomy().setTaxonomyCode( new_values.get( TP_TAXONOMY_CODE ) ); } if ( new_values.containsKey( TP_TAXONOMY_ID ) && new_values.containsKey( TP_TAXONOMY_ID_PROVIDER ) ) { - AptxUtil.ensurePresenceOfTaxonomy( node ); + ForesterUtil.ensurePresenceOfTaxonomy( node ); node.getNodeData() .getTaxonomy() .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ), new_values.get( TP_TAXONOMY_ID_PROVIDER ) ) ); } else if ( new_values.containsKey( TP_TAXONOMY_ID ) ) { - AptxUtil.ensurePresenceOfTaxonomy( node ); + ForesterUtil.ensurePresenceOfTaxonomy( node ); node.getNodeData().getTaxonomy() .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ) ) ); } if ( new_values.containsKey( TP_TAXONOMY_SN ) ) { - AptxUtil.ensurePresenceOfTaxonomy( node ); + ForesterUtil.ensurePresenceOfTaxonomy( node ); node.getNodeData().getTaxonomy().setScientificName( new_values.get( TP_TAXONOMY_SN ) ); } if ( new_values.containsKey( TP_TAXONOMY_CN ) ) { - AptxUtil.ensurePresenceOfTaxonomy( node ); + ForesterUtil.ensurePresenceOfTaxonomy( node ); node.getNodeData().getTaxonomy().setCommonName( new_values.get( TP_TAXONOMY_CN ) ); } if ( new_values.containsKey( TP_TAXONOMY_SYN ) ) { - AptxUtil.ensurePresenceOfTaxonomy( node ); + ForesterUtil.ensurePresenceOfTaxonomy( node ); node.getNodeData().getTaxonomy().getSynonyms().add( new_values.get( TP_TAXONOMY_SYN ) ); } if ( new_values.containsKey( TP_SEQ_ACCESSION ) && new_values.containsKey( TP_SEQ_ACCESSION_SOURCE ) ) { - AptxUtil.ensurePresenceOfSequence( node ); + ForesterUtil.ensurePresenceOfSequence( node ); node.getNodeData() .getSequence() .setAccession( new Accession( new_values.get( TP_SEQ_ACCESSION ), new_values.get( TP_SEQ_ACCESSION_SOURCE ) ) ); } if ( new_values.containsKey( TP_SEQ_ANNOTATION_DESC ) ) { - AptxUtil.ensurePresenceOfSequence( node ); - final Annotation ann = new Annotation( "?" ); + ForesterUtil.ensurePresenceOfSequence( node ); + final Annotation ann = new Annotation(); ann.setDesc( new_values.get( TP_SEQ_ANNOTATION_DESC ) ); node.getNodeData().getSequence().addAnnotation( ann ); } if ( new_values.containsKey( TP_SEQ_ANNOTATION_REF ) ) { - AptxUtil.ensurePresenceOfSequence( node ); + ForesterUtil.ensurePresenceOfSequence( node ); final Annotation ann = new Annotation( new_values.get( TP_SEQ_ANNOTATION_REF ) ); node.getNodeData().getSequence().addAnnotation( ann ); } if ( new_values.containsKey( TP_SEQ_SYMBOL ) ) { - AptxUtil.ensurePresenceOfSequence( node ); + ForesterUtil.ensurePresenceOfSequence( node ); node.getNodeData().getSequence().setSymbol( new_values.get( TP_SEQ_SYMBOL ) ); } if ( new_values.containsKey( TP_SEQ_NAME ) ) { - AptxUtil.ensurePresenceOfSequence( node ); + ForesterUtil.ensurePresenceOfSequence( node ); node.getNodeData().getSequence().setName( new_values.get( TP_SEQ_NAME ) ); } if ( new_values.containsKey( TP_SEQ_MOL_SEQ ) ) { - AptxUtil.ensurePresenceOfSequence( node ); + ForesterUtil.ensurePresenceOfSequence( node ); node.getNodeData().getSequence().setMolecularSequence( new_values.get( TP_SEQ_MOL_SEQ ) ); } if ( new_values.containsKey( TP_NODE_NAME ) ) { @@ -157,50 +144,32 @@ public final class PhylogenyDecorator { } // if ( new_values != null ) } // if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) else if ( picky ) { - System.out.println( map.toString() ); throw new IllegalArgumentException( "\"" + name + "\" not found in name map" ); } } } } - /** - * - * - * - * - * - * @param phylogeny - * @param map - * maps names (in phylogeny) to new values - * @param field - * @param picky - * @throws IllegalArgumentException - * @throws NHXFormatException - * @throws PhyloXmlDataFormatException - */ - public static void decorate( final Phylogeny phylogeny, - final Map map, - final FIELD field, - final boolean extract_bracketed_scientific_name, - final boolean picky, - final boolean cut_name_after_space, - final boolean process_name_intelligently, - final boolean process_similar_to, - final int numbers_of_chars_allowed_to_remove_if_not_found_in_map, - final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException, - NHXFormatException, PhyloXmlDataFormatException { - PhylogenyDecorator.decorate( phylogeny, - map, - field, - extract_bracketed_scientific_name, - picky, - null, - cut_name_after_space, - process_name_intelligently, - process_similar_to, - numbers_of_chars_allowed_to_remove_if_not_found_in_map, - move_domain_numbers_at_end_to_middle ); + public static String decorate( final Phylogeny phylogeny, + final Map map, + final FIELD field, + final boolean extract_bracketed_scientific_name, + final boolean extract_bracketed_tax_code, + final boolean picky, + final boolean cut_name_after_space, + final boolean trim_after_tilde, + final boolean verbose ) throws IllegalArgumentException, NHXFormatException, + PhyloXmlDataFormatException { + return PhylogenyDecorator.decorate( phylogeny, + map, + field, + extract_bracketed_scientific_name, + extract_bracketed_tax_code, + picky, + null, + cut_name_after_space, + trim_after_tilde, + verbose ); } /** @@ -218,56 +187,96 @@ public final class PhylogenyDecorator { * @throws IllegalArgumentException * @throws PhyloXmlDataFormatException */ - public static void decorate( final Phylogeny phylogeny, - final Map map, - final FIELD field, - final boolean extract_bracketed_scientific_name, - final boolean picky, - final Map intermediate_map, - final boolean cut_name_after_space, - final boolean process_name_intelligently, - final boolean process_similar_to, - final int numbers_of_chars_allowed_to_remove_if_not_found_in_map, - final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException, - PhyloXmlDataFormatException { + public static String decorate( final Phylogeny phylogeny, + final Map map, + final FIELD field, + final boolean extract_bracketed_scientific_name, + final boolean extract_bracketed_tax_code, + final boolean picky, + final Map intermediate_map, + final boolean cut_name_after_space, + final boolean trim_after_tilde, + final boolean verbose ) throws IllegalArgumentException, PhyloXmlDataFormatException { if ( extract_bracketed_scientific_name && ( field == FIELD.TAXONOMY_SCIENTIFIC_NAME ) ) { - throw new IllegalArgumentException( "Attempt to extract bracketed scientific name together with data field pointing to scientific name" ); + throw new IllegalArgumentException( "attempt to extract bracketed scientific name together with data field pointing to scientific name" ); + } + if ( map.isEmpty() ) { + throw new IllegalArgumentException( "map is empty" ); } + int ext_nodes = 0; + int ext_nodes_updated = 0; + int int_nodes = 0; + int int_nodes_updated = 0; for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) { final PhylogenyNode node = iter.next(); + if ( node.isExternal() ) { + ++ext_nodes; + } + else { + ++int_nodes; + } String name = node.getName(); + if ( picky && node.isExternal() && ForesterUtil.isEmpty( name ) ) { + throw new IllegalArgumentException( "external node with no name present" ); + } + String tilde_annotation = null; + if ( trim_after_tilde && ( name.indexOf( '~' ) > 0 ) ) { + final int ti = name.indexOf( '~' ); + final String orig = name; + tilde_annotation = name.substring( ti ); + name = name.substring( 0, ti ); + if ( node.isExternal() && ForesterUtil.isEmpty( name ) ) { + throw new IllegalArgumentException( "external node with illegal name: " + orig ); + } + } if ( !ForesterUtil.isEmpty( name ) ) { if ( intermediate_map != null ) { - name = PhylogenyDecorator.extractIntermediate( intermediate_map, name ); + name = PhylogenyDecorator.extractIntermediate( intermediate_map, name, verbose ); } - if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) { - String new_value = map.get( name ); - int x = 0; - while ( ( new_value == null ) && ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) - && ( x <= numbers_of_chars_allowed_to_remove_if_not_found_in_map ) ) { - new_value = map.get( name.substring( 0, name.length() - x ) ); - ++x; - } - if ( new_value != null ) { - new_value = new_value.trim(); - new_value.replaceAll( "/\\s+/", " " ); + if ( map.containsKey( name ) ) { + String new_value = map.get( name ).trim().replaceAll( "/\\s+/", " " ); + if ( !ForesterUtil.isEmpty( new_value ) ) { + if ( node.isExternal() ) { + ++ext_nodes_updated; + } + else { + ++int_nodes_updated; + } if ( extract_bracketed_scientific_name && new_value.endsWith( "]" ) ) { - extractBracketedScientificNames( node, new_value ); + new_value = extractBracketedScientificNames( node, new_value ); + } + else if ( extract_bracketed_tax_code ) { + if ( ParserUtils.TAXOMONY_CODE_PATTERN_BRACKETED.matcher( new_value ).find() ) { + new_value = extractBracketedTaxCodes( node, new_value ); + } + else if ( picky ) { + throw new IllegalArgumentException( " could not get taxonomy from \"" + new_value + + "\"" ); + } } switch ( field ) { + case MOL_SEQ: + if ( verbose ) { + System.out.println( name + ": " + new_value ); + } + if ( !node.getNodeData().isHasSequence() ) { + node.getNodeData().setSequence( new Sequence() ); + } + node.getNodeData().getSequence().setMolecularSequence( new_value ); + break; case SEQUENCE_ANNOTATION_DESC: - if ( PhylogenyDecorator.VERBOSE ) { + if ( verbose ) { System.out.println( name + ": " + new_value ); } if ( !node.getNodeData().isHasSequence() ) { node.getNodeData().setSequence( new Sequence() ); } - final Annotation annotation = new Annotation( "?" ); + final Annotation annotation = new Annotation(); annotation.setDesc( new_value ); node.getNodeData().getSequence().addAnnotation( annotation ); break; case DOMAIN_STRUCTURE: - if ( PhylogenyDecorator.VERBOSE ) { + if ( verbose ) { System.out.println( name + ": " + new_value ); } if ( !node.getNodeData().isHasSequence() ) { @@ -277,21 +286,24 @@ public final class PhylogenyDecorator { .setDomainArchitecture( new DomainArchitecture( new_value ) ); break; case TAXONOMY_CODE: - if ( PhylogenyDecorator.VERBOSE ) { + if ( verbose ) { System.out.println( name + ": " + new_value ); } - AptxUtil.ensurePresenceOfTaxonomy( node ); + ForesterUtil.ensurePresenceOfTaxonomy( node ); node.getNodeData().getTaxonomy().setTaxonomyCode( new_value ); break; case TAXONOMY_SCIENTIFIC_NAME: - if ( PhylogenyDecorator.VERBOSE ) { + if ( verbose ) { System.out.println( name + ": " + new_value ); } - AptxUtil.ensurePresenceOfTaxonomy( node ); + ForesterUtil.ensurePresenceOfTaxonomy( node ); node.getNodeData().getTaxonomy().setScientificName( new_value ); break; case SEQUENCE_NAME: - if ( PhylogenyDecorator.VERBOSE ) { + if ( trim_after_tilde ) { + new_value = addTildeAnnotation( tilde_annotation, new_value ); + } + if ( verbose ) { System.out.println( name + ": " + new_value ); } if ( !node.getNodeData().isHasSequence() ) { @@ -300,31 +312,19 @@ public final class PhylogenyDecorator { node.getNodeData().getSequence().setName( new_value ); break; case NODE_NAME: - if ( PhylogenyDecorator.VERBOSE ) { + if ( verbose ) { System.out.print( name + " -> " ); } if ( cut_name_after_space ) { - if ( PhylogenyDecorator.VERBOSE ) { + if ( verbose ) { System.out.print( new_value + " -> " ); } new_value = PhylogenyDecorator.deleteAtFirstSpace( new_value ); } - else if ( process_name_intelligently ) { - if ( PhylogenyDecorator.VERBOSE ) { - System.out.print( new_value + " -> " ); - } - new_value = PhylogenyDecorator.processNameIntelligently( new_value ); + if ( trim_after_tilde ) { + new_value = addTildeAnnotation( tilde_annotation, new_value ); } - else if ( process_similar_to ) { - if ( PhylogenyDecorator.VERBOSE ) { - System.out.print( new_value + " -> " ); - } - new_value = PhylogenyDecorator.processSimilarTo( new_value ); - } - if ( PhylogenyDecorator.SANITIZE ) { - new_value = PhylogenyDecorator.sanitize( new_value ); - } - if ( PhylogenyDecorator.VERBOSE ) { + if ( verbose ) { System.out.println( new_value ); } node.setName( new_value ); @@ -332,81 +332,49 @@ public final class PhylogenyDecorator { default: throw new RuntimeException( "unknown field \"" + field + "\"" ); } - if ( move_domain_numbers_at_end_to_middle && ( field != FIELD.NODE_NAME ) ) { - node.setName( moveDomainNumbersAtEnd( node.getName() ) ); - } + } + else { + throw new IllegalArgumentException( "node name \"" + name + "\" maps to empty value" ); } } else if ( picky ) { - throw new IllegalArgumentException( "\"" + name + "\" not found in name map" ); + throw new IllegalArgumentException( "node name \"" + name + "\" not found in map" ); } } } + return "updated " + ext_nodes_updated + "/" + ext_nodes + " external nodes, updated " + int_nodes_updated + "/" + + int_nodes + " internal nodes"; } - public static void decorate( final Phylogeny[] phylogenies, - final Map> map, - final boolean picky, - final int numbers_of_chars_allowed_to_remove_if_not_found_in_map ) - throws IllegalArgumentException, NHXFormatException, PhyloXmlDataFormatException { - for( int i = 0; i < phylogenies.length; ++i ) { - PhylogenyDecorator.decorate( phylogenies[ i ], - map, - picky, - numbers_of_chars_allowed_to_remove_if_not_found_in_map ); - } - } - - public static void decorate( final Phylogeny[] phylogenies, - final Map map, - final FIELD field, - final boolean extract_bracketed_scientific_name, - final boolean picky, - final boolean cut_name_after_space, - final boolean process_name_intelligently, - final boolean process_similar_to, - final int numbers_of_chars_allowed_to_remove_if_not_found_in_map, - final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException, - NHXFormatException, PhyloXmlDataFormatException { - for( int i = 0; i < phylogenies.length; ++i ) { - PhylogenyDecorator.decorate( phylogenies[ i ], - map, - field, - extract_bracketed_scientific_name, - picky, - cut_name_after_space, - process_name_intelligently, - process_similar_to, - numbers_of_chars_allowed_to_remove_if_not_found_in_map, - move_domain_numbers_at_end_to_middle ); + public static Map> parseMappingTable( final File mapping_table_file ) + throws IOException { + final Map> map = new HashMap>(); + BasicTable mapping_table = null; + mapping_table = BasicTableParser.parse( mapping_table_file, '\t', false, false ); + for( int row = 0; row < mapping_table.getNumberOfRows(); ++row ) { + final Map row_map = new HashMap(); + String name = null; + for( int col = 0; col < mapping_table.getNumberOfColumns(); ++col ) { + final String table_cell = mapping_table.getValue( col, row ); + if ( col == 0 ) { + name = table_cell; + } + else if ( table_cell != null ) { + final String key = table_cell.substring( 0, table_cell.indexOf( ':' ) ); + final String val = table_cell.substring( table_cell.indexOf( ':' ) + 1, table_cell.length() ); + row_map.put( key, val ); + } + } + map.put( name, row_map ); } + return map; } - public static void decorate( final Phylogeny[] phylogenies, - final Map map, - final FIELD field, - final boolean extract_bracketed_scientific_name, - final boolean picky, - final Map intermediate_map, - final boolean cut_name_after_space, - final boolean process_name_intelligently, - final boolean process_similar_to, - final int numbers_of_chars_allowed_to_remove_if_not_found_in_map, - final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException, - NHXFormatException, PhyloXmlDataFormatException { - for( int i = 0; i < phylogenies.length; ++i ) { - PhylogenyDecorator.decorate( phylogenies[ i ], - map, - field, - extract_bracketed_scientific_name, - picky, - intermediate_map, - cut_name_after_space, - process_name_intelligently, - process_similar_to, - numbers_of_chars_allowed_to_remove_if_not_found_in_map, - move_domain_numbers_at_end_to_middle ); + private final static String addTildeAnnotation( final String tilde_annotation, final String new_value ) { + if ( ForesterUtil.isEmpty( tilde_annotation ) ) { + return new_value; } + return new_value + tilde_annotation; } private static String deleteAtFirstSpace( final String name ) { @@ -417,16 +385,46 @@ public final class PhylogenyDecorator { return name; } - private static void extractBracketedScientificNames( final PhylogenyNode node, final String new_value ) { + private static String extractBracketedScientificNames( final PhylogenyNode node, final String new_value ) { final int i = new_value.lastIndexOf( "[" ); final String scientific_name = new_value.substring( i + 1, new_value.length() - 1 ); - AptxUtil.ensurePresenceOfTaxonomy( node ); + ForesterUtil.ensurePresenceOfTaxonomy( node ); node.getNodeData().getTaxonomy().setScientificName( scientific_name ); + return new_value.substring( 0, i - 1 ).trim(); + } + + private static String extractBracketedTaxCodes( final PhylogenyNode node, final String new_value ) { + final StringBuilder sb = new StringBuilder(); + sb.append( new_value ); + final String tc = extractBracketedTaxCodes( sb ); + if ( !ForesterUtil.isEmpty( tc ) ) { + ForesterUtil.ensurePresenceOfTaxonomy( node ); + try { + node.getNodeData().getTaxonomy().setTaxonomyCode( tc ); + } + catch ( final PhyloXmlDataFormatException e ) { + throw new IllegalArgumentException( "illegal format for taxonomy code: " + tc ); + } + return sb.toString().trim(); + } + return new_value; + } + + private static String extractBracketedTaxCodes( final StringBuilder sb ) { + final Matcher m = ParserUtils.TAXOMONY_CODE_PATTERN_BRACKETED.matcher( sb ); + if ( m.find() ) { + final String tc = m.group( 1 ); + sb.delete( m.start( 1 ) - 1, m.end( 1 ) + 1 ); + return tc; + } + return null; } - private static String extractIntermediate( final Map intermediate_map, final String name ) { + private static String extractIntermediate( final Map intermediate_map, + final String name, + final boolean verbose ) { String new_name = null; - if ( PhylogenyDecorator.VERBOSE ) { + if ( verbose ) { System.out.print( name + " => " ); } if ( intermediate_map.containsKey( name ) ) { @@ -438,98 +436,19 @@ public final class PhylogenyDecorator { else { throw new IllegalArgumentException( "\"" + name + "\" not found in name secondary map" ); } - if ( PhylogenyDecorator.VERBOSE ) { + if ( verbose ) { System.out.println( new_name + " " ); } return new_name; } - private static String moveDomainNumbersAtEnd( final String node_name ) { - final Matcher m = NODENAME_SEQNUMBER_TAXDOMAINNUMBER.matcher( node_name ); - if ( m.matches() ) { - final String seq_number = m.group( 1 ); - final String tax = m.group( 2 ); - final String domain_number = m.group( 3 ); - return seq_number + "_[" + domain_number + "]_" + tax; - } - else { - return node_name; - } - } - - public static Map> parseMappingTable( final File mapping_table_file ) - throws IOException { - final Map> map = new HashMap>(); - BasicTable mapping_table = null; - mapping_table = BasicTableParser.parse( mapping_table_file, "\t", false ); - for( int row = 0; row < mapping_table.getNumberOfRows(); ++row ) { - final Map row_map = new HashMap(); - String name = null; - for( int col = 0; col < mapping_table.getNumberOfColumns(); ++col ) { - final String table_cell = mapping_table.getValue( col, row ); - if ( col == 0 ) { - name = table_cell; - } - else if ( table_cell != null ) { - final String key = table_cell.substring( 0, table_cell.indexOf( ':' ) ); - final String val = table_cell.substring( table_cell.indexOf( ':' ) + 1, table_cell.length() ); - row_map.put( key, val ); - } - } - map.put( name, row_map ); - } - return map; - } - - private static String processNameIntelligently( final String name ) { - final String[] s = name.split( " " ); - if ( s.length < 2 ) { - return name; - } - else if ( ( s[ 0 ].indexOf( "_" ) > 0 ) && ( s[ 0 ].indexOf( "|" ) > 0 ) ) { - return s[ 0 ]; - } - else if ( ( s[ 1 ].indexOf( "_" ) > 0 ) && ( s[ 1 ].indexOf( "|" ) > 0 ) ) { - return s[ 1 ]; - } - else if ( ( s[ 0 ].indexOf( "_" ) > 0 ) && ( s[ 0 ].indexOf( "." ) > 0 ) ) { - return s[ 0 ]; - } - else if ( ( s[ 1 ].indexOf( "_" ) > 0 ) && ( s[ 1 ].indexOf( "." ) > 0 ) ) { - return s[ 1 ]; - } - else if ( s[ 0 ].indexOf( "_" ) > 0 ) { - return s[ 0 ]; - } - else if ( s[ 1 ].indexOf( "_" ) > 0 ) { - return s[ 1 ]; - } - else { - return s[ 0 ]; - } - } - - private static String processSimilarTo( final String name ) { - final int i = name.toLowerCase().indexOf( "similar to" ); - String similar_to = ""; - if ( i >= 0 ) { - similar_to = " similarity=" + name.substring( i + 10 ).trim(); - } - final String pi = processNameIntelligently( name ); - return pi + similar_to; - } - - private static String sanitize( String s ) { - s = s.replace( ' ', '_' ); - s = s.replace( '(', '{' ); - s = s.replace( ')', '}' ); - s = s.replace( '[', '{' ); - s = s.replace( ']', '}' ); - s = s.replace( ',', '_' ); - return s; - } - public static enum FIELD { - NODE_NAME, SEQUENCE_ANNOTATION_DESC, DOMAIN_STRUCTURE, TAXONOMY_CODE, TAXONOMY_SCIENTIFIC_NAME, SEQUENCE_NAME; + DOMAIN_STRUCTURE, + MOL_SEQ, + NODE_NAME, + SEQUENCE_ANNOTATION_DESC, + SEQUENCE_NAME, + TAXONOMY_CODE, + TAXONOMY_SCIENTIFIC_NAME; } }