// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
//
// Contact: phylosoft @ gmail . com
-// WWW: www.phylosoft.org/forester
+// WWW: https://sites.google.com/site/cmzmasek/home/software/forester
package org.forester.tools;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import org.forester.archaeopteryx.AptxUtil;
import org.forester.io.parsers.nhx.NHXFormatException;
import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
+import org.forester.io.parsers.util.ParserUtils;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.data.Accession;
import org.forester.phylogeny.data.Identifier;
import org.forester.phylogeny.data.Sequence;
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
+import org.forester.sequence.MolecularSequence.TYPE;
import org.forester.util.BasicTable;
import org.forester.util.BasicTableParser;
import org.forester.util.ForesterUtil;
public final class PhylogenyDecorator {
+ final private static String TP_NODE_NAME = "NODE_NAME";
+ final private static String TP_SEQ_ACCESSION = "SEQ_ACCESSION";
+ final private static String TP_SEQ_ACCESSION_SOURCE = "SEQ_ACCESSION_SOURCE";
+ final private static String TP_SEQ_ANNOTATION_DESC = "SEQ_ANNOTATION_DESC";
+ final private static String TP_SEQ_ANNOTATION_REF = "SEQ_ANNOTATION_REF";
+ final private static String TP_SEQ_MOL_SEQ = "SEQ_MOL_SEQ";
+ final private static String TP_SEQ_NAME = "SEQ_NAME";
+ final private static String TP_SEQ_SYMBOL = "SEQ_SYMBOL";
+ final private static String TP_TAXONOMY_CN = "TAXONOMY_CN";
// From evoruby/lib/evo/apps/tseq_taxonomy_processor.rb:
- final private static String TP_TAXONOMY_CODE = "TAXONOMY_CODE";
- final private static String TP_TAXONOMY_ID = "TAXONOMY_ID";
- final private static String TP_TAXONOMY_ID_PROVIDER = "TAXONOMY_ID_PROVIDER";
- final private static String TP_TAXONOMY_SN = "TAXONOMY_SN";
- final private static String TP_TAXONOMY_CN = "TAXONOMY_CN";
- final private static String TP_TAXONOMY_SYN = "TAXONOMY_SYN";
- final private static String TP_SEQ_SYMBOL = "SEQ_SYMBOL";
- final private static String TP_SEQ_ACCESSION = "SEQ_ACCESSION";
- final private static String TP_SEQ_ACCESSION_SOURCE = "SEQ_ACCESSION_SOURCE";
- final private static String TP_SEQ_ANNOTATION_DESC = "SEQ_ANNOTATION_DESC";
- final private static String TP_SEQ_ANNOTATION_REF = "SEQ_ANNOTATION_REF";
- final private static String TP_SEQ_MOL_SEQ = "SEQ_MOL_SEQ";
- final private static String TP_SEQ_NAME = "SEQ_NAME";
- final private static String TP_NODE_NAME = "NODE_NAME";
- final private static Pattern NODENAME_SEQNUMBER_TAXDOMAINNUMBER = Pattern
- .compile( "^([a-fA-Z0-9]{1,5})_([A-Z0-9]{2,4}[A-Z])(\\d{1,4})$" );
- public final static boolean SANITIZE = false;
- public final static boolean VERBOSE = true;
- private static final boolean CUT = true;
+ final private static String TP_TAXONOMY_CODE = "TAXONOMY_CODE";
+ final private static String TP_TAXONOMY_ID = "TAXONOMY_ID";
+ final private static String TP_TAXONOMY_ID_PROVIDER = "TAXONOMY_ID_PROVIDER";
+ final private static String TP_TAXONOMY_SN = "TAXONOMY_SN";
+ final private static String TP_TAXONOMY_SYN = "TAXONOMY_SYN";
private PhylogenyDecorator() {
// Not needed.
public static void decorate( final Phylogeny phylogeny,
final Map<String, Map<String, String>> map,
- final boolean picky,
- final int numbers_of_chars_allowed_to_remove_if_not_found_in_map )
- throws IllegalArgumentException, PhyloXmlDataFormatException {
+ final boolean picky ) throws IllegalArgumentException, PhyloXmlDataFormatException {
for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
final PhylogenyNode node = iter.next();
final String name = node.getName();
if ( !ForesterUtil.isEmpty( name ) ) {
- if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) {
- Map<String, String> new_values = map.get( name );
- int x = 0;
- while ( ( new_values == null ) && ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 )
- && ( x <= numbers_of_chars_allowed_to_remove_if_not_found_in_map ) ) {
- new_values = map.get( name.substring( 0, name.length() - x ) );
- ++x;
- }
+ if ( map.containsKey( name ) ) {
+ final Map<String, String> new_values = map.get( name );
if ( new_values != null ) {
if ( new_values.containsKey( TP_TAXONOMY_CODE ) ) {
- AptxUtil.ensurePresenceOfTaxonomy( node );
+ ForesterUtil.ensurePresenceOfTaxonomy( node );
node.getNodeData().getTaxonomy().setTaxonomyCode( new_values.get( TP_TAXONOMY_CODE ) );
}
if ( new_values.containsKey( TP_TAXONOMY_ID )
&& new_values.containsKey( TP_TAXONOMY_ID_PROVIDER ) ) {
- AptxUtil.ensurePresenceOfTaxonomy( node );
+ ForesterUtil.ensurePresenceOfTaxonomy( node );
node.getNodeData()
- .getTaxonomy()
- .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ),
- new_values.get( TP_TAXONOMY_ID_PROVIDER ) ) );
+ .getTaxonomy()
+ .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ),
+ new_values.get( TP_TAXONOMY_ID_PROVIDER ) ) );
}
else if ( new_values.containsKey( TP_TAXONOMY_ID ) ) {
- AptxUtil.ensurePresenceOfTaxonomy( node );
+ ForesterUtil.ensurePresenceOfTaxonomy( node );
node.getNodeData().getTaxonomy()
- .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ) ) );
+ .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ) ) );
}
if ( new_values.containsKey( TP_TAXONOMY_SN ) ) {
- AptxUtil.ensurePresenceOfTaxonomy( node );
+ ForesterUtil.ensurePresenceOfTaxonomy( node );
node.getNodeData().getTaxonomy().setScientificName( new_values.get( TP_TAXONOMY_SN ) );
}
if ( new_values.containsKey( TP_TAXONOMY_CN ) ) {
- AptxUtil.ensurePresenceOfTaxonomy( node );
+ ForesterUtil.ensurePresenceOfTaxonomy( node );
node.getNodeData().getTaxonomy().setCommonName( new_values.get( TP_TAXONOMY_CN ) );
}
if ( new_values.containsKey( TP_TAXONOMY_SYN ) ) {
- AptxUtil.ensurePresenceOfTaxonomy( node );
+ ForesterUtil.ensurePresenceOfTaxonomy( node );
node.getNodeData().getTaxonomy().getSynonyms().add( new_values.get( TP_TAXONOMY_SYN ) );
}
if ( new_values.containsKey( TP_SEQ_ACCESSION )
&& new_values.containsKey( TP_SEQ_ACCESSION_SOURCE ) ) {
- AptxUtil.ensurePresenceOfSequence( node );
+ ForesterUtil.ensurePresenceOfSequence( node );
node.getNodeData()
- .getSequence()
- .setAccession( new Accession( new_values.get( TP_SEQ_ACCESSION ),
- new_values.get( TP_SEQ_ACCESSION_SOURCE ) ) );
+ .getSequence()
+ .setAccession( new Accession( new_values.get( TP_SEQ_ACCESSION ),
+ new_values.get( TP_SEQ_ACCESSION_SOURCE ) ) );
}
if ( new_values.containsKey( TP_SEQ_ANNOTATION_DESC ) ) {
- AptxUtil.ensurePresenceOfSequence( node );
- final Annotation ann = new Annotation( "?" );
+ ForesterUtil.ensurePresenceOfSequence( node );
+ final Annotation ann = new Annotation();
ann.setDesc( new_values.get( TP_SEQ_ANNOTATION_DESC ) );
node.getNodeData().getSequence().addAnnotation( ann );
}
if ( new_values.containsKey( TP_SEQ_ANNOTATION_REF ) ) {
- AptxUtil.ensurePresenceOfSequence( node );
+ ForesterUtil.ensurePresenceOfSequence( node );
final Annotation ann = new Annotation( new_values.get( TP_SEQ_ANNOTATION_REF ) );
node.getNodeData().getSequence().addAnnotation( ann );
}
if ( new_values.containsKey( TP_SEQ_SYMBOL ) ) {
- AptxUtil.ensurePresenceOfSequence( node );
+ ForesterUtil.ensurePresenceOfSequence( node );
node.getNodeData().getSequence().setSymbol( new_values.get( TP_SEQ_SYMBOL ) );
}
if ( new_values.containsKey( TP_SEQ_NAME ) ) {
- AptxUtil.ensurePresenceOfSequence( node );
+ ForesterUtil.ensurePresenceOfSequence( node );
node.getNodeData().getSequence().setName( new_values.get( TP_SEQ_NAME ) );
}
if ( new_values.containsKey( TP_SEQ_MOL_SEQ ) ) {
- AptxUtil.ensurePresenceOfSequence( node );
+ ForesterUtil.ensurePresenceOfSequence( node );
node.getNodeData().getSequence().setMolecularSequence( new_values.get( TP_SEQ_MOL_SEQ ) );
}
if ( new_values.containsKey( TP_NODE_NAME ) ) {
node.setName( new_values.get( TP_NODE_NAME ) );
}
- } // if ( new_values != null )
+ } // if ( new_values != null )
} // if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) )
else if ( picky ) {
throw new IllegalArgumentException( "\"" + name + "\" not found in name map" );
}
}
- /**
- *
- *
- *
- *
- *
- * @param phylogeny
- * @param map
- * maps names (in phylogeny) to new values
- * @param field
- * @param picky
- * @throws IllegalArgumentException
- * @throws NHXFormatException
- * @throws PhyloXmlDataFormatException
- */
- public static void decorate( final Phylogeny phylogeny,
- final Map<String, String> map,
- final FIELD field,
- final boolean extract_bracketed_scientific_name,
- final boolean picky,
- final boolean cut_name_after_space,
- final boolean process_name_intelligently,
- final boolean process_similar_to,
- final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
- final boolean move_domain_numbers_at_end_to_middle,
- final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
- PhyloXmlDataFormatException {
- PhylogenyDecorator.decorate( phylogeny,
- map,
- field,
- extract_bracketed_scientific_name,
- picky,
- null,
- cut_name_after_space,
- process_name_intelligently,
- process_similar_to,
- numbers_of_chars_allowed_to_remove_if_not_found_in_map,
- move_domain_numbers_at_end_to_middle,
- trim_after_tilde );
+ /**
+ * Convenience overload for decorating without an intermediate map: forwards all
+ * arguments unchanged to the overload that accepts an intermediate map, passing
+ * null for that map.
+ *
+ * @param phylogeny the phylogeny whose nodes are to be decorated
+ * @param map maps names (in phylogeny) to new values
+ * @param field the node data field the new values are written to
+ * @return the String returned by the delegated-to overload
+ * @throws IllegalArgumentException
+ * @throws NHXFormatException
+ * @throws PhyloXmlDataFormatException
+ */
+ public static String decorate( final Phylogeny phylogeny,
+ final Map<String, String> map,
+ final FIELD field,
+ final boolean extract_bracketed_scientific_name,
+ final boolean extract_bracketed_tax_code,
+ final boolean picky,
+ final boolean cut_name_after_space,
+ final boolean trim_after_tilde,
+ final boolean verbose ) throws IllegalArgumentException, NHXFormatException,
+ PhyloXmlDataFormatException {
+ return PhylogenyDecorator.decorate( phylogeny,
+ map,
+ field,
+ extract_bracketed_scientific_name,
+ extract_bracketed_tax_code,
+ picky,
+ null,
+ cut_name_after_space,
+ trim_after_tilde,
+ verbose );
}
/**
- *
- *
- *
+ *
+ *
+ *
* @param phylogeny
* @param map
* maps names (in phylogeny) to new values if intermediate_map is
* @param intermediate_map
* maps name (in phylogeny) to a intermediate value
* @throws IllegalArgumentException
- * @throws PhyloXmlDataFormatException
+ * @throws PhyloXmlDataFormatException
*/
- public static void decorate( final Phylogeny phylogeny,
- final Map<String, String> map,
- final FIELD field,
- final boolean extract_bracketed_scientific_name,
- final boolean picky,
- final Map<String, String> intermediate_map,
- final boolean cut_name_after_space,
- final boolean process_name_intelligently,
- final boolean process_similar_to,
- final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
- final boolean move_domain_numbers_at_end_to_middle,
- final boolean trim_after_tilde ) throws IllegalArgumentException,
- PhyloXmlDataFormatException {
+ public static String decorate( final Phylogeny phylogeny,
+ final Map<String, String> map,
+ final FIELD field,
+ final boolean extract_bracketed_scientific_name,
+ final boolean extract_bracketed_tax_code,
+ final boolean picky,
+ final Map<String, String> intermediate_map,
+ final boolean cut_name_after_space,
+ final boolean trim_after_tilde,
+ final boolean verbose ) throws IllegalArgumentException, PhyloXmlDataFormatException {
if ( extract_bracketed_scientific_name && ( field == FIELD.TAXONOMY_SCIENTIFIC_NAME ) ) {
throw new IllegalArgumentException( "attempt to extract bracketed scientific name together with data field pointing to scientific name" );
}
+ if ( map.isEmpty() ) {
+ throw new IllegalArgumentException( "map is empty" );
+ }
+ int ext_nodes = 0;
+ int ext_nodes_updated = 0;
+ int int_nodes = 0;
+ int int_nodes_updated = 0;
for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
final PhylogenyNode node = iter.next();
+ if ( node.isExternal() ) {
+ ++ext_nodes;
+ }
+ else {
+ ++int_nodes;
+ }
String name = node.getName();
+ if ( picky && node.isExternal() && ForesterUtil.isEmpty( name ) ) {
+ throw new IllegalArgumentException( "external node with no name present" );
+ }
+ String tilde_annotation = null;
+ final String orig_name = name;
if ( trim_after_tilde && ( name.indexOf( '~' ) > 0 ) ) {
- name = name.substring( 0, name.indexOf( '~' ) );
+ final int ti = name.indexOf( '~' );
+ tilde_annotation = name.substring( ti );
+ name = name.substring( 0, ti );
+ if ( node.isExternal() && ForesterUtil.isEmpty( name ) ) {
+ throw new IllegalArgumentException( "external node with illegal name: " + orig_name );
+ }
}
if ( !ForesterUtil.isEmpty( name ) ) {
if ( intermediate_map != null ) {
- name = PhylogenyDecorator.extractIntermediate( intermediate_map, name );
+ name = PhylogenyDecorator.extractIntermediate( intermediate_map, name, verbose );
}
- // int space_index = name.indexOf( " " );
- // if ( CUT && space_index > 0 ) {
- // int y = name.lastIndexOf( "|" );
- // name = name.substring( y + 1, space_index );
- // }
- // String new_value = null;
- // for( String key : map.keySet() ) {
- // if ( key.indexOf( name ) >= 0 ) {
- // if ( new_value == null ) {
- // new_value = map.get( key );
- // }
- // else {
- // System.out.println( name + " is not unique" );
- // System.exit( -1 );
- // }
- // }
- // }
- // if ( new_value != null ) {
- if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) {
- String new_value = map.get( name );
- int x = 0;
- while ( ( new_value == null ) && ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 )
- && ( x <= numbers_of_chars_allowed_to_remove_if_not_found_in_map ) ) {
- new_value = map.get( name.substring( 0, name.length() - x ) );
- ++x;
- }
- if ( new_value != null ) {
- new_value = new_value.trim();
- new_value.replaceAll( "/\\s+/", " " );
+ if ( ( field == FIELD.MOL_SEQ ) && !map.containsKey( name ) ) {
+ name = orig_name;
+ }
+ if ( map.containsKey( name ) ) {
+ // A key may be present but mapped to null; treat that like an empty value so the
+ // "maps to empty value" branch below reports it instead of a NullPointerException.
+ final String raw_value = map.get( name );
+ // Collapse runs of whitespace into one blank. The former pattern "/\\s+/" was a
+ // Perl-style literal: Java matched the slashes literally, so nothing was collapsed.
+ String new_value = ( raw_value == null ) ? "" : raw_value.trim().replaceAll( "\\s+", " " );
+ if ( !ForesterUtil.isEmpty( new_value ) ) {
+ if ( node.isExternal() ) {
+ ++ext_nodes_updated;
+ }
+ else {
+ ++int_nodes_updated;
+ }
if ( extract_bracketed_scientific_name && new_value.endsWith( "]" ) ) {
new_value = extractBracketedScientificNames( node, new_value );
}
+ else if ( extract_bracketed_tax_code ) {
+ if ( ParserUtils.TAXOMONY_CODE_PATTERN_BRACKETED.matcher( new_value ).find() ) {
+ new_value = extractBracketedTaxCodes( node, new_value );
+ }
+ else if ( picky ) {
+ throw new IllegalArgumentException( " could not get taxonomy from \"" + new_value
+ + "\"" );
+ }
+ }
switch ( field ) {
+ case MOL_SEQ:
+ if ( verbose ) {
+ System.out.println( name + ": " + new_value );
+ }
+ if ( !node.getNodeData().isHasSequence() ) {
+ node.getNodeData().setSequence( new Sequence() );
+ }
+ node.getNodeData().getSequence().setMolecularSequence( new_value );
+ final TYPE type = ForesterUtil.guessMolecularSequenceType( new_value );
+ if ( type != null ) {
+ if ( type == TYPE.AA ) {
+ node.getNodeData().getSequence().setType( "protein" );
+ }
+ else if ( type == TYPE.DNA ) {
+ node.getNodeData().getSequence().setType( "dna" );
+ }
+ else if ( type == TYPE.RNA ) {
+ node.getNodeData().getSequence().setType( "rna" );
+ }
+ }
+ break;
case SEQUENCE_ANNOTATION_DESC:
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.println( name + ": " + new_value );
}
if ( !node.getNodeData().isHasSequence() ) {
node.getNodeData().setSequence( new Sequence() );
}
- final Annotation annotation = new Annotation( "?" );
+ final Annotation annotation = new Annotation();
annotation.setDesc( new_value );
node.getNodeData().getSequence().addAnnotation( annotation );
break;
case DOMAIN_STRUCTURE:
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.println( name + ": " + new_value );
}
if ( !node.getNodeData().isHasSequence() ) {
node.getNodeData().setSequence( new Sequence() );
}
node.getNodeData().getSequence()
- .setDomainArchitecture( new DomainArchitecture( new_value ) );
+ .setDomainArchitecture( new DomainArchitecture( new_value ) );
break;
case TAXONOMY_CODE:
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.println( name + ": " + new_value );
}
- AptxUtil.ensurePresenceOfTaxonomy( node );
+ ForesterUtil.ensurePresenceOfTaxonomy( node );
node.getNodeData().getTaxonomy().setTaxonomyCode( new_value );
break;
case TAXONOMY_SCIENTIFIC_NAME:
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.println( name + ": " + new_value );
}
- AptxUtil.ensurePresenceOfTaxonomy( node );
+ ForesterUtil.ensurePresenceOfTaxonomy( node );
node.getNodeData().getTaxonomy().setScientificName( new_value );
break;
case SEQUENCE_NAME:
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( trim_after_tilde ) {
+ new_value = addTildeAnnotation( tilde_annotation, new_value );
+ }
+ if ( verbose ) {
System.out.println( name + ": " + new_value );
}
if ( !node.getNodeData().isHasSequence() ) {
node.getNodeData().getSequence().setName( new_value );
break;
case NODE_NAME:
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.print( name + " -> " );
}
if ( cut_name_after_space ) {
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.print( new_value + " -> " );
}
new_value = PhylogenyDecorator.deleteAtFirstSpace( new_value );
}
- else if ( process_name_intelligently ) {
- if ( PhylogenyDecorator.VERBOSE ) {
- System.out.print( new_value + " -> " );
- }
- new_value = PhylogenyDecorator.processNameIntelligently( new_value );
- }
- else if ( process_similar_to ) {
- if ( PhylogenyDecorator.VERBOSE ) {
- System.out.print( new_value + " -> " );
- }
- new_value = PhylogenyDecorator.processSimilarTo( new_value );
- }
- if ( PhylogenyDecorator.SANITIZE ) {
- new_value = PhylogenyDecorator.sanitize( new_value );
+ if ( trim_after_tilde ) {
+ new_value = addTildeAnnotation( tilde_annotation, new_value );
}
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.println( new_value );
}
node.setName( new_value );
default:
throw new RuntimeException( "unknown field \"" + field + "\"" );
}
- if ( move_domain_numbers_at_end_to_middle && ( field != FIELD.NODE_NAME ) ) {
- node.setName( moveDomainNumbersAtEnd( node.getName() ) );
- }
+ }
+ else {
+ throw new IllegalArgumentException( "node name \"" + name + "\" maps to empty value" );
}
}
else if ( picky ) {
- throw new IllegalArgumentException( "\"" + name + "\" not found in name map" );
+ throw new IllegalArgumentException( "node name \"" + name + "\" not found in map" );
}
}
}
+ return "updated " + ext_nodes_updated + "/" + ext_nodes + " external nodes, updated " + int_nodes_updated + "/"
+ + int_nodes + " internal nodes";
}
- public static void decorate( final Phylogeny[] phylogenies,
- final Map<String, Map<String, String>> map,
- final boolean picky,
- final int numbers_of_chars_allowed_to_remove_if_not_found_in_map )
- throws IllegalArgumentException, NHXFormatException, PhyloXmlDataFormatException {
- for( int i = 0; i < phylogenies.length; ++i ) {
- PhylogenyDecorator.decorate( phylogenies[ i ],
- map,
- picky,
- numbers_of_chars_allowed_to_remove_if_not_found_in_map );
- }
- }
-
- public static void decorate( final Phylogeny[] phylogenies,
- final Map<String, String> map,
- final FIELD field,
- final boolean extract_bracketed_scientific_name,
- final boolean picky,
- final boolean cut_name_after_space,
- final boolean process_name_intelligently,
- final boolean process_similar_to,
- final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
- final boolean move_domain_numbers_at_end_to_middle,
- final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
- PhyloXmlDataFormatException {
- for( int i = 0; i < phylogenies.length; ++i ) {
- PhylogenyDecorator.decorate( phylogenies[ i ],
- map,
- field,
- extract_bracketed_scientific_name,
- picky,
- cut_name_after_space,
- process_name_intelligently,
- process_similar_to,
- numbers_of_chars_allowed_to_remove_if_not_found_in_map,
- move_domain_numbers_at_end_to_middle,
- trim_after_tilde );
+ /**
+ * Reads a tab-separated mapping table into a map of maps.
+ * The first column of each row is used as the outer key; every other non-null
+ * cell must have the form "KEY:VALUE" and is split at its first ':' into an
+ * entry of that row's inner map.
+ *
+ * @param mapping_table_file the tab-separated table to read
+ * @return a map from first-column value to the KEY/VALUE pairs of that row
+ * @throws IOException if parsing fails or a cell contains no ':' separator
+ */
+ public static Map<String, Map<String, String>> parseMappingTable( final File mapping_table_file )
+ throws IOException {
+ final Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
+ final BasicTable<String> mapping_table = BasicTableParser.parse( mapping_table_file, '\t', false, false );
+ for( int row = 0; row < mapping_table.getNumberOfRows(); ++row ) {
+ final Map<String, String> row_map = new HashMap<String, String>();
+ String name = null;
+ for( int col = 0; col < mapping_table.getNumberOfColumns(); ++col ) {
+ final String table_cell = mapping_table.getValue( col, row );
+ if ( col == 0 ) {
+ name = table_cell;
+ }
+ else if ( table_cell != null ) {
+ // Locate the separator once; without it substring( 0, -1 ) would fail with
+ // an uninformative StringIndexOutOfBoundsException.
+ final int colon = table_cell.indexOf( ':' );
+ if ( colon < 0 ) {
+ throw new IOException( "illegal format in mapping table (row " + row + ", column " + col
+ + "): expected \"KEY:VALUE\" but found \"" + table_cell + "\"" );
+ }
+ final String key = table_cell.substring( 0, colon );
+ final String val = table_cell.substring( colon + 1 );
+ row_map.put( key, val );
+ }
+ }
+ map.put( name, row_map );
}
+ return map;
}
- public static void decorate( final Phylogeny[] phylogenies,
- final Map<String, String> map,
- final FIELD field,
- final boolean extract_bracketed_scientific_name,
- final boolean picky,
- final Map<String, String> intermediate_map,
- final boolean cut_name_after_space,
- final boolean process_name_intelligently,
- final boolean process_similar_to,
- final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
- final boolean move_domain_numbers_at_end_to_middle,
- final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
- PhyloXmlDataFormatException {
- for( int i = 0; i < phylogenies.length; ++i ) {
- PhylogenyDecorator.decorate( phylogenies[ i ],
- map,
- field,
- extract_bracketed_scientific_name,
- picky,
- intermediate_map,
- cut_name_after_space,
- process_name_intelligently,
- process_similar_to,
- numbers_of_chars_allowed_to_remove_if_not_found_in_map,
- move_domain_numbers_at_end_to_middle,
- trim_after_tilde );
+ /**
+ * Appends a previously cut-off tilde suffix to a replacement value.
+ *
+ * @param tilde_annotation the suffix to re-attach; the visible caller passes the part
+ * of the original node name starting at '~', or null when nothing was cut
+ * off (presumably ForesterUtil.isEmpty treats null as empty -- confirm)
+ * @param new_value the replacement value
+ * @return new_value unchanged if ForesterUtil.isEmpty( tilde_annotation ) is true,
+ * otherwise new_value directly followed by tilde_annotation
+ */
+ private final static String addTildeAnnotation( final String tilde_annotation, final String new_value ) {
+ if ( ForesterUtil.isEmpty( tilde_annotation ) ) {
+ return new_value;
}
+ return new_value + tilde_annotation;
}
private static String deleteAtFirstSpace( final String name ) {
+ /**
+ * Moves a trailing bracketed scientific name, e.g. "some name [Homo sapiens]",
+ * from a value into the taxonomy of a node.
+ * The text between the last '[' and the final character is stored as the node's
+ * scientific name. The visible caller only invokes this for values ending in ']'.
+ *
+ * @param node the node whose taxonomy receives the scientific name
+ * @param new_value the value to extract the bracketed name from
+ * @return the trimmed text preceding the last '[' (empty if the value starts with
+ * it), or new_value unchanged if it contains no '['
+ */
private static String extractBracketedScientificNames( final PhylogenyNode node, final String new_value ) {
final int i = new_value.lastIndexOf( "[" );
+ if ( i < 0 ) {
+ // No opening bracket: nothing to extract, and the substring calls below would fail.
+ return new_value;
+ }
final String scientific_name = new_value.substring( i + 1, new_value.length() - 1 );
- AptxUtil.ensurePresenceOfTaxonomy( node );
+ ForesterUtil.ensurePresenceOfTaxonomy( node );
node.getNodeData().getTaxonomy().setScientificName( scientific_name );
- return new_value.substring( 0, i - 1 ).trim();
+ // Cut exactly at the bracket and let trim() drop the separating blank; cutting at
+ // i - 1 threw for i == 0 and lost a character when no blank preceded the bracket.
+ return new_value.substring( 0, i ).trim();
}
- private static String extractIntermediate( final Map<String, String> intermediate_map, final String name ) {
+ /**
+ * Extracts a bracketed taxonomy code from a value and stores it in the taxonomy
+ * of a node.
+ *
+ * @param node the node whose taxonomy receives the extracted code
+ * @param new_value the value to search for a bracketed taxonomy code
+ * @return the trimmed value with the matched code removed, or new_value unchanged
+ * if no code was found
+ * @throws IllegalArgumentException if the taxonomy rejects the extracted code
+ */
+ private static String extractBracketedTaxCodes( final PhylogenyNode node, final String new_value ) {
+ final StringBuilder sb = new StringBuilder();
+ sb.append( new_value );
+ // The StringBuilder overload removes the matched code from sb as a side effect.
+ final String tc = extractBracketedTaxCodes( sb );
+ if ( !ForesterUtil.isEmpty( tc ) ) {
+ ForesterUtil.ensurePresenceOfTaxonomy( node );
+ try {
+ node.getNodeData().getTaxonomy().setTaxonomyCode( tc );
+ }
+ catch ( final PhyloXmlDataFormatException e ) {
+ // Keep the original exception as the cause so the reason for the rejection is not lost.
+ throw new IllegalArgumentException( "illegal format for taxonomy code: " + tc, e );
+ }
+ return sb.toString().trim();
+ }
+ return new_value;
+ }
+
+ /**
+ * Finds the first match of ParserUtils.TAXOMONY_CODE_PATTERN_BRACKETED in sb,
+ * deletes capturing group 1 together with one character on either side of it
+ * (presumably the enclosing brackets -- confirm against the pattern), and
+ * returns the text of group 1.
+ *
+ * @param sb the text to search; modified in place when a match is found
+ * @return the text of capturing group 1, or null if the pattern does not match
+ */
+ private static String extractBracketedTaxCodes( final StringBuilder sb ) {
+ final Matcher matcher = ParserUtils.TAXOMONY_CODE_PATTERN_BRACKETED.matcher( sb );
+ if ( !matcher.find() ) {
+ return null;
+ }
+ // Read everything needed from the matcher before sb is altered.
+ final String tax_code = matcher.group( 1 );
+ final int from = matcher.start( 1 ) - 1;
+ final int to = matcher.end( 1 ) + 1;
+ sb.delete( from, to );
+ return tax_code;
+ }
+
+ private static String extractIntermediate( final Map<String, String> intermediate_map,
+ final String name,
+ final boolean verbose ) {
String new_name = null;
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.print( name + " => " );
}
if ( intermediate_map.containsKey( name ) ) {
else {
throw new IllegalArgumentException( "\"" + name + "\" not found in name secondary map" );
}
- if ( PhylogenyDecorator.VERBOSE ) {
+ if ( verbose ) {
System.out.println( new_name + " " );
}
return new_name;
}
- private static String moveDomainNumbersAtEnd( final String node_name ) {
- final Matcher m = NODENAME_SEQNUMBER_TAXDOMAINNUMBER.matcher( node_name );
- if ( m.matches() ) {
- final String seq_number = m.group( 1 );
- final String tax = m.group( 2 );
- final String domain_number = m.group( 3 );
- return seq_number + "_[" + domain_number + "]_" + tax;
- }
- else {
- return node_name;
- }
- }
-
- public static Map<String, Map<String, String>> parseMappingTable( final File mapping_table_file )
- throws IOException {
- final Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
- BasicTable<String> mapping_table = null;
- mapping_table = BasicTableParser.parse( mapping_table_file, "\t", false, false );
- for( int row = 0; row < mapping_table.getNumberOfRows(); ++row ) {
- final Map<String, String> row_map = new HashMap<String, String>();
- String name = null;
- for( int col = 0; col < mapping_table.getNumberOfColumns(); ++col ) {
- final String table_cell = mapping_table.getValue( col, row );
- if ( col == 0 ) {
- name = table_cell;
- }
- else if ( table_cell != null ) {
- final String key = table_cell.substring( 0, table_cell.indexOf( ':' ) );
- final String val = table_cell.substring( table_cell.indexOf( ':' ) + 1, table_cell.length() );
- row_map.put( key, val );
- }
- }
- map.put( name, row_map );
- }
- return map;
- }
-
- private static String processNameIntelligently( final String name ) {
- final String[] s = name.split( " " );
- if ( s.length < 2 ) {
- return name;
- }
- else if ( ( s[ 0 ].indexOf( "_" ) > 0 ) && ( s[ 0 ].indexOf( "|" ) > 0 ) ) {
- return s[ 0 ];
- }
- else if ( ( s[ 1 ].indexOf( "_" ) > 0 ) && ( s[ 1 ].indexOf( "|" ) > 0 ) ) {
- return s[ 1 ];
- }
- else if ( ( s[ 0 ].indexOf( "_" ) > 0 ) && ( s[ 0 ].indexOf( "." ) > 0 ) ) {
- return s[ 0 ];
- }
- else if ( ( s[ 1 ].indexOf( "_" ) > 0 ) && ( s[ 1 ].indexOf( "." ) > 0 ) ) {
- return s[ 1 ];
- }
- else if ( s[ 0 ].indexOf( "_" ) > 0 ) {
- return s[ 0 ];
- }
- else if ( s[ 1 ].indexOf( "_" ) > 0 ) {
- return s[ 1 ];
- }
- else {
- return s[ 0 ];
- }
- }
-
- private static String processSimilarTo( final String name ) {
- final int i = name.toLowerCase().indexOf( "similar to" );
- String similar_to = "";
- if ( i >= 0 ) {
- similar_to = " similarity=" + name.substring( i + 10 ).trim();
- }
- final String pi = processNameIntelligently( name );
- return pi + similar_to;
- }
-
- private static String sanitize( String s ) {
- s = s.replace( ' ', '_' );
- s = s.replace( '(', '{' );
- s = s.replace( ')', '}' );
- s = s.replace( '[', '{' );
- s = s.replace( ']', '}' );
- s = s.replace( ',', '_' );
- return s;
- }
-
+ /**
+ * Selects the node data field into which the single-value decorate methods
+ * write the mapped value.
+ */
public static enum FIELD {
- NODE_NAME, SEQUENCE_ANNOTATION_DESC, DOMAIN_STRUCTURE, TAXONOMY_CODE, TAXONOMY_SCIENTIFIC_NAME, SEQUENCE_NAME;
+ // Value is set as the domain architecture of the node's sequence.
+ DOMAIN_STRUCTURE,
+ // Value is set as the molecular sequence; the sequence type is set to
+ // "protein", "dna" or "rna" when it can be guessed.
+ MOL_SEQ,
+ // Value becomes the node name.
+ NODE_NAME,
+ // Value is added to the node's sequence as the description of a new annotation.
+ SEQUENCE_ANNOTATION_DESC,
+ // Value is set as the name of the node's sequence.
+ SEQUENCE_NAME,
+ // Value is set as the taxonomy code of the node's taxonomy.
+ TAXONOMY_CODE,
+ // Value is set as the scientific name of the node's taxonomy.
+ TAXONOMY_SCIENTIFIC_NAME;
}
}