2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.tools;
29 import java.io.IOException;
30 import java.util.HashMap;
33 import org.forester.io.parsers.nhx.NHXFormatException;
34 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
35 import org.forester.phylogeny.Phylogeny;
36 import org.forester.phylogeny.PhylogenyNode;
37 import org.forester.phylogeny.data.Accession;
38 import org.forester.phylogeny.data.Annotation;
39 import org.forester.phylogeny.data.DomainArchitecture;
40 import org.forester.phylogeny.data.Identifier;
41 import org.forester.phylogeny.data.Sequence;
42 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
43 import org.forester.util.BasicTable;
44 import org.forester.util.BasicTableParser;
45 import org.forester.util.ForesterUtil;
47 public final class PhylogenyDecorator {
49 // From evoruby/lib/evo/apps/tseq_taxonomy_processor.rb:
50 final private static String TP_TAXONOMY_CODE = "TAXONOMY_CODE";
51 final private static String TP_TAXONOMY_ID = "TAXONOMY_ID";
52 final private static String TP_TAXONOMY_ID_PROVIDER = "TAXONOMY_ID_PROVIDER";
53 final private static String TP_TAXONOMY_SN = "TAXONOMY_SN";
54 final private static String TP_TAXONOMY_CN = "TAXONOMY_CN";
55 final private static String TP_TAXONOMY_SYN = "TAXONOMY_SYN";
56 final private static String TP_SEQ_SYMBOL = "SEQ_SYMBOL";
57 final private static String TP_SEQ_ACCESSION = "SEQ_ACCESSION";
58 final private static String TP_SEQ_ACCESSION_SOURCE = "SEQ_ACCESSION_SOURCE";
59 final private static String TP_SEQ_ANNOTATION_DESC = "SEQ_ANNOTATION_DESC";
60 final private static String TP_SEQ_ANNOTATION_REF = "SEQ_ANNOTATION_REF";
61 final private static String TP_SEQ_MOL_SEQ = "SEQ_MOL_SEQ";
62 final private static String TP_SEQ_NAME = "SEQ_NAME";
63 final private static String TP_NODE_NAME = "NODE_NAME";
64 public final static boolean SANITIZE = false;
65 public final static boolean VERBOSE = true;
/**
 * Private on purpose: every member of this class is static, so no instance
 * is ever needed.
 */
private PhylogenyDecorator() {
    // Intentionally empty.
}
71 public static void decorate( final Phylogeny phylogeny,
72 final Map<String, Map<String, String>> map,
74 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map )
75 throws IllegalArgumentException, PhyloXmlDataFormatException {
76 for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
77 final PhylogenyNode node = iter.next();
78 final String name = node.getName();
79 if ( !ForesterUtil.isEmpty( name ) ) {
80 if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) {
81 Map<String, String> new_values = map.get( name );
83 while ( ( new_values == null ) && ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 )
84 && ( x <= numbers_of_chars_allowed_to_remove_if_not_found_in_map ) ) {
85 new_values = map.get( name.substring( 0, name.length() - x ) );
88 if ( new_values != null ) {
89 if ( new_values.containsKey( TP_TAXONOMY_CODE ) ) {
90 ForesterUtil.ensurePresenceOfTaxonomy( node );
91 node.getNodeData().getTaxonomy().setTaxonomyCode( new_values.get( TP_TAXONOMY_CODE ) );
93 if ( new_values.containsKey( TP_TAXONOMY_ID )
94 && new_values.containsKey( TP_TAXONOMY_ID_PROVIDER ) ) {
95 ForesterUtil.ensurePresenceOfTaxonomy( node );
98 .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ),
99 new_values.get( TP_TAXONOMY_ID_PROVIDER ) ) );
101 else if ( new_values.containsKey( TP_TAXONOMY_ID ) ) {
102 ForesterUtil.ensurePresenceOfTaxonomy( node );
103 node.getNodeData().getTaxonomy()
104 .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ) ) );
106 if ( new_values.containsKey( TP_TAXONOMY_SN ) ) {
107 ForesterUtil.ensurePresenceOfTaxonomy( node );
108 node.getNodeData().getTaxonomy().setScientificName( new_values.get( TP_TAXONOMY_SN ) );
110 if ( new_values.containsKey( TP_TAXONOMY_CN ) ) {
111 ForesterUtil.ensurePresenceOfTaxonomy( node );
112 node.getNodeData().getTaxonomy().setCommonName( new_values.get( TP_TAXONOMY_CN ) );
114 if ( new_values.containsKey( TP_TAXONOMY_SYN ) ) {
115 ForesterUtil.ensurePresenceOfTaxonomy( node );
116 node.getNodeData().getTaxonomy().getSynonyms().add( new_values.get( TP_TAXONOMY_SYN ) );
118 if ( new_values.containsKey( TP_SEQ_ACCESSION )
119 && new_values.containsKey( TP_SEQ_ACCESSION_SOURCE ) ) {
120 ForesterUtil.ensurePresenceOfSequence( node );
123 .setAccession( new Accession( new_values.get( TP_SEQ_ACCESSION ),
124 new_values.get( TP_SEQ_ACCESSION_SOURCE ) ) );
126 if ( new_values.containsKey( TP_SEQ_ANNOTATION_DESC ) ) {
127 ForesterUtil.ensurePresenceOfSequence( node );
128 final Annotation ann = new Annotation();
129 ann.setDesc( new_values.get( TP_SEQ_ANNOTATION_DESC ) );
130 node.getNodeData().getSequence().addAnnotation( ann );
132 if ( new_values.containsKey( TP_SEQ_ANNOTATION_REF ) ) {
133 ForesterUtil.ensurePresenceOfSequence( node );
134 final Annotation ann = new Annotation( new_values.get( TP_SEQ_ANNOTATION_REF ) );
135 node.getNodeData().getSequence().addAnnotation( ann );
137 if ( new_values.containsKey( TP_SEQ_SYMBOL ) ) {
138 ForesterUtil.ensurePresenceOfSequence( node );
139 node.getNodeData().getSequence().setSymbol( new_values.get( TP_SEQ_SYMBOL ) );
141 if ( new_values.containsKey( TP_SEQ_NAME ) ) {
142 ForesterUtil.ensurePresenceOfSequence( node );
143 node.getNodeData().getSequence().setName( new_values.get( TP_SEQ_NAME ) );
145 if ( new_values.containsKey( TP_SEQ_MOL_SEQ ) ) {
146 ForesterUtil.ensurePresenceOfSequence( node );
147 node.getNodeData().getSequence().setMolecularSequence( new_values.get( TP_SEQ_MOL_SEQ ) );
149 if ( new_values.containsKey( TP_NODE_NAME ) ) {
150 node.setName( new_values.get( TP_NODE_NAME ) );
152 } // if ( new_values != null )
153 } // if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) )
155 throw new IllegalArgumentException( "\"" + name + "\" not found in name map" );
161 public static void decorate( final Phylogeny phylogeny,
162 final Map<String, String> map,
164 final boolean extract_bracketed_scientific_name,
165 final boolean extract_bracketed_tax_code,
167 final boolean cut_name_after_space,
168 final boolean process_name_intelligently,
169 final boolean process_similar_to,
170 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
171 final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
172 PhyloXmlDataFormatException {
173 PhylogenyDecorator.decorate( phylogeny,
176 extract_bracketed_scientific_name,
177 extract_bracketed_tax_code,
180 cut_name_after_space,
181 process_name_intelligently,
183 numbers_of_chars_allowed_to_remove_if_not_found_in_map,
193 * maps names (in phylogeny) to new values if intermediate_map is
194 * null otherwise maps intermediate value to new value
197 * @param intermediate_map
198 * maps name (in phylogeny) to a intermediate value
199 * @throws IllegalArgumentException
200 * @throws PhyloXmlDataFormatException
202 public static void decorate( final Phylogeny phylogeny,
203 final Map<String, String> map,
205 final boolean extract_bracketed_scientific_name,
206 final boolean extract_bracketed_tax_code,
208 final Map<String, String> intermediate_map,
209 final boolean cut_name_after_space,
210 final boolean process_name_intelligently,
211 final boolean process_similar_to,
212 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
213 final boolean trim_after_tilde ) throws IllegalArgumentException,
214 PhyloXmlDataFormatException {
215 if ( extract_bracketed_scientific_name && ( field == FIELD.TAXONOMY_SCIENTIFIC_NAME ) ) {
216 throw new IllegalArgumentException( "attempt to extract bracketed scientific name together with data field pointing to scientific name" );
218 for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
219 final PhylogenyNode node = iter.next();
220 String name = node.getName();
221 String tilde_annotation = null;
222 if ( trim_after_tilde && ( name.indexOf( '~' ) > 0 ) ) {
223 final int ti = name.indexOf( '~' );
224 name = name.substring( 0, ti );
225 tilde_annotation = name.substring( ti );
227 if ( !ForesterUtil.isEmpty( name ) ) {
228 if ( intermediate_map != null ) {
229 name = PhylogenyDecorator.extractIntermediate( intermediate_map, name );
231 if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) {
232 String new_value = map.get( name );
234 while ( ( new_value == null ) && ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 )
235 && ( x <= numbers_of_chars_allowed_to_remove_if_not_found_in_map ) ) {
236 new_value = map.get( name.substring( 0, name.length() - x ) );
239 if ( new_value != null ) {
240 new_value = new_value.trim();
241 new_value.replaceAll( "/\\s+/", " " );
242 if ( extract_bracketed_scientific_name && new_value.endsWith( "]" ) ) {
243 new_value = extractBracketedScientificNames( node, new_value );
245 else if ( extract_bracketed_tax_code && new_value.endsWith( "]" ) ) {
246 new_value = extractBracketedTaxCodes( node, new_value );
249 case SEQUENCE_ANNOTATION_DESC:
250 if ( PhylogenyDecorator.VERBOSE ) {
251 System.out.println( name + ": " + new_value );
253 if ( !node.getNodeData().isHasSequence() ) {
254 node.getNodeData().setSequence( new Sequence() );
256 final Annotation annotation = new Annotation( "?" );
257 annotation.setDesc( new_value );
258 node.getNodeData().getSequence().addAnnotation( annotation );
260 case DOMAIN_STRUCTURE:
261 if ( PhylogenyDecorator.VERBOSE ) {
262 System.out.println( name + ": " + new_value );
264 if ( !node.getNodeData().isHasSequence() ) {
265 node.getNodeData().setSequence( new Sequence() );
267 node.getNodeData().getSequence()
268 .setDomainArchitecture( new DomainArchitecture( new_value ) );
271 if ( PhylogenyDecorator.VERBOSE ) {
272 System.out.println( name + ": " + new_value );
274 ForesterUtil.ensurePresenceOfTaxonomy( node );
275 node.getNodeData().getTaxonomy().setTaxonomyCode( new_value );
277 case TAXONOMY_SCIENTIFIC_NAME:
278 if ( PhylogenyDecorator.VERBOSE ) {
279 System.out.println( name + ": " + new_value );
281 ForesterUtil.ensurePresenceOfTaxonomy( node );
282 node.getNodeData().getTaxonomy().setScientificName( new_value );
285 if ( trim_after_tilde ) {
286 new_value = addTildeAnnotation( tilde_annotation, new_value );
288 if ( PhylogenyDecorator.VERBOSE ) {
289 System.out.println( name + ": " + new_value );
291 if ( !node.getNodeData().isHasSequence() ) {
292 node.getNodeData().setSequence( new Sequence() );
294 node.getNodeData().getSequence().setName( new_value );
297 if ( PhylogenyDecorator.VERBOSE ) {
298 System.out.print( name + " -> " );
300 if ( cut_name_after_space ) {
301 if ( PhylogenyDecorator.VERBOSE ) {
302 System.out.print( new_value + " -> " );
304 new_value = PhylogenyDecorator.deleteAtFirstSpace( new_value );
306 else if ( process_name_intelligently ) {
307 if ( PhylogenyDecorator.VERBOSE ) {
308 System.out.print( new_value + " -> " );
310 new_value = PhylogenyDecorator.processNameIntelligently( new_value );
312 else if ( process_similar_to ) {
313 if ( PhylogenyDecorator.VERBOSE ) {
314 System.out.print( new_value + " -> " );
316 new_value = PhylogenyDecorator.processSimilarTo( new_value );
318 if ( PhylogenyDecorator.SANITIZE ) {
319 new_value = PhylogenyDecorator.sanitize( new_value );
321 if ( trim_after_tilde ) {
322 new_value = addTildeAnnotation( tilde_annotation, new_value );
324 if ( PhylogenyDecorator.VERBOSE ) {
325 System.out.println( new_value );
327 node.setName( new_value );
330 throw new RuntimeException( "unknown field \"" + field + "\"" );
335 throw new IllegalArgumentException( "\"" + name + "\" not found in name map" );
341 private final static String addTildeAnnotation( final String tilde_annotation, final String new_value ) {
342 if ( ForesterUtil.isEmpty( tilde_annotation ) ) {
345 return new_value + tilde_annotation;
348 public static void decorate( final Phylogeny[] phylogenies,
349 final Map<String, Map<String, String>> map,
351 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map )
352 throws IllegalArgumentException, NHXFormatException, PhyloXmlDataFormatException {
353 for( final Phylogeny phylogenie : phylogenies ) {
355 .decorate( phylogenie, map, picky, numbers_of_chars_allowed_to_remove_if_not_found_in_map );
359 public static void decorate( final Phylogeny[] phylogenies,
360 final Map<String, String> map,
362 final boolean extract_bracketed_scientific_name,
363 final boolean extract_bracketed_tax_code,
365 final boolean cut_name_after_space,
366 final boolean process_name_intelligently,
367 final boolean process_similar_to,
368 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
369 final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
370 PhyloXmlDataFormatException {
371 for( final Phylogeny phylogenie : phylogenies ) {
372 PhylogenyDecorator.decorate( phylogenie,
375 extract_bracketed_scientific_name,
376 extract_bracketed_tax_code,
378 cut_name_after_space,
379 process_name_intelligently,
381 numbers_of_chars_allowed_to_remove_if_not_found_in_map,
386 public static void decorate( final Phylogeny[] phylogenies,
387 final Map<String, String> map,
389 final boolean extract_bracketed_scientific_name,
390 final boolean extract_bracketed_tax_code,
392 final Map<String, String> intermediate_map,
393 final boolean cut_name_after_space,
394 final boolean process_name_intelligently,
395 final boolean process_similar_to,
396 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
397 final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
398 PhyloXmlDataFormatException {
399 for( final Phylogeny phylogenie : phylogenies ) {
400 PhylogenyDecorator.decorate( phylogenie,
403 extract_bracketed_scientific_name,
404 extract_bracketed_tax_code,
407 cut_name_after_space,
408 process_name_intelligently,
410 numbers_of_chars_allowed_to_remove_if_not_found_in_map,
415 public static Map<String, Map<String, String>> parseMappingTable( final File mapping_table_file )
417 final Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
418 BasicTable<String> mapping_table = null;
419 mapping_table = BasicTableParser.parse( mapping_table_file, "\t", false, false );
420 for( int row = 0; row < mapping_table.getNumberOfRows(); ++row ) {
421 final Map<String, String> row_map = new HashMap<String, String>();
423 for( int col = 0; col < mapping_table.getNumberOfColumns(); ++col ) {
424 final String table_cell = mapping_table.getValue( col, row );
428 else if ( table_cell != null ) {
429 final String key = table_cell.substring( 0, table_cell.indexOf( ':' ) );
430 final String val = table_cell.substring( table_cell.indexOf( ':' ) + 1, table_cell.length() );
431 row_map.put( key, val );
434 map.put( name, row_map );
439 private static String deleteAtFirstSpace( final String name ) {
440 final int first_space = name.indexOf( " " );
441 if ( first_space > 1 ) {
442 return name.substring( 0, first_space ).trim();
447 private static String extractBracketedScientificNames( final PhylogenyNode node, final String new_value ) {
448 final int i = new_value.lastIndexOf( "[" );
449 final String scientific_name = new_value.substring( i + 1, new_value.length() - 1 );
450 ForesterUtil.ensurePresenceOfTaxonomy( node );
451 node.getNodeData().getTaxonomy().setScientificName( scientific_name );
452 return new_value.substring( 0, i - 1 ).trim();
455 private static String extractBracketedTaxCodes( final PhylogenyNode node, final String new_value ) {
456 final int i = new_value.lastIndexOf( "[" );
457 String tc = new_value.substring( i + 1, new_value.length() - 1 );
458 if ( tc.length() == 6 ) {
459 tc = tc.substring( 0, 5 );
461 ForesterUtil.ensurePresenceOfTaxonomy( node );
463 node.getNodeData().getTaxonomy().setTaxonomyCode( tc );
465 catch ( final PhyloXmlDataFormatException e ) {
466 throw new IllegalArgumentException( "illegal format for taxonomy code: " + tc );
468 return new_value.substring( 0, i - 1 ).trim();
471 private static String extractIntermediate( final Map<String, String> intermediate_map, final String name ) {
472 String new_name = null;
473 if ( PhylogenyDecorator.VERBOSE ) {
474 System.out.print( name + " => " );
476 if ( intermediate_map.containsKey( name ) ) {
477 new_name = intermediate_map.get( name );
478 if ( ForesterUtil.isEmpty( new_name ) ) {
479 throw new IllegalArgumentException( "\"" + name + "\" maps to null or empty string in secondary map" );
483 throw new IllegalArgumentException( "\"" + name + "\" not found in name secondary map" );
485 if ( PhylogenyDecorator.VERBOSE ) {
486 System.out.println( new_name + " " );
// Derives a String from a name by inspecting the first two of its
// space-separated tokens.
// NOTE(review): the statements inside the branches below (what each case
// returns) are not visible in this view of the file -- confirm against the
// complete source before relying on this description.
491 private static String processNameIntelligently( final String name ) {
// Tokenize on single space characters.
492 final String[] s = name.split( " " );
// Fewer than two tokens: s[ 1 ] does not exist, so this case has to come first.
493 if ( s.length < 2 ) {
// The conditions are tried in order; within each pair the first token is
// tested before the second. indexOf( ... ) > 0 means the character occurs
// somewhere after the first position of the token.
// Token containing both '_' and '|':
496 else if ( ( s[ 0 ].indexOf( "_" ) > 0 ) && ( s[ 0 ].indexOf( "|" ) > 0 ) ) {
499 else if ( ( s[ 1 ].indexOf( "_" ) > 0 ) && ( s[ 1 ].indexOf( "|" ) > 0 ) ) {
// Token containing both '_' and '.':
502 else if ( ( s[ 0 ].indexOf( "_" ) > 0 ) && ( s[ 0 ].indexOf( "." ) > 0 ) ) {
505 else if ( ( s[ 1 ].indexOf( "_" ) > 0 ) && ( s[ 1 ].indexOf( "." ) > 0 ) ) {
// Token containing '_' only:
508 else if ( s[ 0 ].indexOf( "_" ) > 0 ) {
511 else if ( s[ 1 ].indexOf( "_" ) > 0 ) {
// Shortens a name via processNameIntelligently and appends a
// " similarity=..." suffix built from the text following "similar to".
519 private static String processSimilarTo( final String name ) {
// Case-insensitive search for the phrase; i is its index in name, or -1.
520 final int i = name.toLowerCase().indexOf( "similar to" );
// Stays empty unless the assignment below is executed.
521 String similar_to = "";
// 10 is the length of "similar to": the suffix is the trimmed text after the phrase.
// NOTE(review): the condition guarding this assignment is not visible in
// this view of the file -- confirm against the complete source.
523 similar_to = " similarity=" + name.substring( i + 10 ).trim();
// The full name (including the "similar to" part) is passed on for shortening.
525 final String pi = processNameIntelligently( name );
526 return pi + similar_to;
529 private static String sanitize( String s ) {
530 s = s.replace( ' ', '_' );
531 s = s.replace( '(', '{' );
532 s = s.replace( ')', '}' );
533 s = s.replace( '[', '{' );
534 s = s.replace( ']', '}' );
535 s = s.replace( ',', '_' );
539 public static enum FIELD {
540 NODE_NAME, SEQUENCE_ANNOTATION_DESC, DOMAIN_STRUCTURE, TAXONOMY_CODE, TAXONOMY_SCIENTIFIC_NAME, SEQUENCE_NAME;