2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.tools;
29 import java.io.IOException;
30 import java.util.HashMap;
32 import java.util.regex.Matcher;
34 import org.forester.io.parsers.nhx.NHXFormatException;
35 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
36 import org.forester.io.parsers.util.ParserUtils;
37 import org.forester.phylogeny.Phylogeny;
38 import org.forester.phylogeny.PhylogenyNode;
39 import org.forester.phylogeny.data.Accession;
40 import org.forester.phylogeny.data.Annotation;
41 import org.forester.phylogeny.data.DomainArchitecture;
42 import org.forester.phylogeny.data.Identifier;
43 import org.forester.phylogeny.data.Sequence;
44 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
45 import org.forester.util.BasicTable;
46 import org.forester.util.BasicTableParser;
47 import org.forester.util.ForesterUtil;
/**
 * Static helpers that attach externally supplied values (taxonomy data,
 * sequence data, domain architectures, replacement names) to the nodes of a
 * {@link Phylogeny}, looked up by node name.
 */
49 public final class PhylogenyDecorator {
51 // From evoruby/lib/evo/apps/tseq_taxonomy_processor.rb:
// Keys looked up in the per-node value map by
// decorate( Phylogeny, Map<String, Map<String, String>>, ... ).
// parseMappingTable() derives such keys from the text before the first ':'
// of a table cell.
52 final private static String TP_TAXONOMY_CODE = "TAXONOMY_CODE";
53 final private static String TP_TAXONOMY_ID = "TAXONOMY_ID";
54 final private static String TP_TAXONOMY_ID_PROVIDER = "TAXONOMY_ID_PROVIDER";
55 final private static String TP_TAXONOMY_SN = "TAXONOMY_SN";
56 final private static String TP_TAXONOMY_CN = "TAXONOMY_CN";
57 final private static String TP_TAXONOMY_SYN = "TAXONOMY_SYN";
58 final private static String TP_SEQ_SYMBOL = "SEQ_SYMBOL";
59 final private static String TP_SEQ_ACCESSION = "SEQ_ACCESSION";
60 final private static String TP_SEQ_ACCESSION_SOURCE = "SEQ_ACCESSION_SOURCE";
61 final private static String TP_SEQ_ANNOTATION_DESC = "SEQ_ANNOTATION_DESC";
62 final private static String TP_SEQ_ANNOTATION_REF = "SEQ_ANNOTATION_REF";
63 final private static String TP_SEQ_MOL_SEQ = "SEQ_MOL_SEQ";
64 final private static String TP_SEQ_NAME = "SEQ_NAME";
65 final private static String TP_NODE_NAME = "NODE_NAME";
// When true, a replacement node name is passed through sanitize() before it
// is set on the node.
66 public final static boolean SANITIZE = false;
// When true, the decorate methods and extractIntermediate() report what they
// do on System.out.
67 public final static boolean VERBOSE = true;
// Private constructor; every member visible in this class is static.
69 private PhylogenyDecorator() {
/**
 * Decorates the named nodes of {@code phylogeny}, visited in post-order, with
 * the values registered under each node's name in {@code map}.
 * <p>
 * Each inner map is probed for the {@code TP_*} keys; for every key present
 * the corresponding taxonomy or sequence property of the node is set (the
 * taxonomy/sequence object is created first if needed), and
 * {@code NODE_NAME} replaces the node's name. Nodes with an empty name are
 * not looked up.
 *
 * @param phylogeny the tree whose nodes are decorated in place
 * @param map node name -> (key -> value)
 * @param numbers_of_chars_allowed_to_remove_if_not_found_in_map when greater
 *            than 0 and the full name has no entry, prefixes of the name are
 *            looked up as well
 * @throws IllegalArgumentException with the message
 *             {@code "<name>" not found in name map}; the condition guarding
 *             that throw is not visible in this listing
 * @throws PhyloXmlDataFormatException declared by this method; setTaxonomyCode
 *             is one source (extractBracketedTaxCodes catches it there)
 */
73 public static void decorate( final Phylogeny phylogeny,
74 final Map<String, Map<String, String>> map,
// NOTE(review): the Phylogeny[] overload passes a `picky` argument between
// `map` and the int below; its declaration is not visible in this listing.
76 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map )
77 throws IllegalArgumentException, PhyloXmlDataFormatException {
78 for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
79 final PhylogenyNode node = iter.next();
80 final String name = node.getName();
81 if ( !ForesterUtil.isEmpty( name ) ) {
// Proceed on an exact hit, or whenever prefix matching is enabled.
82 if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) {
83 Map<String, String> new_values = map.get( name );
// Prefix fallback: look up the name with its last x characters removed.
// NOTE(review): the declaration and update of `x` are not visible in this
// listing; if x can exceed name.length(), substring() below would throw
// StringIndexOutOfBoundsException -- confirm.
85 while ( ( new_values == null ) && ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 )
86 && ( x <= numbers_of_chars_allowed_to_remove_if_not_found_in_map ) ) {
87 new_values = map.get( name.substring( 0, name.length() - x ) );
90 if ( new_values != null ) {
91 if ( new_values.containsKey( TP_TAXONOMY_CODE ) ) {
92 ForesterUtil.ensurePresenceOfTaxonomy( node );
93 node.getNodeData().getTaxonomy().setTaxonomyCode( new_values.get( TP_TAXONOMY_CODE ) );
// Taxonomy identifier: created with a provider when one is supplied,
// otherwise from the id alone.
95 if ( new_values.containsKey( TP_TAXONOMY_ID )
96 && new_values.containsKey( TP_TAXONOMY_ID_PROVIDER ) ) {
97 ForesterUtil.ensurePresenceOfTaxonomy( node );
100 .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ),
101 new_values.get( TP_TAXONOMY_ID_PROVIDER ) ) );
103 else if ( new_values.containsKey( TP_TAXONOMY_ID ) ) {
104 ForesterUtil.ensurePresenceOfTaxonomy( node );
105 node.getNodeData().getTaxonomy()
106 .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ) ) );
108 if ( new_values.containsKey( TP_TAXONOMY_SN ) ) {
109 ForesterUtil.ensurePresenceOfTaxonomy( node );
110 node.getNodeData().getTaxonomy().setScientificName( new_values.get( TP_TAXONOMY_SN ) );
112 if ( new_values.containsKey( TP_TAXONOMY_CN ) ) {
113 ForesterUtil.ensurePresenceOfTaxonomy( node );
114 node.getNodeData().getTaxonomy().setCommonName( new_values.get( TP_TAXONOMY_CN ) );
// The synonym is added to the taxonomy's synonym collection, not set.
116 if ( new_values.containsKey( TP_TAXONOMY_SYN ) ) {
117 ForesterUtil.ensurePresenceOfTaxonomy( node );
118 node.getNodeData().getTaxonomy().getSynonyms().add( new_values.get( TP_TAXONOMY_SYN ) );
// The accession is applied only when both the value and its source are
// present; no accession-only branch is visible here.
120 if ( new_values.containsKey( TP_SEQ_ACCESSION )
121 && new_values.containsKey( TP_SEQ_ACCESSION_SOURCE ) ) {
122 ForesterUtil.ensurePresenceOfSequence( node );
125 .setAccession( new Accession( new_values.get( TP_SEQ_ACCESSION ),
126 new_values.get( TP_SEQ_ACCESSION_SOURCE ) ) );
// Description and reference each add their own Annotation object, so a
// node given both keys ends up with two annotations.
128 if ( new_values.containsKey( TP_SEQ_ANNOTATION_DESC ) ) {
129 ForesterUtil.ensurePresenceOfSequence( node );
130 final Annotation ann = new Annotation();
131 ann.setDesc( new_values.get( TP_SEQ_ANNOTATION_DESC ) );
132 node.getNodeData().getSequence().addAnnotation( ann );
134 if ( new_values.containsKey( TP_SEQ_ANNOTATION_REF ) ) {
135 ForesterUtil.ensurePresenceOfSequence( node );
136 final Annotation ann = new Annotation( new_values.get( TP_SEQ_ANNOTATION_REF ) );
137 node.getNodeData().getSequence().addAnnotation( ann );
139 if ( new_values.containsKey( TP_SEQ_SYMBOL ) ) {
140 ForesterUtil.ensurePresenceOfSequence( node );
141 node.getNodeData().getSequence().setSymbol( new_values.get( TP_SEQ_SYMBOL ) );
143 if ( new_values.containsKey( TP_SEQ_NAME ) ) {
144 ForesterUtil.ensurePresenceOfSequence( node );
145 node.getNodeData().getSequence().setName( new_values.get( TP_SEQ_NAME ) );
147 if ( new_values.containsKey( TP_SEQ_MOL_SEQ ) ) {
148 ForesterUtil.ensurePresenceOfSequence( node );
149 node.getNodeData().getSequence().setMolecularSequence( new_values.get( TP_SEQ_MOL_SEQ ) );
// The rename is the last key handled in this block.
151 if ( new_values.containsKey( TP_NODE_NAME ) ) {
152 node.setName( new_values.get( TP_NODE_NAME ) );
154 } // if ( new_values != null )
155 } // if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) )
// NOTE(review): the condition guarding this throw is not visible in this
// listing; presumably it depends on `picky` -- confirm.
157 throw new IllegalArgumentException( "\"" + name + "\" not found in name map" );
/**
 * Overload of decorate for a single phylogeny that takes no
 * {@code intermediate_map} parameter; it forwards its arguments to another
 * {@code PhylogenyDecorator.decorate} overload.
 * <p>
 * NOTE(review): several argument lines of the forwarded call are not visible
 * in this listing; presumably {@code null} is passed for the intermediate
 * map -- confirm.
 *
 * @throws IllegalArgumentException declared; see the overload called here
 * @throws NHXFormatException declared by this overload
 * @throws PhyloXmlDataFormatException declared; see the overload called here
 */
163 public static void decorate( final Phylogeny phylogeny,
164 final Map<String, String> map,
166 final boolean extract_bracketed_scientific_name,
167 final boolean extract_bracketed_tax_code,
169 final boolean cut_name_after_space,
170 final boolean process_name_intelligently,
171 final boolean process_similar_to,
172 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
173 final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
174 PhyloXmlDataFormatException {
// Straight delegation; no processing happens in this overload itself.
175 PhylogenyDecorator.decorate( phylogeny,
178 extract_bracketed_scientific_name,
179 extract_bracketed_tax_code,
182 cut_name_after_space,
183 process_name_intelligently,
185 numbers_of_chars_allowed_to_remove_if_not_found_in_map,
195 * maps names (in phylogeny) to new values if intermediate_map is
196 * null; otherwise maps intermediate values to new values
199 * @param intermediate_map
200 * maps name (in phylogeny) to an intermediate value
201 * @throws IllegalArgumentException
202 * @throws PhyloXmlDataFormatException
// Decorates the nodes of `phylogeny`, visited in post-order, with values
// looked up by node name in `map` (after translating the name through
// `intermediate_map` when that map is non-null).
// `field` selects where a looked-up value is stored: as a sequence
// annotation description, a domain architecture, a taxonomy code, a
// taxonomy scientific name, a sequence name, or as the node's new name.
// Its parameter declaration is not visible in this listing.
// Throws IllegalArgumentException for an empty map, for the conflicting
// option combination checked first, when no taxonomy code can be extracted
// from a value, and with "... not found in name map" (guard not visible).
204 public static void decorate( final Phylogeny phylogeny,
205 final Map<String, String> map,
207 final boolean extract_bracketed_scientific_name,
208 final boolean extract_bracketed_tax_code,
210 final Map<String, String> intermediate_map,
211 final boolean cut_name_after_space,
212 final boolean process_name_intelligently,
213 final boolean process_similar_to,
214 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
215 final boolean trim_after_tilde ) throws IllegalArgumentException,
216 PhyloXmlDataFormatException {
// Extracting a bracketed scientific name and writing the value itself to the
// scientific-name field would both target the same property.
217 if ( extract_bracketed_scientific_name && ( field == FIELD.TAXONOMY_SCIENTIFIC_NAME ) ) {
218 throw new IllegalArgumentException( "attempt to extract bracketed scientific name together with data field pointing to scientific name" );
220 if ( map.isEmpty() ) {
221 throw new IllegalArgumentException( "map is empty" );
223 for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
224 final PhylogenyNode node = iter.next();
225 String name = node.getName();
226 String tilde_annotation = null;
// Split "<name>~<rest>" at the first '~' (only when it is not the first
// character); tilde_annotation keeps the '~' itself and what follows.
// NOTE(review): name.indexOf() runs before the isEmpty( name ) check below,
// so a null name would fail here if getName() can return null -- confirm.
227 if ( trim_after_tilde && ( name.indexOf( '~' ) > 0 ) ) {
228 final int ti = name.indexOf( '~' );
229 tilde_annotation = name.substring( ti );
230 name = name.substring( 0, ti );
232 if ( !ForesterUtil.isEmpty( name ) ) {
// Optional first hop: node name -> intermediate value, which then serves as
// the key into `map`.
233 if ( intermediate_map != null ) {
234 name = PhylogenyDecorator.extractIntermediate( intermediate_map, name );
236 if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) {
237 String new_value = map.get( name );
// Prefix fallback: look up the name with its last x characters removed.
// NOTE(review): the declaration and update of `x` are not visible in this
// listing.
239 while ( ( new_value == null ) && ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 )
240 && ( x <= numbers_of_chars_allowed_to_remove_if_not_found_in_map ) ) {
241 new_value = map.get( name.substring( 0, name.length() - x ) );
244 if ( new_value != null ) {
245 new_value = new_value.trim();
// NOTE(review): this line has no effect. String.replaceAll returns a new
// string and the result is discarded; in addition the pattern "/\\s+/"
// requires literal '/' characters around the whitespace. If collapsing runs
// of whitespace was intended, it would need
// new_value = new_value.replaceAll( "\\s+", " " ) -- confirm intent first.
246 new_value.replaceAll( "/\\s+/", " " );
// A value ending in "]" has its bracketed suffix moved to the node's
// scientific name; otherwise, if requested, a taxonomy code is extracted
// using the 4- or 6-variant pattern from ParserUtils.
247 if ( extract_bracketed_scientific_name && new_value.endsWith( "]" ) ) {
248 new_value = extractBracketedScientificNames( node, new_value );
250 else if ( extract_bracketed_tax_code ) {
251 if ( ParserUtils.TAXOMONY_CODE_PATTERN_4.matcher( new_value ).find() ) {
252 new_value = extractBracketedTaxCodes( node, new_value );
254 else if ( ParserUtils.TAXOMONY_CODE_PATTERN_6.matcher( new_value ).find() ) {
255 new_value = extractBracketedTaxCodes6( node, new_value );
258 throw new IllegalArgumentException( " could not get taxonomy from \"" + new_value
// Dispatch on `field`; the switch header and some case labels are not
// visible in this listing.
263 case SEQUENCE_ANNOTATION_DESC:
264 if ( PhylogenyDecorator.VERBOSE ) {
265 System.out.println( name + ": " + new_value );
267 if ( !node.getNodeData().isHasSequence() ) {
268 node.getNodeData().setSequence( new Sequence() );
// The annotation is created with "?" as its constructor argument and the
// mapped value as its description.
270 final Annotation annotation = new Annotation( "?" );
271 annotation.setDesc( new_value );
272 node.getNodeData().getSequence().addAnnotation( annotation );
274 case DOMAIN_STRUCTURE:
275 if ( PhylogenyDecorator.VERBOSE ) {
276 System.out.println( name + ": " + new_value );
278 if ( !node.getNodeData().isHasSequence() ) {
279 node.getNodeData().setSequence( new Sequence() );
281 node.getNodeData().getSequence()
282 .setDomainArchitecture( new DomainArchitecture( new_value ) );
// Branch storing the value as taxonomy code (label not visible; presumably
// `case TAXONOMY_CODE:`).
285 if ( PhylogenyDecorator.VERBOSE ) {
286 System.out.println( name + ": " + new_value );
288 ForesterUtil.ensurePresenceOfTaxonomy( node );
289 node.getNodeData().getTaxonomy().setTaxonomyCode( new_value );
291 case TAXONOMY_SCIENTIFIC_NAME:
292 if ( PhylogenyDecorator.VERBOSE ) {
293 System.out.println( name + ": " + new_value );
295 ForesterUtil.ensurePresenceOfTaxonomy( node );
296 node.getNodeData().getTaxonomy().setScientificName( new_value );
// Branch storing the value as sequence name (label not visible; presumably
// `case SEQUENCE_NAME:`). The tilde suffix split off above is re-appended.
299 if ( trim_after_tilde ) {
300 new_value = addTildeAnnotation( tilde_annotation, new_value );
302 if ( PhylogenyDecorator.VERBOSE ) {
303 System.out.println( name + ": " + new_value );
305 if ( !node.getNodeData().isHasSequence() ) {
306 node.getNodeData().setSequence( new Sequence() );
308 node.getNodeData().getSequence().setName( new_value );
// Branch renaming the node (label not visible; presumably `case NODE_NAME:`).
311 if ( PhylogenyDecorator.VERBOSE ) {
312 System.out.print( name + " -> " );
// At most one of the three name transformations is applied; the else-if
// chain gives precedence in this order.
314 if ( cut_name_after_space ) {
315 if ( PhylogenyDecorator.VERBOSE ) {
316 System.out.print( new_value + " -> " );
318 new_value = PhylogenyDecorator.deleteAtFirstSpace( new_value );
320 else if ( process_name_intelligently ) {
321 if ( PhylogenyDecorator.VERBOSE ) {
322 System.out.print( new_value + " -> " );
324 new_value = PhylogenyDecorator.processNameIntelligently( new_value );
326 else if ( process_similar_to ) {
327 if ( PhylogenyDecorator.VERBOSE ) {
328 System.out.print( new_value + " -> " );
330 new_value = PhylogenyDecorator.processSimilarTo( new_value );
332 if ( PhylogenyDecorator.SANITIZE ) {
333 new_value = PhylogenyDecorator.sanitize( new_value );
// The tilde suffix is re-appended after sanitizing, so it is not sanitized.
335 if ( trim_after_tilde ) {
336 new_value = addTildeAnnotation( tilde_annotation, new_value );
338 if ( PhylogenyDecorator.VERBOSE ) {
339 System.out.println( new_value );
341 node.setName( new_value );
// Reached for a `field` value none of the branches handles (label not
// visible; presumably `default:`).
344 throw new RuntimeException( "unknown field \"" + field + "\"" );
// NOTE(review): the condition guarding this throw is not visible in this
// listing; presumably it depends on `picky` -- confirm.
349 throw new IllegalArgumentException( "\"" + name + "\" not found in name map" );
/**
 * Appends {@code tilde_annotation} to {@code new_value}.
 * <p>
 * NOTE(review): the statement executed when {@code tilde_annotation} is
 * null or empty is not visible in this listing; presumably it returns
 * {@code new_value} unchanged -- confirm.
 *
 * @param tilde_annotation suffix split off a node name by decorate; may be null
 * @param new_value the value to extend
 * @return {@code new_value + tilde_annotation} when the annotation is not empty
 */
355 private final static String addTildeAnnotation( final String tilde_annotation, final String new_value ) {
356 if ( ForesterUtil.isEmpty( tilde_annotation ) ) {
359 return new_value + tilde_annotation;
/**
 * Applies the single-phylogeny
 * {@code decorate( Phylogeny, Map<String, Map<String, String>>, ... )} to
 * every element of {@code phylogenies}, in array order, with the same map
 * and settings.
 * <p>
 * NOTE(review): the declaration of the {@code picky} argument used below is
 * not visible in this listing.
 *
 * @param phylogenies the trees to decorate in place
 * @param map node name -> (key -> value)
 * @param numbers_of_chars_allowed_to_remove_if_not_found_in_map passed through
 *            unchanged
 */
362 public static void decorate( final Phylogeny[] phylogenies,
363 final Map<String, Map<String, String>> map,
365 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map )
366 throws IllegalArgumentException, NHXFormatException, PhyloXmlDataFormatException {
367 for( final Phylogeny phylogenie : phylogenies ) {
369 .decorate( phylogenie, map, picky, numbers_of_chars_allowed_to_remove_if_not_found_in_map );
/**
 * Array overload without an {@code intermediate_map} parameter: calls a
 * single-phylogeny {@code PhylogenyDecorator.decorate} overload once per
 * element of {@code phylogenies}, in array order.
 * <p>
 * NOTE(review): several argument lines of the forwarded call are not visible
 * in this listing.
 */
373 public static void decorate( final Phylogeny[] phylogenies,
374 final Map<String, String> map,
376 final boolean extract_bracketed_scientific_name,
377 final boolean extract_bracketed_tax_code,
379 final boolean cut_name_after_space,
380 final boolean process_name_intelligently,
381 final boolean process_similar_to,
382 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
383 final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
384 PhyloXmlDataFormatException {
// The same map and option values are reused for every tree.
385 for( final Phylogeny phylogenie : phylogenies ) {
386 PhylogenyDecorator.decorate( phylogenie,
389 extract_bracketed_scientific_name,
390 extract_bracketed_tax_code,
392 cut_name_after_space,
393 process_name_intelligently,
395 numbers_of_chars_allowed_to_remove_if_not_found_in_map,
/**
 * Array overload with an {@code intermediate_map} parameter: calls a
 * single-phylogeny {@code PhylogenyDecorator.decorate} overload once per
 * element of {@code phylogenies}, in array order.
 * <p>
 * NOTE(review): several argument lines of the forwarded call (presumably
 * including {@code intermediate_map}) are not visible in this listing.
 */
400 public static void decorate( final Phylogeny[] phylogenies,
401 final Map<String, String> map,
403 final boolean extract_bracketed_scientific_name,
404 final boolean extract_bracketed_tax_code,
406 final Map<String, String> intermediate_map,
407 final boolean cut_name_after_space,
408 final boolean process_name_intelligently,
409 final boolean process_similar_to,
410 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
411 final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
412 PhyloXmlDataFormatException {
// The same maps and option values are reused for every tree.
413 for( final Phylogeny phylogenie : phylogenies ) {
414 PhylogenyDecorator.decorate( phylogenie,
417 extract_bracketed_scientific_name,
418 extract_bracketed_tax_code,
421 cut_name_after_space,
422 process_name_intelligently,
424 numbers_of_chars_allowed_to_remove_if_not_found_in_map,
/**
 * Reads {@code mapping_table_file} with {@link BasicTableParser} and turns
 * each table row into an entry {@code name -> (key -> value)} of the returned
 * map, the shape accepted by
 * {@code decorate( Phylogeny, Map<String, Map<String, String>>, ... )}.
 * <p>
 * Cells handled by the {@code else if} branch are split at their first ':'
 * into key (text before) and value (text after).
 * <p>
 * NOTE(review): the declaration and assignment of {@code name} and the
 * branch preceding the {@code else if} are not visible in this listing;
 * presumably {@code name} is taken from the first column -- confirm.
 *
 * @param mapping_table_file the table file; parsed with '\t' passed to the
 *            parser (presumably the column delimiter)
 * @return a new HashMap with one inner map per table row
 */
429 public static Map<String, Map<String, String>> parseMappingTable( final File mapping_table_file )
431 final Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
432 BasicTable<String> mapping_table = null;
433 mapping_table = BasicTableParser.parse( mapping_table_file, '\t', false, false );
434 for( int row = 0; row < mapping_table.getNumberOfRows(); ++row ) {
// A fresh inner map is built for every row.
435 final Map<String, String> row_map = new HashMap<String, String>();
437 for( int col = 0; col < mapping_table.getNumberOfColumns(); ++col ) {
// Note the argument order: column first, then row.
438 final String table_cell = mapping_table.getValue( col, row );
// NOTE(review): for a cell without ':' indexOf returns -1, so
// substring( 0, -1 ) below throws StringIndexOutOfBoundsException; such
// cells are not validated in the visible lines.
442 else if ( table_cell != null ) {
443 final String key = table_cell.substring( 0, table_cell.indexOf( ':' ) );
444 final String val = table_cell.substring( table_cell.indexOf( ':' ) + 1, table_cell.length() );
445 row_map.put( key, val );
// A later row with the same name replaces the earlier entry (Map.put).
448 map.put( name, row_map );
/**
 * Cuts {@code name} at its first space: when the first " " is at an index
 * greater than 1, returns the trimmed text before it.
 * <p>
 * NOTE(review): the return statement for all other inputs is not visible in
 * this listing; presumably {@code name} is returned unchanged. The
 * {@code > 1} test also leaves names whose first space is at index 1 (a
 * one-character first token) uncut -- confirm this is intended.
 */
453 private static String deleteAtFirstSpace( final String name ) {
454 final int first_space = name.indexOf( " " );
455 if ( first_space > 1 ) {
456 return name.substring( 0, first_space ).trim();
461 private static String extractBracketedScientificNames( final PhylogenyNode node, final String new_value ) {
462 final int i = new_value.lastIndexOf( "[" );
463 final String scientific_name = new_value.substring( i + 1, new_value.length() - 1 );
464 ForesterUtil.ensurePresenceOfTaxonomy( node );
465 node.getNodeData().getTaxonomy().setScientificName( scientific_name );
466 return new_value.substring( 0, i - 1 ).trim();
/**
 * Sets the taxonomy code of {@code node} from {@code new_value}, using a
 * matcher for {@code ParserUtils.TAXOMONY_CODE_PATTERN_4}.
 * <p>
 * NOTE(review): the lines deriving {@code tc} from the matcher are not
 * visible in this listing.
 *
 * @param node the node whose taxonomy (created if absent) receives the code
 * @param new_value the value to search
 * @return {@code new_value}, unchanged (the original carries a TODO/FIXME on
 *         this)
 * @throws IllegalArgumentException if setTaxonomyCode rejects the code with a
 *             PhyloXmlDataFormatException
 */
469 private static String extractBracketedTaxCodes( final PhylogenyNode node, final String new_value ) {
470 final Matcher m = ParserUtils.TAXOMONY_CODE_PATTERN_4.matcher( new_value );
475 ForesterUtil.ensurePresenceOfTaxonomy( node );
477 node.getNodeData().getTaxonomy().setTaxonomyCode( tc );
// The checked exception is translated to an unchecked one; the original
// exception is not attached as cause.
479 catch ( final PhyloXmlDataFormatException e ) {
480 throw new IllegalArgumentException( "illegal format for taxonomy code: " + tc );
482 return new_value; //TODO //FIXME
/**
 * Variant of extractBracketedTaxCodes that uses
 * {@code ParserUtils.TAXOMONY_CODE_PATTERN_6}: a six-character code is
 * reported on System.out together with its first five characters before a
 * taxonomy code is set on {@code node}.
 * <p>
 * NOTE(review): the lines deriving {@code tc} from the matcher are not
 * visible in this listing, and neither is any reassignment of {@code tc}
 * between the warning and setTaxonomyCode; presumably the five-character
 * form {@code t} is what gets stored -- confirm.
 *
 * @param node the node whose taxonomy (created if absent) receives the code
 * @param new_value the value to search
 * @return {@code new_value}, unchanged (the original carries a TODO/FIXME on
 *         this)
 * @throws IllegalArgumentException without message on the branch following
 *             the length check, or with a message if setTaxonomyCode rejects
 *             the code
 */
485 private static String extractBracketedTaxCodes6( final PhylogenyNode node, final String new_value ) {
486 final Matcher m = ParserUtils.TAXOMONY_CODE_PATTERN_6.matcher( new_value );
491 ForesterUtil.ensurePresenceOfTaxonomy( node );
493 if ( tc.length() == 6 ) {
// Shorten to the first five characters and say so on stdout. The warning is
// printed unconditionally, not gated by VERBOSE.
494 final String t = tc.substring( 0, 5 );
495 System.out.println( "WARNING: taxonomy code " + tc + " -> " + t );
// Thrown without a message, so the caller gets no hint about the value.
499 throw new IllegalArgumentException();
501 node.getNodeData().getTaxonomy().setTaxonomyCode( tc );
503 catch ( final PhyloXmlDataFormatException e ) {
504 throw new IllegalArgumentException( "illegal format for taxonomy code: " + tc );
506 return new_value; //TODO //FIXME
/**
 * Translates {@code name} through {@code intermediate_map}, the "secondary
 * map" of the error messages below.
 * <p>
 * When VERBOSE is set, prints {@code name + " => "} before the lookup and
 * the mapped value afterwards.
 *
 * @param intermediate_map name -> intermediate value
 * @param name the key to translate
 * @return NOTE(review): the return statement is not visible in this listing;
 *         presumably the mapped value {@code new_name} -- confirm
 * @throws IllegalArgumentException if {@code name} is not a key of the map,
 *             or maps to null or an empty string
 */
509 private static String extractIntermediate( final Map<String, String> intermediate_map, final String name ) {
510 String new_name = null;
511 if ( PhylogenyDecorator.VERBOSE ) {
512 System.out.print( name + " => " );
514 if ( intermediate_map.containsKey( name ) ) {
515 new_name = intermediate_map.get( name );
// A key that is present but maps to nothing usable is rejected too.
516 if ( ForesterUtil.isEmpty( new_name ) ) {
517 throw new IllegalArgumentException( "\"" + name + "\" maps to null or empty string in secondary map" );
521 throw new IllegalArgumentException( "\"" + name + "\" not found in name secondary map" );
523 if ( PhylogenyDecorator.VERBOSE ) {
524 System.out.println( new_name + " " );
/**
 * Chooses a result for {@code name} by inspecting its first two
 * space-separated tokens.
 * <p>
 * The tests run in this order, each for token 0 and then token 1: contains
 * both "_" and "|"; contains both "_" and "."; contains "_". Every test uses
 * {@code indexOf(...) > 0}, so a character at position 0 of a token does not
 * count.
 * <p>
 * NOTE(review): none of the return statements is visible in this listing;
 * presumably each branch returns the token it tested, and a name with fewer
 * than two tokens is returned as is -- confirm.
 */
529 private static String processNameIntelligently( final String name ) {
// Splits on single spaces.
530 final String[] s = name.split( " " );
// With fewer than two tokens the s[ 1 ] tests below are never reached.
531 if ( s.length < 2 ) {
534 else if ( ( s[ 0 ].indexOf( "_" ) > 0 ) && ( s[ 0 ].indexOf( "|" ) > 0 ) ) {
537 else if ( ( s[ 1 ].indexOf( "_" ) > 0 ) && ( s[ 1 ].indexOf( "|" ) > 0 ) ) {
540 else if ( ( s[ 0 ].indexOf( "_" ) > 0 ) && ( s[ 0 ].indexOf( "." ) > 0 ) ) {
543 else if ( ( s[ 1 ].indexOf( "_" ) > 0 ) && ( s[ 1 ].indexOf( "." ) > 0 ) ) {
546 else if ( s[ 0 ].indexOf( "_" ) > 0 ) {
549 else if ( s[ 1 ].indexOf( "_" ) > 0 ) {
/**
 * Applies processNameIntelligently to {@code name} and appends a
 * {@code " similarity=<text>"} suffix built from the text that follows
 * "similar to" in {@code name}; the phrase is located case-insensitively.
 * <p>
 * NOTE(review): the condition guarding the suffix assignment is not visible
 * in this listing; presumably it requires the phrase to be present
 * ({@code i >= 0}) -- confirm.
 *
 * @param name the value to process
 * @return the processed name followed by the suffix, which is "" unless
 *         assigned
 */
557 private static String processSimilarTo( final String name ) {
558 final int i = name.toLowerCase().indexOf( "similar to" );
559 String similar_to = "";
// 10 is the length of "similar to": the suffix starts right after it.
561 similar_to = " similarity=" + name.substring( i + 10 ).trim();
// The whole original name, including the "similar to" part, is processed.
563 final String pi = processNameIntelligently( name );
564 return pi + similar_to;
/**
 * Replaces characters in {@code s}: space and ',' become '_', '(' and '['
 * become '{', ')' and ']' become '}'.
 * <p>
 * NOTE(review): the return statement is not visible in this listing;
 * presumably the rewritten {@code s} is returned -- confirm.
 */
567 private static String sanitize( String s ) {
568 s = s.replace( ' ', '_' );
569 s = s.replace( '(', '{' );
570 s = s.replace( ')', '}' );
571 s = s.replace( '[', '{' );
572 s = s.replace( ']', '}' );
573 s = s.replace( ',', '_' );
/**
 * The node property that the single-map decorate overloads write a looked-up
 * value to: the node name, a sequence annotation description, the sequence's
 * domain architecture, the taxonomy code, the taxonomy scientific name, or
 * the sequence name.
 */
577 public static enum FIELD {
578 NODE_NAME, SEQUENCE_ANNOTATION_DESC, DOMAIN_STRUCTURE, TAXONOMY_CODE, TAXONOMY_SCIENTIFIC_NAME, SEQUENCE_NAME;