2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
26 package org.forester.tools;
29 import java.io.IOException;
30 import java.util.HashMap;
32 import java.util.regex.Pattern;
34 import org.forester.io.parsers.nhx.NHXFormatException;
35 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
36 import org.forester.phylogeny.Phylogeny;
37 import org.forester.phylogeny.PhylogenyNode;
38 import org.forester.phylogeny.data.Accession;
39 import org.forester.phylogeny.data.Annotation;
40 import org.forester.phylogeny.data.DomainArchitecture;
41 import org.forester.phylogeny.data.Identifier;
42 import org.forester.phylogeny.data.Sequence;
43 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
44 import org.forester.util.BasicTable;
45 import org.forester.util.BasicTableParser;
46 import org.forester.util.ForesterUtil;
48 public final class PhylogenyDecorator {
50 // From evoruby/lib/evo/apps/tseq_taxonomy_processor.rb:
51 final private static String TP_TAXONOMY_CODE = "TAXONOMY_CODE";
52 final private static String TP_TAXONOMY_ID = "TAXONOMY_ID";
53 final private static String TP_TAXONOMY_ID_PROVIDER = "TAXONOMY_ID_PROVIDER";
54 final private static String TP_TAXONOMY_SN = "TAXONOMY_SN";
55 final private static String TP_TAXONOMY_CN = "TAXONOMY_CN";
56 final private static String TP_TAXONOMY_SYN = "TAXONOMY_SYN";
57 final private static String TP_SEQ_SYMBOL = "SEQ_SYMBOL";
58 final private static String TP_SEQ_ACCESSION = "SEQ_ACCESSION";
59 final private static String TP_SEQ_ACCESSION_SOURCE = "SEQ_ACCESSION_SOURCE";
60 final private static String TP_SEQ_ANNOTATION_DESC = "SEQ_ANNOTATION_DESC";
61 final private static String TP_SEQ_ANNOTATION_REF = "SEQ_ANNOTATION_REF";
62 final private static String TP_SEQ_MOL_SEQ = "SEQ_MOL_SEQ";
63 final private static String TP_SEQ_NAME = "SEQ_NAME";
64 final private static String TP_NODE_NAME = "NODE_NAME";
65 final private static Pattern NODENAME_SEQNUMBER_TAXDOMAINNUMBER = Pattern
66 .compile( "^([a-fA-Z0-9]{1,5})_([A-Z0-9]{2,4}[A-Z])(\\d{1,4})$" );
67 public final static boolean SANITIZE = false;
68 public final static boolean VERBOSE = true;
69 private static final boolean CUT = true;
/**
 * Hidden constructor; the members of this class visible in this file are
 * all static, so no instance is needed.
 */
private PhylogenyDecorator() {
    // Intentionally empty.
}
75 public static void decorate( final Phylogeny phylogeny,
76 final Map<String, Map<String, String>> map,
78 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map )
79 throws IllegalArgumentException, PhyloXmlDataFormatException {
80 for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
81 final PhylogenyNode node = iter.next();
82 final String name = node.getName();
83 if ( !ForesterUtil.isEmpty( name ) ) {
84 if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) {
85 Map<String, String> new_values = map.get( name );
87 while ( ( new_values == null ) && ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 )
88 && ( x <= numbers_of_chars_allowed_to_remove_if_not_found_in_map ) ) {
89 new_values = map.get( name.substring( 0, name.length() - x ) );
92 if ( new_values != null ) {
93 if ( new_values.containsKey( TP_TAXONOMY_CODE ) ) {
94 ForesterUtil.ensurePresenceOfTaxonomy( node );
95 node.getNodeData().getTaxonomy().setTaxonomyCode( new_values.get( TP_TAXONOMY_CODE ) );
97 if ( new_values.containsKey( TP_TAXONOMY_ID )
98 && new_values.containsKey( TP_TAXONOMY_ID_PROVIDER ) ) {
99 ForesterUtil.ensurePresenceOfTaxonomy( node );
102 .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ),
103 new_values.get( TP_TAXONOMY_ID_PROVIDER ) ) );
105 else if ( new_values.containsKey( TP_TAXONOMY_ID ) ) {
106 ForesterUtil.ensurePresenceOfTaxonomy( node );
107 node.getNodeData().getTaxonomy()
108 .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ) ) );
110 if ( new_values.containsKey( TP_TAXONOMY_SN ) ) {
111 ForesterUtil.ensurePresenceOfTaxonomy( node );
112 node.getNodeData().getTaxonomy().setScientificName( new_values.get( TP_TAXONOMY_SN ) );
114 if ( new_values.containsKey( TP_TAXONOMY_CN ) ) {
115 ForesterUtil.ensurePresenceOfTaxonomy( node );
116 node.getNodeData().getTaxonomy().setCommonName( new_values.get( TP_TAXONOMY_CN ) );
118 if ( new_values.containsKey( TP_TAXONOMY_SYN ) ) {
119 ForesterUtil.ensurePresenceOfTaxonomy( node );
120 node.getNodeData().getTaxonomy().getSynonyms().add( new_values.get( TP_TAXONOMY_SYN ) );
122 if ( new_values.containsKey( TP_SEQ_ACCESSION )
123 && new_values.containsKey( TP_SEQ_ACCESSION_SOURCE ) ) {
124 ForesterUtil.ensurePresenceOfSequence( node );
127 .setAccession( new Accession( new_values.get( TP_SEQ_ACCESSION ),
128 new_values.get( TP_SEQ_ACCESSION_SOURCE ) ) );
130 if ( new_values.containsKey( TP_SEQ_ANNOTATION_DESC ) ) {
131 ForesterUtil.ensurePresenceOfSequence( node );
132 final Annotation ann = new Annotation();
133 ann.setDesc( new_values.get( TP_SEQ_ANNOTATION_DESC ) );
134 node.getNodeData().getSequence().addAnnotation( ann );
136 if ( new_values.containsKey( TP_SEQ_ANNOTATION_REF ) ) {
137 ForesterUtil.ensurePresenceOfSequence( node );
138 final Annotation ann = new Annotation( new_values.get( TP_SEQ_ANNOTATION_REF ) );
139 node.getNodeData().getSequence().addAnnotation( ann );
141 if ( new_values.containsKey( TP_SEQ_SYMBOL ) ) {
142 ForesterUtil.ensurePresenceOfSequence( node );
143 node.getNodeData().getSequence().setSymbol( new_values.get( TP_SEQ_SYMBOL ) );
145 if ( new_values.containsKey( TP_SEQ_NAME ) ) {
146 ForesterUtil.ensurePresenceOfSequence( node );
147 node.getNodeData().getSequence().setName( new_values.get( TP_SEQ_NAME ) );
149 if ( new_values.containsKey( TP_SEQ_MOL_SEQ ) ) {
150 ForesterUtil.ensurePresenceOfSequence( node );
151 node.getNodeData().getSequence().setMolecularSequence( new_values.get( TP_SEQ_MOL_SEQ ) );
153 if ( new_values.containsKey( TP_NODE_NAME ) ) {
154 node.setName( new_values.get( TP_NODE_NAME ) );
156 } // if ( new_values != null )
157 } // if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) )
159 throw new IllegalArgumentException( "\"" + name + "\" not found in name map" );
173 * maps names (in phylogeny) to new values
176 * @throws IllegalArgumentException
177 * @throws NHXFormatException
178 * @throws PhyloXmlDataFormatException
180 public static void decorate( final Phylogeny phylogeny,
181 final Map<String, String> map,
183 final boolean extract_bracketed_scientific_name,
184 final boolean extract_bracketed_tax_code,
186 final boolean cut_name_after_space,
187 final boolean process_name_intelligently,
188 final boolean process_similar_to,
189 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
190 final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
191 PhyloXmlDataFormatException {
192 PhylogenyDecorator.decorate( phylogeny,
195 extract_bracketed_scientific_name,
196 extract_bracketed_tax_code,
199 cut_name_after_space,
200 process_name_intelligently,
202 numbers_of_chars_allowed_to_remove_if_not_found_in_map,
212 * maps names (in phylogeny) to new values if intermediate_map is
213 * null otherwise maps intermediate value to new value
216 * @param intermediate_map
217 * maps name (in phylogeny) to a intermediate value
218 * @throws IllegalArgumentException
219 * @throws PhyloXmlDataFormatException
221 public static void decorate( final Phylogeny phylogeny,
222 final Map<String, String> map,
224 final boolean extract_bracketed_scientific_name,
225 final boolean extract_bracketed_tax_code,
227 final Map<String, String> intermediate_map,
228 final boolean cut_name_after_space,
229 final boolean process_name_intelligently,
230 final boolean process_similar_to,
231 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
232 final boolean trim_after_tilde ) throws IllegalArgumentException,
233 PhyloXmlDataFormatException {
234 if ( extract_bracketed_scientific_name && ( field == FIELD.TAXONOMY_SCIENTIFIC_NAME ) ) {
235 throw new IllegalArgumentException( "attempt to extract bracketed scientific name together with data field pointing to scientific name" );
237 for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
238 final PhylogenyNode node = iter.next();
239 String name = node.getName();
240 if ( trim_after_tilde && ( name.indexOf( '~' ) > 0 ) ) {
241 name = name.substring( 0, name.indexOf( '~' ) );
243 if ( !ForesterUtil.isEmpty( name ) ) {
244 if ( intermediate_map != null ) {
245 name = PhylogenyDecorator.extractIntermediate( intermediate_map, name );
247 // int space_index = name.indexOf( " " );
248 // if ( CUT && space_index > 0 ) {
249 // int y = name.lastIndexOf( "|" );
250 // name = name.substring( y + 1, space_index );
252 // String new_value = null;
253 // for( String key : map.keySet() ) {
254 // if ( key.indexOf( name ) >= 0 ) {
255 // if ( new_value == null ) {
256 // new_value = map.get( key );
259 // System.out.println( name + " is not unique" );
260 // System.exit( -1 );
264 // if ( new_value != null ) {
265 if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) {
266 String new_value = map.get( name );
268 while ( ( new_value == null ) && ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 )
269 && ( x <= numbers_of_chars_allowed_to_remove_if_not_found_in_map ) ) {
270 new_value = map.get( name.substring( 0, name.length() - x ) );
273 if ( new_value != null ) {
274 new_value = new_value.trim();
275 new_value.replaceAll( "/\\s+/", " " );
276 if ( extract_bracketed_scientific_name && new_value.endsWith( "]" ) ) {
277 new_value = extractBracketedScientificNames( node, new_value );
279 else if ( extract_bracketed_tax_code && new_value.endsWith( "]" ) ) {
280 new_value = extractBracketedTaxCodes( node, new_value );
283 case SEQUENCE_ANNOTATION_DESC:
284 if ( PhylogenyDecorator.VERBOSE ) {
285 System.out.println( name + ": " + new_value );
287 if ( !node.getNodeData().isHasSequence() ) {
288 node.getNodeData().setSequence( new Sequence() );
290 final Annotation annotation = new Annotation( "?" );
291 annotation.setDesc( new_value );
292 node.getNodeData().getSequence().addAnnotation( annotation );
294 case DOMAIN_STRUCTURE:
295 if ( PhylogenyDecorator.VERBOSE ) {
296 System.out.println( name + ": " + new_value );
298 if ( !node.getNodeData().isHasSequence() ) {
299 node.getNodeData().setSequence( new Sequence() );
301 node.getNodeData().getSequence()
302 .setDomainArchitecture( new DomainArchitecture( new_value ) );
305 if ( PhylogenyDecorator.VERBOSE ) {
306 System.out.println( name + ": " + new_value );
308 ForesterUtil.ensurePresenceOfTaxonomy( node );
309 node.getNodeData().getTaxonomy().setTaxonomyCode( new_value );
311 case TAXONOMY_SCIENTIFIC_NAME:
312 if ( PhylogenyDecorator.VERBOSE ) {
313 System.out.println( name + ": " + new_value );
315 ForesterUtil.ensurePresenceOfTaxonomy( node );
316 node.getNodeData().getTaxonomy().setScientificName( new_value );
319 if ( PhylogenyDecorator.VERBOSE ) {
320 System.out.println( name + ": " + new_value );
322 if ( !node.getNodeData().isHasSequence() ) {
323 node.getNodeData().setSequence( new Sequence() );
325 node.getNodeData().getSequence().setName( new_value );
328 if ( PhylogenyDecorator.VERBOSE ) {
329 System.out.print( name + " -> " );
331 if ( cut_name_after_space ) {
332 if ( PhylogenyDecorator.VERBOSE ) {
333 System.out.print( new_value + " -> " );
335 new_value = PhylogenyDecorator.deleteAtFirstSpace( new_value );
337 else if ( process_name_intelligently ) {
338 if ( PhylogenyDecorator.VERBOSE ) {
339 System.out.print( new_value + " -> " );
341 new_value = PhylogenyDecorator.processNameIntelligently( new_value );
343 else if ( process_similar_to ) {
344 if ( PhylogenyDecorator.VERBOSE ) {
345 System.out.print( new_value + " -> " );
347 new_value = PhylogenyDecorator.processSimilarTo( new_value );
349 if ( PhylogenyDecorator.SANITIZE ) {
350 new_value = PhylogenyDecorator.sanitize( new_value );
352 if ( PhylogenyDecorator.VERBOSE ) {
353 System.out.println( new_value );
355 node.setName( new_value );
358 throw new RuntimeException( "unknown field \"" + field + "\"" );
363 throw new IllegalArgumentException( "\"" + name + "\" not found in name map" );
369 public static void decorate( final Phylogeny[] phylogenies,
370 final Map<String, Map<String, String>> map,
372 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map )
373 throws IllegalArgumentException, NHXFormatException, PhyloXmlDataFormatException {
374 for( final Phylogeny phylogenie : phylogenies ) {
376 .decorate( phylogenie, map, picky, numbers_of_chars_allowed_to_remove_if_not_found_in_map );
380 public static void decorate( final Phylogeny[] phylogenies,
381 final Map<String, String> map,
383 final boolean extract_bracketed_scientific_name,
384 final boolean extract_bracketed_tax_code,
386 final boolean cut_name_after_space,
387 final boolean process_name_intelligently,
388 final boolean process_similar_to,
389 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
390 final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
391 PhyloXmlDataFormatException {
392 for( final Phylogeny phylogenie : phylogenies ) {
393 PhylogenyDecorator.decorate( phylogenie,
396 extract_bracketed_scientific_name,
397 extract_bracketed_tax_code,
399 cut_name_after_space,
400 process_name_intelligently,
402 numbers_of_chars_allowed_to_remove_if_not_found_in_map,
407 public static void decorate( final Phylogeny[] phylogenies,
408 final Map<String, String> map,
410 final boolean extract_bracketed_scientific_name,
411 final boolean extract_bracketed_tax_code,
413 final Map<String, String> intermediate_map,
414 final boolean cut_name_after_space,
415 final boolean process_name_intelligently,
416 final boolean process_similar_to,
417 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
418 final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
419 PhyloXmlDataFormatException {
420 for( final Phylogeny phylogenie : phylogenies ) {
421 PhylogenyDecorator.decorate( phylogenie,
424 extract_bracketed_scientific_name,
425 extract_bracketed_tax_code,
428 cut_name_after_space,
429 process_name_intelligently,
431 numbers_of_chars_allowed_to_remove_if_not_found_in_map,
436 private static String deleteAtFirstSpace( final String name ) {
437 final int first_space = name.indexOf( " " );
438 if ( first_space > 1 ) {
439 return name.substring( 0, first_space ).trim();
444 private static String extractBracketedScientificNames( final PhylogenyNode node, final String new_value ) {
445 final int i = new_value.lastIndexOf( "[" );
446 final String scientific_name = new_value.substring( i + 1, new_value.length() - 1 );
447 ForesterUtil.ensurePresenceOfTaxonomy( node );
448 node.getNodeData().getTaxonomy().setScientificName( scientific_name );
449 return new_value.substring( 0, i - 1 ).trim();
452 private static String extractBracketedTaxCodes( final PhylogenyNode node, final String new_value ) {
453 final int i = new_value.lastIndexOf( "[" );
454 final String tc = new_value.substring( i + 1, new_value.length() - 1 );
455 ForesterUtil.ensurePresenceOfTaxonomy( node );
457 node.getNodeData().getTaxonomy().setTaxonomyCode( tc );
459 catch ( final PhyloXmlDataFormatException e ) {
460 throw new IllegalArgumentException( "illegal format for taxonomy code: " + tc );
462 return new_value.substring( 0, i - 1 ).trim();
465 private static String extractIntermediate( final Map<String, String> intermediate_map, final String name ) {
466 String new_name = null;
467 if ( PhylogenyDecorator.VERBOSE ) {
468 System.out.print( name + " => " );
470 if ( intermediate_map.containsKey( name ) ) {
471 new_name = intermediate_map.get( name );
472 if ( ForesterUtil.isEmpty( new_name ) ) {
473 throw new IllegalArgumentException( "\"" + name + "\" maps to null or empty string in secondary map" );
477 throw new IllegalArgumentException( "\"" + name + "\" not found in name secondary map" );
479 if ( PhylogenyDecorator.VERBOSE ) {
480 System.out.println( new_name + " " );
485 public static Map<String, Map<String, String>> parseMappingTable( final File mapping_table_file )
487 final Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
488 BasicTable<String> mapping_table = null;
489 mapping_table = BasicTableParser.parse( mapping_table_file, "\t", false, false );
490 for( int row = 0; row < mapping_table.getNumberOfRows(); ++row ) {
491 final Map<String, String> row_map = new HashMap<String, String>();
493 for( int col = 0; col < mapping_table.getNumberOfColumns(); ++col ) {
494 final String table_cell = mapping_table.getValue( col, row );
498 else if ( table_cell != null ) {
499 final String key = table_cell.substring( 0, table_cell.indexOf( ':' ) );
500 final String val = table_cell.substring( table_cell.indexOf( ':' ) + 1, table_cell.length() );
501 row_map.put( key, val );
504 map.put( name, row_map );
509 private static String processNameIntelligently( final String name ) {
510 final String[] s = name.split( " " );
511 if ( s.length < 2 ) {
514 else if ( ( s[ 0 ].indexOf( "_" ) > 0 ) && ( s[ 0 ].indexOf( "|" ) > 0 ) ) {
517 else if ( ( s[ 1 ].indexOf( "_" ) > 0 ) && ( s[ 1 ].indexOf( "|" ) > 0 ) ) {
520 else if ( ( s[ 0 ].indexOf( "_" ) > 0 ) && ( s[ 0 ].indexOf( "." ) > 0 ) ) {
523 else if ( ( s[ 1 ].indexOf( "_" ) > 0 ) && ( s[ 1 ].indexOf( "." ) > 0 ) ) {
526 else if ( s[ 0 ].indexOf( "_" ) > 0 ) {
529 else if ( s[ 1 ].indexOf( "_" ) > 0 ) {
537 private static String processSimilarTo( final String name ) {
538 final int i = name.toLowerCase().indexOf( "similar to" );
539 String similar_to = "";
541 similar_to = " similarity=" + name.substring( i + 10 ).trim();
543 final String pi = processNameIntelligently( name );
544 return pi + similar_to;
547 private static String sanitize( String s ) {
548 s = s.replace( ' ', '_' );
549 s = s.replace( '(', '{' );
550 s = s.replace( ')', '}' );
551 s = s.replace( '[', '{' );
552 s = s.replace( ']', '}' );
553 s = s.replace( ',', '_' );
/**
 * The node data fields a value can be written to by the single-value
 * <code>decorate</code> overloads.
 */
public enum FIELD {
    NODE_NAME,
    SEQUENCE_ANNOTATION_DESC,
    DOMAIN_STRUCTURE,
    TAXONOMY_CODE,
    TAXONOMY_SCIENTIFIC_NAME,
    SEQUENCE_NAME
}