2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
26 package org.forester.tools;
29 import java.io.IOException;
30 import java.util.HashMap;
32 import java.util.regex.Matcher;
33 import java.util.regex.Pattern;
35 import org.forester.archaeopteryx.AptxUtil;
36 import org.forester.io.parsers.nhx.NHXFormatException;
37 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
38 import org.forester.phylogeny.Phylogeny;
39 import org.forester.phylogeny.PhylogenyNode;
40 import org.forester.phylogeny.data.Accession;
41 import org.forester.phylogeny.data.Annotation;
42 import org.forester.phylogeny.data.DomainArchitecture;
43 import org.forester.phylogeny.data.Identifier;
44 import org.forester.phylogeny.data.Sequence;
45 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
46 import org.forester.util.BasicTable;
47 import org.forester.util.BasicTableParser;
48 import org.forester.util.ForesterUtil;
// Static helpers that copy values taken from name-keyed maps onto the nodes of a Phylogeny
// (taxonomy data, sequence data, node names).
50 public final class PhylogenyDecorator {
52 // From evoruby/lib/evo/apps/tseq_taxonomy_processor.rb:
// Keys that decorate( Phylogeny, Map<String, Map<String, String>>, ... ) looks up in the
// per-node value map; each key that is present is written to the matching node field.
53 final private static String TP_TAXONOMY_CODE = "TAXONOMY_CODE";
54 final private static String TP_TAXONOMY_ID = "TAXONOMY_ID";
55 final private static String TP_TAXONOMY_ID_PROVIDER = "TAXONOMY_ID_PROVIDER";
56 final private static String TP_TAXONOMY_SN = "TAXONOMY_SN";
57 final private static String TP_TAXONOMY_CN = "TAXONOMY_CN";
58 final private static String TP_TAXONOMY_SYN = "TAXONOMY_SYN";
59 final private static String TP_SEQ_SYMBOL = "SEQ_SYMBOL";
60 final private static String TP_SEQ_ACCESSION = "SEQ_ACCESSION";
61 final private static String TP_SEQ_ACCESSION_SOURCE = "SEQ_ACCESSION_SOURCE";
62 final private static String TP_SEQ_ANNOTATION_DESC = "SEQ_ANNOTATION_DESC";
63 final private static String TP_SEQ_ANNOTATION_REF = "SEQ_ANNOTATION_REF";
64 final private static String TP_SEQ_MOL_SEQ = "SEQ_MOL_SEQ";
65 final private static String TP_SEQ_NAME = "SEQ_NAME";
66 final private static String TP_NODE_NAME = "NODE_NAME";
// Node-name shape used by moveDomainNumbersAtEnd(): group 1 is 1-5 characters, then '_',
// group 2 is 2-4 characters of [A-Z0-9] followed by one [A-Z], group 3 is 1-4 digits.
// NOTE(review): group 1 uses [a-fA-Z0-9], i.e. lowercase letters are limited to a-f --
// confirm this is intended and not a typo for [a-zA-Z0-9].
67 final private static Pattern NODENAME_SEQNUMBER_TAXDOMAINNUMBER = Pattern
68 .compile( "^([a-fA-Z0-9]{1,5})_([A-Z0-9]{2,4}[A-Z])(\\d{1,4})$" );
// When true, new node-name values are passed through sanitize() before being set.
69 public final static boolean SANITIZE = false;
// When true, the decorate methods and extractIntermediate() print their progress to System.out.
70 public final static boolean VERBOSE = true;
// In the visible part of this file CUT is referenced only from commented-out code.
71 private static final boolean CUT = true;
// Private constructor; every member visible in this listing is static.
73 private PhylogenyDecorator() {
// Decorates every named node of one phylogeny with the values found for it in map.
// Nodes are visited in postorder; map is keyed by node name and each entry maps TP_* keys to
// the value to store on the node. Only keys that are present are applied.
// Several lines of this method (among them a parameter line and the declaration/increment of
// the counter x) are not part of this listing; comments below describe the visible lines only.
77 public static void decorate( final Phylogeny phylogeny,
78 final Map<String, Map<String, String>> map,
80 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map )
81 throws IllegalArgumentException, PhyloXmlDataFormatException {
82 for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
83 final PhylogenyNode node = iter.next();
84 final String name = node.getName();
// Nodes whose name is empty according to ForesterUtil.isEmpty() are left untouched.
85 if ( !ForesterUtil.isEmpty( name ) ) {
86 if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) {
87 Map<String, String> new_values = map.get( name );
// Fallback lookup: while nothing has been found, retry with the last x characters of the
// name removed, for x up to numbers_of_chars_allowed_to_remove_if_not_found_in_map.
// NOTE(review): name.substring( 0, name.length() - x ) throws StringIndexOutOfBoundsException
// once x exceeds name.length(); how x advances is not visible here -- confirm short names are safe.
89 while ( ( new_values == null ) && ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 )
90 && ( x <= numbers_of_chars_allowed_to_remove_if_not_found_in_map ) ) {
91 new_values = map.get( name.substring( 0, name.length() - x ) );
94 if ( new_values != null ) {
// Taxonomy fields: a Taxonomy object is created on demand before each write.
95 if ( new_values.containsKey( TP_TAXONOMY_CODE ) ) {
96 AptxUtil.ensurePresenceOfTaxonomy( node );
97 node.getNodeData().getTaxonomy().setTaxonomyCode( new_values.get( TP_TAXONOMY_CODE ) );
// Taxonomy identifier: built with a provider when TP_TAXONOMY_ID_PROVIDER is also present,
// otherwise from the id value alone.
99 if ( new_values.containsKey( TP_TAXONOMY_ID )
100 && new_values.containsKey( TP_TAXONOMY_ID_PROVIDER ) ) {
101 AptxUtil.ensurePresenceOfTaxonomy( node );
104 .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ),
105 new_values.get( TP_TAXONOMY_ID_PROVIDER ) ) );
107 else if ( new_values.containsKey( TP_TAXONOMY_ID ) ) {
108 AptxUtil.ensurePresenceOfTaxonomy( node );
109 node.getNodeData().getTaxonomy()
110 .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ) ) );
112 if ( new_values.containsKey( TP_TAXONOMY_SN ) ) {
113 AptxUtil.ensurePresenceOfTaxonomy( node );
114 node.getNodeData().getTaxonomy().setScientificName( new_values.get( TP_TAXONOMY_SN ) );
116 if ( new_values.containsKey( TP_TAXONOMY_CN ) ) {
117 AptxUtil.ensurePresenceOfTaxonomy( node );
118 node.getNodeData().getTaxonomy().setCommonName( new_values.get( TP_TAXONOMY_CN ) );
// The synonym is appended to the existing synonym list, not substituted for it.
120 if ( new_values.containsKey( TP_TAXONOMY_SYN ) ) {
121 AptxUtil.ensurePresenceOfTaxonomy( node );
122 node.getNodeData().getTaxonomy().getSynonyms().add( new_values.get( TP_TAXONOMY_SYN ) );
// Sequence fields: the accession is only set when both the value and its source are present.
124 if ( new_values.containsKey( TP_SEQ_ACCESSION )
125 && new_values.containsKey( TP_SEQ_ACCESSION_SOURCE ) ) {
126 AptxUtil.ensurePresenceOfSequence( node );
129 .setAccession( new Accession( new_values.get( TP_SEQ_ACCESSION ),
130 new_values.get( TP_SEQ_ACCESSION_SOURCE ) ) );
// A description is attached as a new Annotation constructed with "?" as its argument.
132 if ( new_values.containsKey( TP_SEQ_ANNOTATION_DESC ) ) {
133 AptxUtil.ensurePresenceOfSequence( node );
134 final Annotation ann = new Annotation( "?" );
135 ann.setDesc( new_values.get( TP_SEQ_ANNOTATION_DESC ) );
136 node.getNodeData().getSequence().addAnnotation( ann );
// A reference is attached as a separate Annotation constructed from the reference value.
138 if ( new_values.containsKey( TP_SEQ_ANNOTATION_REF ) ) {
139 AptxUtil.ensurePresenceOfSequence( node );
140 final Annotation ann = new Annotation( new_values.get( TP_SEQ_ANNOTATION_REF ) );
141 node.getNodeData().getSequence().addAnnotation( ann );
143 if ( new_values.containsKey( TP_SEQ_SYMBOL ) ) {
144 AptxUtil.ensurePresenceOfSequence( node );
145 node.getNodeData().getSequence().setSymbol( new_values.get( TP_SEQ_SYMBOL ) );
147 if ( new_values.containsKey( TP_SEQ_NAME ) ) {
148 AptxUtil.ensurePresenceOfSequence( node );
149 node.getNodeData().getSequence().setName( new_values.get( TP_SEQ_NAME ) );
151 if ( new_values.containsKey( TP_SEQ_MOL_SEQ ) ) {
152 AptxUtil.ensurePresenceOfSequence( node );
153 node.getNodeData().getSequence().setMolecularSequence( new_values.get( TP_SEQ_MOL_SEQ ) );
// The node name is replaced last, after the lookups above have used the old name.
155 if ( new_values.containsKey( TP_NODE_NAME ) ) {
156 node.setName( new_values.get( TP_NODE_NAME ) );
158 } // if ( new_values != null )
159 } // if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) )
// The condition guarding this throw is not visible in this listing.
161 throw new IllegalArgumentException( "\"" + name + "\" not found in name map" );
// Overload that forwards its arguments to another PhylogenyDecorator.decorate( ... ) overload.
// Some parameter and argument lines are not part of this listing.
// NOTE(review): presumably the target is the overload taking intermediate_map, called with
// no intermediate map -- the argument line is not visible here, confirm.
175 * maps names (in phylogeny) to new values
178 * @throws IllegalArgumentException
179 * @throws NHXFormatException
180 * @throws PhyloXmlDataFormatException
182 public static void decorate( final Phylogeny phylogeny,
183 final Map<String, String> map,
185 final boolean extract_bracketed_scientific_name,
187 final boolean cut_name_after_space,
188 final boolean process_name_intelligently,
189 final boolean process_similar_to,
190 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
191 final boolean move_domain_numbers_at_end_to_middle,
192 final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
193 PhyloXmlDataFormatException {
194 PhylogenyDecorator.decorate( phylogeny,
197 extract_bracketed_scientific_name,
200 cut_name_after_space,
201 process_name_intelligently,
203 numbers_of_chars_allowed_to_remove_if_not_found_in_map,
204 move_domain_numbers_at_end_to_middle,
// Writes one value per named node of a phylogeny into the node field selected by `field`.
// Nodes are visited in postorder. The node name (optionally cut at '~', optionally translated
// through intermediate_map) is looked up in map; the value found is trimmed, has runs of
// whitespace collapsed to one space, optionally has a trailing "[scientific name]" split off,
// and is then stored according to `field`.
// Several lines of this method (parameter lines, the switch header, some case labels, the
// declaration/increment of x, closing braces) are not part of this listing; they are left
// as they are. Two visible lines were corrected, see the comments at those lines.
214 * maps names (in phylogeny) to new values if intermediate_map is
215 * null otherwise maps intermediate value to new value
218 * @param intermediate_map
219 * maps name (in phylogeny) to a intermediate value
220 * @throws IllegalArgumentException
221 * @throws PhyloXmlDataFormatException
223 public static void decorate( final Phylogeny phylogeny,
224 final Map<String, String> map,
226 final boolean extract_bracketed_scientific_name,
228 final Map<String, String> intermediate_map,
229 final boolean cut_name_after_space,
230 final boolean process_name_intelligently,
231 final boolean process_similar_to,
232 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
233 final boolean move_domain_numbers_at_end_to_middle,
234 final boolean trim_after_tilde ) throws IllegalArgumentException,
235 PhyloXmlDataFormatException {
// Extracting a bracketed scientific name and writing the scientific-name field would both
// set the same node field, so the combination is rejected up front.
236 if ( extract_bracketed_scientific_name && ( field == FIELD.TAXONOMY_SCIENTIFIC_NAME ) ) {
237 throw new IllegalArgumentException( "attempt to extract bracketed scientific name together with data field pointing to scientific name" );
239 for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
240 final PhylogenyNode node = iter.next();
241 String name = node.getName();
// Fix: the null check now precedes the dereference; the empty-name test only comes on the
// next visible line, so a null name used to fail here when trim_after_tilde was set.
// A '~' at index 0 is deliberately not cut (indexOf > 0), as before.
242 if ( trim_after_tilde && ( name != null ) && ( name.indexOf( '~' ) > 0 ) ) {
243 name = name.substring( 0, name.indexOf( '~' ) );
245 if ( !ForesterUtil.isEmpty( name ) ) {
// With an intermediate map the node name is first translated; extractIntermediate()
// throws IllegalArgumentException when the name is unknown or maps to an empty value.
246 if ( intermediate_map != null ) {
247 name = PhylogenyDecorator.extractIntermediate( intermediate_map, name );
249 // int space_index = name.indexOf( " " );
250 // if ( CUT && space_index > 0 ) {
251 // int y = name.lastIndexOf( "|" );
252 // name = name.substring( y + 1, space_index );
254 // String new_value = null;
255 // for( String key : map.keySet() ) {
256 // if ( key.indexOf( name ) >= 0 ) {
257 // if ( new_value == null ) {
258 // new_value = map.get( key );
261 // System.out.println( name + " is not unique" );
262 // System.exit( -1 );
266 // if ( new_value != null ) {
267 if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) {
268 String new_value = map.get( name );
// Fallback lookup: retry with the last x characters of the name removed, for x up to
// numbers_of_chars_allowed_to_remove_if_not_found_in_map.
270 while ( ( new_value == null ) && ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 )
271 && ( x <= numbers_of_chars_allowed_to_remove_if_not_found_in_map ) ) {
272 new_value = map.get( name.substring( 0, name.length() - x ) );
275 if ( new_value != null ) {
276 new_value = new_value.trim();
// Fix: String.replaceAll() returns a new string, so the result must be assigned; the
// pattern is the Java regex "\\s+" (the former "/\\s+/" only matched whitespace enclosed
// in literal slashes). Runs of whitespace are now really collapsed to one space.
277 new_value = new_value.replaceAll( "\\s+", " " );
278 if ( extract_bracketed_scientific_name && new_value.endsWith( "]" ) ) {
279 new_value = extractBracketedScientificNames( node, new_value );
// Dispatch on `field`; the switch header and some case labels are not in this listing.
282 case SEQUENCE_ANNOTATION_DESC:
283 if ( PhylogenyDecorator.VERBOSE ) {
284 System.out.println( name + ": " + new_value );
286 if ( !node.getNodeData().isHasSequence() ) {
287 node.getNodeData().setSequence( new Sequence() );
289 final Annotation annotation = new Annotation( "?" );
290 annotation.setDesc( new_value );
291 node.getNodeData().getSequence().addAnnotation( annotation );
293 case DOMAIN_STRUCTURE:
294 if ( PhylogenyDecorator.VERBOSE ) {
295 System.out.println( name + ": " + new_value );
297 if ( !node.getNodeData().isHasSequence() ) {
298 node.getNodeData().setSequence( new Sequence() );
300 node.getNodeData().getSequence()
301 .setDomainArchitecture( new DomainArchitecture( new_value ) );
// Branch that stores the value as taxonomy code (its case label is not visible here).
304 if ( PhylogenyDecorator.VERBOSE ) {
305 System.out.println( name + ": " + new_value );
307 AptxUtil.ensurePresenceOfTaxonomy( node );
308 node.getNodeData().getTaxonomy().setTaxonomyCode( new_value );
310 case TAXONOMY_SCIENTIFIC_NAME:
311 if ( PhylogenyDecorator.VERBOSE ) {
312 System.out.println( name + ": " + new_value );
314 AptxUtil.ensurePresenceOfTaxonomy( node );
315 node.getNodeData().getTaxonomy().setScientificName( new_value );
// Branch that stores the value as sequence name (its case label is not visible here).
318 if ( PhylogenyDecorator.VERBOSE ) {
319 System.out.println( name + ": " + new_value );
321 if ( !node.getNodeData().isHasSequence() ) {
322 node.getNodeData().setSequence( new Sequence() );
324 node.getNodeData().getSequence().setName( new_value );
// Branch that replaces the node name (its case label is not visible here). At most one of
// the three name transformations is applied, in the priority order of this else-if chain.
327 if ( PhylogenyDecorator.VERBOSE ) {
328 System.out.print( name + " -> " );
330 if ( cut_name_after_space ) {
331 if ( PhylogenyDecorator.VERBOSE ) {
332 System.out.print( new_value + " -> " );
334 new_value = PhylogenyDecorator.deleteAtFirstSpace( new_value );
336 else if ( process_name_intelligently ) {
337 if ( PhylogenyDecorator.VERBOSE ) {
338 System.out.print( new_value + " -> " );
340 new_value = PhylogenyDecorator.processNameIntelligently( new_value );
342 else if ( process_similar_to ) {
343 if ( PhylogenyDecorator.VERBOSE ) {
344 System.out.print( new_value + " -> " );
346 new_value = PhylogenyDecorator.processSimilarTo( new_value );
348 if ( PhylogenyDecorator.SANITIZE ) {
349 new_value = PhylogenyDecorator.sanitize( new_value );
351 if ( PhylogenyDecorator.VERBOSE ) {
352 System.out.println( new_value );
354 node.setName( new_value );
357 throw new RuntimeException( "unknown field \"" + field + "\"" );
// Domain numbers are only moved when this call did not itself replace the node name.
359 if ( move_domain_numbers_at_end_to_middle && ( field != FIELD.NODE_NAME ) ) {
360 node.setName( moveDomainNumbersAtEnd( node.getName() ) );
// The condition guarding this throw is not visible in this listing.
365 throw new IllegalArgumentException( "\"" + name + "\" not found in name map" );
// Array convenience overload: calls PhylogenyDecorator.decorate( ... ) once per element of
// phylogenies, in array order, passing the same map and character limit each time.
// A parameter line and some argument lines are not part of this listing.
371 public static void decorate( final Phylogeny[] phylogenies,
372 final Map<String, Map<String, String>> map,
374 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map )
375 throws IllegalArgumentException, NHXFormatException, PhyloXmlDataFormatException {
376 for( int i = 0; i < phylogenies.length; ++i ) {
377 PhylogenyDecorator.decorate( phylogenies[ i ],
380 numbers_of_chars_allowed_to_remove_if_not_found_in_map );
// Array convenience overload (no intermediate_map parameter): calls
// PhylogenyDecorator.decorate( ... ) once per element of phylogenies, in array order,
// forwarding the same options each time.
// Some parameter and argument lines are not part of this listing.
384 public static void decorate( final Phylogeny[] phylogenies,
385 final Map<String, String> map,
387 final boolean extract_bracketed_scientific_name,
389 final boolean cut_name_after_space,
390 final boolean process_name_intelligently,
391 final boolean process_similar_to,
392 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
393 final boolean move_domain_numbers_at_end_to_middle,
394 final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
395 PhyloXmlDataFormatException {
396 for( int i = 0; i < phylogenies.length; ++i ) {
397 PhylogenyDecorator.decorate( phylogenies[ i ],
400 extract_bracketed_scientific_name,
402 cut_name_after_space,
403 process_name_intelligently,
405 numbers_of_chars_allowed_to_remove_if_not_found_in_map,
406 move_domain_numbers_at_end_to_middle,
// Array convenience overload with an intermediate_map parameter: calls
// PhylogenyDecorator.decorate( ... ) once per element of phylogenies, in array order,
// forwarding the same options each time.
// Some parameter and argument lines are not part of this listing.
411 public static void decorate( final Phylogeny[] phylogenies,
412 final Map<String, String> map,
414 final boolean extract_bracketed_scientific_name,
416 final Map<String, String> intermediate_map,
417 final boolean cut_name_after_space,
418 final boolean process_name_intelligently,
419 final boolean process_similar_to,
420 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
421 final boolean move_domain_numbers_at_end_to_middle,
422 final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
423 PhyloXmlDataFormatException {
424 for( int i = 0; i < phylogenies.length; ++i ) {
425 PhylogenyDecorator.decorate( phylogenies[ i ],
428 extract_bracketed_scientific_name,
431 cut_name_after_space,
432 process_name_intelligently,
434 numbers_of_chars_allowed_to_remove_if_not_found_in_map,
435 move_domain_numbers_at_end_to_middle,
// Cuts name at its first space: when that space is at index 2 or later, the text before it
// is returned (trimmed). What is returned otherwise is not visible in this listing.
// NOTE(review): the threshold is "> 1", so a name whose first space is at index 1 (a
// one-character first word) is not cut -- confirm this is intended rather than "> 0".
440 private static String deleteAtFirstSpace( final String name ) {
441 final int first_space = name.indexOf( " " );
442 if ( first_space > 1 ) {
443 return name.substring( 0, first_space ).trim();
// Splits a value of the form "<text> [<scientific name>]".
// The text between the last '[' and the final character of new_value is stored as the
// scientific name of node's taxonomy (a taxonomy is created on demand); the text in front of
// that '[' is returned, trimmed. The only visible caller invokes this when new_value ends
// with "]", so the final character dropped is that bracket.
// When new_value contains no '[', nothing is written to node and new_value is returned as is.
448 private static String extractBracketedScientificNames( final PhylogenyNode node, final String new_value ) {
449 final int i = new_value.lastIndexOf( "[" );
// Fix: without an opening bracket there is nothing to extract. Formerly the whole value minus
// its last character was stored as scientific name and the return statement then threw
// StringIndexOutOfBoundsException.
if ( i < 0 ) { return new_value; }
450 final String scientific_name = new_value.substring( i + 1, new_value.length() - 1 );
451 AptxUtil.ensurePresenceOfTaxonomy( node );
452 node.getNodeData().getTaxonomy().setScientificName( scientific_name );
// Fix: cut exactly at the bracket and let trim() remove a separating space. The former
// "i - 1" threw for a value starting with '[' and silently dropped a real character when no
// space preceded the bracket; for the usual "text [name]" form the result is unchanged.
453 return new_value.substring( 0, i ).trim();
// Translates name through intermediate_map.
// When VERBOSE is set, prints "<name> => " followed by the mapped value to System.out.
// Throws IllegalArgumentException when the mapped value is empty according to
// ForesterUtil.isEmpty(), and with the "not found" message below; the branch structure around
// the second throw and the return statement are not part of this listing (presumably the
// throw is the else branch of containsKey and new_name is returned -- confirm).
456 private static String extractIntermediate( final Map<String, String> intermediate_map, final String name ) {
457 String new_name = null;
458 if ( PhylogenyDecorator.VERBOSE ) {
459 System.out.print( name + " => " );
461 if ( intermediate_map.containsKey( name ) ) {
462 new_name = intermediate_map.get( name );
463 if ( ForesterUtil.isEmpty( new_name ) ) {
464 throw new IllegalArgumentException( "\"" + name + "\" maps to null or empty string in secondary map" );
468 throw new IllegalArgumentException( "\"" + name + "\" not found in name secondary map" );
470 if ( PhylogenyDecorator.VERBOSE ) {
471 System.out.println( new_name + " " );
// Rearranges a node name that matches NODENAME_SEQNUMBER_TAXDOMAINNUMBER, i.e.
// "<group1>_<group2><group3>", into "<group1>_[<group3>]_<group2>": the trailing digits are
// moved between the first part and the second part and wrapped in square brackets.
// NOTE(review): Matcher.group() throws IllegalStateException unless a match has been attempted
// successfully; the line that performs the match (and whatever is returned for non-matching
// names) is not part of this listing -- presumably m.matches() is tested there, confirm.
476 private static String moveDomainNumbersAtEnd( final String node_name ) {
477 final Matcher m = NODENAME_SEQNUMBER_TAXDOMAINNUMBER.matcher( node_name );
479 final String seq_number = m.group( 1 );
480 final String tax = m.group( 2 );
481 final String domain_number = m.group( 3 );
482 return seq_number + "_[" + domain_number + "]_" + tax;
// Reads a tab-separated mapping table and turns it into a map from name to a per-name
// key/value map, the input shape taken by decorate( Phylogeny, Map<String, Map<String, String>>, ... ).
// One row_map is built per table row and stored under `name`; the lines that declare and
// assign `name`, the throws clause and the return statement are not part of this listing
// (presumably name is taken from the first column -- confirm).
489 public static Map<String, Map<String, String>> parseMappingTable( final File mapping_table_file )
491 final Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
492 BasicTable<String> mapping_table = null;
493 mapping_table = BasicTableParser.parse( mapping_table_file, "\t", false, false );
494 for( int row = 0; row < mapping_table.getNumberOfRows(); ++row ) {
495 final Map<String, String> row_map = new HashMap<String, String>();
497 for( int col = 0; col < mapping_table.getNumberOfColumns(); ++col ) {
// Note the argument order: column first, then row.
498 final String table_cell = mapping_table.getValue( col, row );
// A non-null cell handled here is split at its first ':' into key (before) and value (after).
// NOTE(review): a cell without ':' makes indexOf return -1, so substring( 0, -1 ) throws
// StringIndexOutOfBoundsException -- confirm the input format guarantees a ':' in these cells.
502 else if ( table_cell != null ) {
503 final String key = table_cell.substring( 0, table_cell.indexOf( ':' ) );
504 final String val = table_cell.substring( table_cell.indexOf( ':' ) + 1, table_cell.length() );
505 row_map.put( key, val );
// A later row with the same name replaces the earlier entry (Map.put semantics).
508 map.put( name, row_map );
// Chooses a result based on the first two space-separated tokens of name.
// The tests run in this order: fewer than two tokens; token 0 containing '_' and '|';
// token 1 containing '_' and '|'; token 0 containing '_' and '.'; token 1 containing '_' and
// '.'; token 0 containing '_'; token 1 containing '_'. Every indexOf test uses "> 0", so a
// character at index 0 of the token does not count.
// The value returned by each branch is not part of this listing.
513 private static String processNameIntelligently( final String name ) {
// Splits on single spaces; consecutive spaces therefore produce empty tokens.
514 final String[] s = name.split( " " );
515 if ( s.length < 2 ) {
518 else if ( ( s[ 0 ].indexOf( "_" ) > 0 ) && ( s[ 0 ].indexOf( "|" ) > 0 ) ) {
521 else if ( ( s[ 1 ].indexOf( "_" ) > 0 ) && ( s[ 1 ].indexOf( "|" ) > 0 ) ) {
524 else if ( ( s[ 0 ].indexOf( "_" ) > 0 ) && ( s[ 0 ].indexOf( "." ) > 0 ) ) {
527 else if ( ( s[ 1 ].indexOf( "_" ) > 0 ) && ( s[ 1 ].indexOf( "." ) > 0 ) ) {
530 else if ( s[ 0 ].indexOf( "_" ) > 0 ) {
533 else if ( s[ 1 ].indexOf( "_" ) > 0 ) {
// Returns processNameIntelligently( name ) followed by a " similarity=<text>" suffix, where
// <text> is the trimmed remainder of name after the phrase "similar to" (found
// case-insensitively). The suffix stays empty unless the assignment below runs; the condition
// guarding that assignment is not part of this listing.
541 private static String processSimilarTo( final String name ) {
542 final int i = name.toLowerCase().indexOf( "similar to" );
543 String similar_to = "";
// 10 is the length of "similar to", so the substring starts right after the phrase.
545 similar_to = " similarity=" + name.substring( i + 10 ).trim();
547 final String pi = processNameIntelligently( name );
548 return pi + similar_to;
// Replaces characters in s one by one: ' ' and ',' become '_', '(' and '[' become '{',
// ')' and ']' become '}'.
// The return statement is not part of this listing (presumably the rewritten s is returned).
551 private static String sanitize( String s ) {
552 s = s.replace( ' ', '_' );
553 s = s.replace( '(', '{' );
554 s = s.replace( ')', '}' );
555 s = s.replace( '[', '{' );
556 s = s.replace( ']', '}' );
557 s = s.replace( ',', '_' );
// Selects which node field the single-value decorate( ... ) overloads write to:
// the node name, a sequence annotation description, the sequence's domain architecture,
// the taxonomy code, the taxonomy scientific name, or the sequence name.
561 public static enum FIELD {
562 NODE_NAME, SEQUENCE_ANNOTATION_DESC, DOMAIN_STRUCTURE, TAXONOMY_CODE, TAXONOMY_SCIENTIFIC_NAME, SEQUENCE_NAME;