2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
26 package org.forester.tools;
29 import java.io.IOException;
30 import java.util.HashMap;
32 import java.util.regex.Matcher;
33 import java.util.regex.Pattern;
35 import org.forester.io.parsers.nhx.NHXFormatException;
36 import org.forester.phylogeny.Phylogeny;
37 import org.forester.phylogeny.PhylogenyNode;
38 import org.forester.phylogeny.data.Accession;
39 import org.forester.phylogeny.data.Annotation;
40 import org.forester.phylogeny.data.DomainArchitecture;
41 import org.forester.phylogeny.data.Identifier;
42 import org.forester.phylogeny.data.Sequence;
43 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
44 import org.forester.util.BasicTable;
45 import org.forester.util.BasicTableParser;
46 import org.forester.util.ForesterUtil;
48 public final class PhylogenyDecorator {
// Keys that decorate( Phylogeny, Map<String, Map<String, String>>, ... ) looks for in the
// per-node value maps. Each key, when present, is written to the node's taxonomy,
// sequence, or name.
50 // From evoruby/lib/evo/apps/tseq_taxonomy_processor.rb:
51 final private static String TP_TAXONOMY_CODE = "TAXONOMY_CODE";
52 final private static String TP_TAXONOMY_ID = "TAXONOMY_ID";
53 final private static String TP_TAXONOMY_ID_PROVIDER = "TAXONOMY_ID_PROVIDER";
54 final private static String TP_TAXONOMY_SN = "TAXONOMY_SN";
55 final private static String TP_TAXONOMY_CN = "TAXONOMY_CN";
56 final private static String TP_TAXONOMY_SYN = "TAXONOMY_SYN";
57 final private static String TP_SEQ_SYMBOL = "SEQ_SYMBOL";
58 final private static String TP_SEQ_ACCESSION = "SEQ_ACCESSION";
59 final private static String TP_SEQ_ACCESSION_SOURCE = "SEQ_ACCESSION_SOURCE";
60 final private static String TP_SEQ_ANNOTATION_DESC = "SEQ_ANNOTATION_DESC";
61 final private static String TP_SEQ_ANNOTATION_REF = "SEQ_ANNOTATION_REF";
62 final private static String TP_SEQ_MOL_SEQ = "SEQ_MOL_SEQ";
63 final private static String TP_SEQ_NAME = "SEQ_NAME";
64 final private static String TP_NODE_NAME = "NODE_NAME";
// Whole-string node-name pattern with three capture groups separated as <1>_<2><3>:
//   group 1: 1 to 5 characters from the class [a-fA-Z0-9]
//   group 2: 2 to 4 upper-case letters/digits followed by one upper-case letter
//   group 3: 1 to 4 digits
// Used by moveDomainNumbersAtEnd().
// NOTE(review): the class "[a-fA-Z0-9]" accepts lower-case a-f but upper-case A-Z; this
// looks like a typo for "[a-zA-Z0-9]" or "[a-fA-F0-9]" -- confirm the intended alphabet.
65 final private static Pattern NODENAME_SEQNUMBER_TAXDOMAINNUMBER = Pattern
66 .compile( "^([a-fA-Z0-9]{1,5})_([A-Z0-9]{2,4}[A-Z])(\\d{1,4})$" );
// When true, new node names are passed through sanitize() before being set.
67 public final static boolean SANITIZE = false;
// When true, the decorate() methods and extractIntermediate() print their mappings to System.out.
68 public final static boolean VERBOSE = true;
// Private constructor: prevents instantiation from outside this class (all members
// visible in this file are static).
70 private PhylogenyDecorator() {
// Decorates the nodes of a phylogeny from a table of per-node key/value pairs.
//
// Nodes are visited in postorder. Each non-empty node name is looked up in map; if the
// exact name is absent and numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0,
// prefixes of the name obtained by dropping x trailing characters are tried as well.
// For a matched node, every recognized TP_* key present in its value map is written to
// the node's taxonomy, sequence, or name (in place).
//
// phylogeny - the tree whose nodes are updated
// map       - node name -> ( TP_* key -> value )
// numbers_of_chars_allowed_to_remove_if_not_found_in_map
//           - upper bound for x, the number of trailing characters removed from a name
//             while searching map; values <= 0 disable the prefix search
//
// Throws IllegalArgumentException with the message "<name> not found in name map".
// NOTE(review): confirm under which condition that exception is raised.
74 public static void decorate( final Phylogeny phylogeny,
75 final Map<String, Map<String, String>> map,
77 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map )
78 throws IllegalArgumentException {
79 for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
80 final PhylogenyNode node = iter.next();
81 final String name = node.getName();
82 if ( !ForesterUtil.isEmpty( name ) ) {
83 if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) {
84 Map<String, String> new_values = map.get( name );
// Prefix search: retry with the name shortened by x trailing characters.
// NOTE(review): if x can exceed name.length(), name.substring( 0, name.length() - x )
// throws StringIndexOutOfBoundsException -- confirm x is bounded by the name length.
86 while ( ( new_values == null ) && ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 )
87 && ( x <= numbers_of_chars_allowed_to_remove_if_not_found_in_map ) ) {
88 new_values = map.get( name.substring( 0, name.length() - x ) );
91 if ( new_values != null ) {
92 if ( new_values.containsKey( TP_TAXONOMY_CODE ) ) {
93 ForesterUtil.ensurePresenceOfTaxonomy( node );
94 node.getNodeData().getTaxonomy().setTaxonomyCode( new_values.get( TP_TAXONOMY_CODE ) );
// Taxonomy identifier: with provider when both keys are present, otherwise the bare id.
96 if ( new_values.containsKey( TP_TAXONOMY_ID )
97 && new_values.containsKey( TP_TAXONOMY_ID_PROVIDER ) ) {
98 ForesterUtil.ensurePresenceOfTaxonomy( node );
99 node.getNodeData().getTaxonomy().setIdentifier( new Identifier( new_values
100 .get( TP_TAXONOMY_ID ), new_values.get( TP_TAXONOMY_ID_PROVIDER ) ) );
102 else if ( new_values.containsKey( TP_TAXONOMY_ID ) ) {
103 ForesterUtil.ensurePresenceOfTaxonomy( node );
104 node.getNodeData().getTaxonomy().setIdentifier( new Identifier( new_values
105 .get( TP_TAXONOMY_ID ) ) );
107 if ( new_values.containsKey( TP_TAXONOMY_SN ) ) {
108 ForesterUtil.ensurePresenceOfTaxonomy( node );
109 node.getNodeData().getTaxonomy().setScientificName( new_values.get( TP_TAXONOMY_SN ) );
111 if ( new_values.containsKey( TP_TAXONOMY_CN ) ) {
112 ForesterUtil.ensurePresenceOfTaxonomy( node );
113 node.getNodeData().getTaxonomy().setCommonName( new_values.get( TP_TAXONOMY_CN ) );
// The synonym is appended to the taxonomy's existing synonym collection.
115 if ( new_values.containsKey( TP_TAXONOMY_SYN ) ) {
116 ForesterUtil.ensurePresenceOfTaxonomy( node );
117 node.getNodeData().getTaxonomy().getSynonyms().add( new_values.get( TP_TAXONOMY_SYN ) );
// An accession is only set when its source is supplied as well.
119 if ( new_values.containsKey( TP_SEQ_ACCESSION )
120 && new_values.containsKey( TP_SEQ_ACCESSION_SOURCE ) ) {
121 ForesterUtil.ensurePresenceOfSequence( node );
122 node.getNodeData().getSequence().setAccession( new Accession( new_values
123 .get( TP_SEQ_ACCESSION ), new_values.get( TP_SEQ_ACCESSION_SOURCE ) ) );
// Description and reference each produce their own Annotation object; the description
// annotation is constructed with the placeholder "?".
125 if ( new_values.containsKey( TP_SEQ_ANNOTATION_DESC ) ) {
126 ForesterUtil.ensurePresenceOfSequence( node );
127 final Annotation ann = new Annotation( "?" );
128 ann.setDesc( new_values.get( TP_SEQ_ANNOTATION_DESC ) );
129 node.getNodeData().getSequence().addAnnotation( ann );
131 if ( new_values.containsKey( TP_SEQ_ANNOTATION_REF ) ) {
132 ForesterUtil.ensurePresenceOfSequence( node );
133 final Annotation ann = new Annotation( new_values.get( TP_SEQ_ANNOTATION_REF ) );
134 node.getNodeData().getSequence().addAnnotation( ann );
136 if ( new_values.containsKey( TP_SEQ_SYMBOL ) ) {
137 ForesterUtil.ensurePresenceOfSequence( node );
138 node.getNodeData().getSequence().setSymbol( new_values.get( TP_SEQ_SYMBOL ) );
140 if ( new_values.containsKey( TP_SEQ_NAME ) ) {
141 ForesterUtil.ensurePresenceOfSequence( node );
142 node.getNodeData().getSequence().setName( new_values.get( TP_SEQ_NAME ) );
144 if ( new_values.containsKey( TP_SEQ_MOL_SEQ ) ) {
145 ForesterUtil.ensurePresenceOfSequence( node );
146 node.getNodeData().getSequence().setMolecularSequence( new_values.get( TP_SEQ_MOL_SEQ ) );
// The node is renamed last; all lookups above used the name read at the top of the loop.
148 if ( new_values.containsKey( TP_NODE_NAME ) ) {
149 node.setName( new_values.get( TP_NODE_NAME ) );
154 throw new IllegalArgumentException( "\"" + name + "\" not found in name map" );
168 * maps names (in phylogeny) to new values
171 * @throws IllegalArgumentException
172 * @throws NHXFormatException
// Convenience overload for decorating a single phylogeny from a name -> value map.
// It forwards its arguments to another PhylogenyDecorator.decorate() overload.
// NOTE(review): presumably the target is the variant that takes an intermediate map,
// called without one -- confirm against the full argument list.
174 public static void decorate( final Phylogeny phylogeny,
175 final Map<String, String> map,
177 final boolean extract_bracketed_scientific_name,
179 final boolean cut_name_after_space,
180 final boolean process_name_intelligently,
181 final boolean process_similar_to,
182 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
183 final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException,
185 PhylogenyDecorator.decorate( phylogeny,
188 extract_bracketed_scientific_name,
191 cut_name_after_space,
192 process_name_intelligently,
194 numbers_of_chars_allowed_to_remove_if_not_found_in_map,
195 move_domain_numbers_at_end_to_middle );
204 * maps names (in phylogeny) to new values if intermediate_map is
205 * null otherwise maps intermediate value to new value
208 * @param intermediate_map
209 * maps name (in phylogeny) to an intermediate value
210 * @throws IllegalArgumentException
// Decorates the nodes of a phylogeny with one value per node, written to the data field
// selected by `field`.
//
// Nodes are visited in postorder. For each non-empty node name:
//   1. if intermediate_map is non-null, the name is first translated through it
//      (extractIntermediate throws IllegalArgumentException on a missing/empty entry);
//   2. the (translated) name is looked up in map, optionally retrying with up to
//      numbers_of_chars_allowed_to_remove_if_not_found_in_map trailing characters removed;
//   3. the trimmed value is stored according to `field`: as a sequence annotation
//      description, a domain architecture, a taxonomy code, a taxonomy scientific name,
//      a sequence name, or as the node name;
//   4. when the value becomes the node name, it is optionally post-processed first by
//      exactly one of cut_name_after_space, process_name_intelligently or
//      process_similar_to (checked in that order), and by sanitize() if SANITIZE is set.
//
// extract_bracketed_scientific_name - if true and the value ends with "]", the text in
//      the last bracket pair is additionally stored as taxonomy scientific name; may not
//      be combined with field == FIELD.TAXONOMY_SCIENTIFIC_NAME
// move_domain_numbers_at_end_to_middle - if true and field is not NODE_NAME, the node's
//      current name is rewritten with moveDomainNumbersAtEnd()
//
// Throws IllegalArgumentException for the forbidden flag combination above and with the
// message "<name> not found in name map"; throws RuntimeException for an unknown field.
212 public static void decorate( final Phylogeny phylogeny,
213 final Map<String, String> map,
215 final boolean extract_bracketed_scientific_name,
217 final Map<String, String> intermediate_map,
218 final boolean cut_name_after_space,
219 final boolean process_name_intelligently,
220 final boolean process_similar_to,
221 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
222 final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException {
223 if ( extract_bracketed_scientific_name && ( field == FIELD.TAXONOMY_SCIENTIFIC_NAME ) ) {
224 throw new IllegalArgumentException( "Attempt to extract bracketed scientific name together with data field pointing to scientific name" );
226 for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
227 final PhylogenyNode node = iter.next();
228 String name = node.getName();
229 if ( !ForesterUtil.isEmpty( name ) ) {
230 if ( intermediate_map != null ) {
231 name = PhylogenyDecorator.extractIntermediate( intermediate_map, name );
233 if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) {
234 String new_value = map.get( name );
// Prefix search: retry with the name shortened by x trailing characters.
// NOTE(review): if x can exceed name.length(), the substring call below throws
// StringIndexOutOfBoundsException -- confirm x is bounded by the name length.
236 while ( ( new_value == null ) && ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 )
237 && ( x <= numbers_of_chars_allowed_to_remove_if_not_found_in_map ) ) {
238 new_value = map.get( name.substring( 0, name.length() - x ) );
241 if ( new_value != null ) {
242 new_value = new_value.trim();
// NOTE(review): this statement has no effect. String.replaceAll returns a new string and
// the result is discarded; in addition the pattern "/\\s+/" contains literal '/'
// characters (Ruby-style regex delimiters), so it would only match whitespace enclosed
// in slashes. Probably intended: new_value = new_value.replaceAll( "\\s+", " " );
243 new_value.replaceAll( "/\\s+/", " " );
244 if ( extract_bracketed_scientific_name && new_value.endsWith( "]" ) ) {
245 extractBracketedScientificNames( node, new_value );
248 case SEQUENCE_ANNOTATION_DESC:
249 if ( PhylogenyDecorator.VERBOSE ) {
250 System.out.println( name + ": " + new_value );
252 if ( !node.getNodeData().isHasSequence() ) {
253 node.getNodeData().setSequence( new Sequence() );
// The annotation is constructed with the placeholder "?"; the value becomes its description.
255 final Annotation annotation = new Annotation( "?" );
256 annotation.setDesc( new_value );
257 node.getNodeData().getSequence().addAnnotation( annotation );
259 case DOMAIN_STRUCTURE:
260 if ( PhylogenyDecorator.VERBOSE ) {
261 System.out.println( name + ": " + new_value );
263 if ( !node.getNodeData().isHasSequence() ) {
264 node.getNodeData().setSequence( new Sequence() );
266 node.getNodeData().getSequence()
267 .setDomainArchitecture( new DomainArchitecture( new_value ) );
270 if ( PhylogenyDecorator.VERBOSE ) {
271 System.out.println( name + ": " + new_value );
273 ForesterUtil.ensurePresenceOfTaxonomy( node );
274 node.getNodeData().getTaxonomy().setTaxonomyCode( new_value );
276 case TAXONOMY_SCIENTIFIC_NAME:
277 if ( PhylogenyDecorator.VERBOSE ) {
278 System.out.println( name + ": " + new_value );
280 ForesterUtil.ensurePresenceOfTaxonomy( node );
281 node.getNodeData().getTaxonomy().setScientificName( new_value );
284 if ( PhylogenyDecorator.VERBOSE ) {
285 System.out.println( name + ": " + new_value );
287 if ( !node.getNodeData().isHasSequence() ) {
288 node.getNodeData().setSequence( new Sequence() );
290 node.getNodeData().getSequence().setName( new_value );
// Node-name case: the verbose output traces each transformation as "old -> new".
293 if ( PhylogenyDecorator.VERBOSE ) {
294 System.out.print( name + " -> " );
// The three post-processing options are mutually exclusive; the first enabled one wins.
296 if ( cut_name_after_space ) {
297 if ( PhylogenyDecorator.VERBOSE ) {
298 System.out.print( new_value + " -> " );
300 new_value = PhylogenyDecorator.deleteAtFirstSpace( new_value );
302 else if ( process_name_intelligently ) {
303 if ( PhylogenyDecorator.VERBOSE ) {
304 System.out.print( new_value + " -> " );
306 new_value = PhylogenyDecorator.processNameIntelligently( new_value );
308 else if ( process_similar_to ) {
309 if ( PhylogenyDecorator.VERBOSE ) {
310 System.out.print( new_value + " -> " );
312 new_value = PhylogenyDecorator.processSimilarTo( new_value );
314 if ( PhylogenyDecorator.SANITIZE ) {
315 new_value = PhylogenyDecorator.sanitize( new_value );
317 if ( PhylogenyDecorator.VERBOSE ) {
318 System.out.println( new_value );
320 node.setName( new_value );
323 throw new RuntimeException( "unknown field \"" + field + "\"" );
// Only applied when the node name itself was not replaced by the mapped value.
325 if ( move_domain_numbers_at_end_to_middle && ( field != FIELD.NODE_NAME ) ) {
326 node.setName( moveDomainNumbersAtEnd( node.getName() ) );
331 throw new IllegalArgumentException( "\"" + name + "\" not found in name map" );
// Array overload: calls the single-phylogeny decorate() once per element of phylogenies,
// in index order, passing the same arguments each time.
337 public static void decorate( final Phylogeny[] phylogenies,
338 final Map<String, Map<String, String>> map,
340 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map )
341 throws IllegalArgumentException, NHXFormatException {
342 for( int i = 0; i < phylogenies.length; ++i ) {
343 PhylogenyDecorator.decorate( phylogenies[ i ],
346 numbers_of_chars_allowed_to_remove_if_not_found_in_map );
// Array overload (name -> value map, no intermediate_map parameter): calls the
// single-phylogeny decorate() once per element of phylogenies, in index order.
350 public static void decorate( final Phylogeny[] phylogenies,
351 final Map<String, String> map,
353 final boolean extract_bracketed_scientific_name,
355 final boolean cut_name_after_space,
356 final boolean process_name_intelligently,
357 final boolean process_similar_to,
358 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
359 final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException,
361 for( int i = 0; i < phylogenies.length; ++i ) {
362 PhylogenyDecorator.decorate( phylogenies[ i ],
365 extract_bracketed_scientific_name,
367 cut_name_after_space,
368 process_name_intelligently,
370 numbers_of_chars_allowed_to_remove_if_not_found_in_map,
371 move_domain_numbers_at_end_to_middle );
// Array overload (name -> value map plus intermediate_map): calls the single-phylogeny
// decorate() once per element of phylogenies, in index order.
375 public static void decorate( final Phylogeny[] phylogenies,
376 final Map<String, String> map,
378 final boolean extract_bracketed_scientific_name,
380 final Map<String, String> intermediate_map,
381 final boolean cut_name_after_space,
382 final boolean process_name_intelligently,
383 final boolean process_similar_to,
384 final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
385 final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException,
387 for( int i = 0; i < phylogenies.length; ++i ) {
388 PhylogenyDecorator.decorate( phylogenies[ i ],
391 extract_bracketed_scientific_name,
394 cut_name_after_space,
395 process_name_intelligently,
397 numbers_of_chars_allowed_to_remove_if_not_found_in_map,
398 move_domain_numbers_at_end_to_middle );
// Cuts name at its first space: when the first space is at index 2 or later, the text
// before it (trimmed) is returned.
// NOTE(review): the test is "first_space > 1", so a name whose first space is at index 1
// (a single leading character) is not cut -- confirm this is intended rather than "> 0".
402 private static String deleteAtFirstSpace( final String name ) {
403 final int first_space = name.indexOf( " " );
404 if ( first_space > 1 ) {
405 return name.substring( 0, first_space ).trim();
410 private static void extractBracketedScientificNames( final PhylogenyNode node, final String new_value ) {
411 final int i = new_value.lastIndexOf( "[" );
412 final String scientific_name = new_value.substring( i + 1, new_value.length() - 1 );
413 ForesterUtil.ensurePresenceOfTaxonomy( node );
414 node.getNodeData().getTaxonomy().setScientificName( scientific_name );
// Translates a node name through intermediate_map.
// Throws IllegalArgumentException when the name maps to a null or empty string, and with
// the message "<name> not found in name secondary map" (presumably when the map has no
// entry for the name -- confirm). When VERBOSE is set, prints "name => new_name " to
// System.out.
417 private static String extractIntermediate( final Map<String, String> intermediate_map, final String name ) {
418 String new_name = null;
419 if ( PhylogenyDecorator.VERBOSE ) {
420 System.out.print( name + " => " );
422 if ( intermediate_map.containsKey( name ) ) {
423 new_name = intermediate_map.get( name );
424 if ( ForesterUtil.isEmpty( new_name ) ) {
425 throw new IllegalArgumentException( "\"" + name + "\" maps to null or empty string in secondary map" );
429 throw new IllegalArgumentException( "\"" + name + "\" not found in name secondary map" );
431 if ( PhylogenyDecorator.VERBOSE ) {
432 System.out.println( new_name + " " );
// Rewrites a node name of the form <seq>_<tax><domain>, as captured by
// NODENAME_SEQNUMBER_TAXDOMAINNUMBER, to <seq>_[<domain>]_<tax>.
// NOTE(review): Matcher.group() requires a preceding successful match operation
// (matches()/find()) and throws IllegalStateException otherwise -- confirm the guard and
// what is returned for names that do not match the pattern.
437 private static String moveDomainNumbersAtEnd( final String node_name ) {
438 final Matcher m = NODENAME_SEQNUMBER_TAXDOMAINNUMBER.matcher( node_name );
440 final String seq_number = m.group( 1 );
441 final String tax = m.group( 2 );
442 final String domain_number = m.group( 3 );
443 return seq_number + "_[" + domain_number + "]_" + tax;
// Parses a tab-separated mapping table file into name -> ( key -> value ).
// Each table row yields one entry of the returned map. Non-null cells handled by the
// "key:value" branch are split at their first ':' -- text before it is the key, text
// after it the value -- and collected into the row's map, which is then stored under
// `name`.
// NOTE(review): presumably `name` is taken from the row's first column -- confirm.
// NOTE(review): a cell without ':' makes indexOf return -1, so substring( 0, -1 ) throws
// StringIndexOutOfBoundsException; consider validating the cell and reporting the
// offending row/column instead.
450 public static Map<String, Map<String, String>> parseMappingTable( final File mapping_table_file )
452 final Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
453 BasicTable<String> mapping_table = null;
454 mapping_table = BasicTableParser.parse( mapping_table_file, "\t", false );
455 for( int row = 0; row < mapping_table.getNumberOfRows(); ++row ) {
456 final Map<String, String> row_map = new HashMap<String, String>();
458 for( int col = 0; col < mapping_table.getNumberOfColumns(); ++col ) {
459 final String table_cell = mapping_table.getValue( col, row );
463 else if ( table_cell != null ) {
464 final String key = table_cell.substring( 0, table_cell.indexOf( ':' ) );
465 final String val = table_cell.substring( table_cell.indexOf( ':' ) + 1, table_cell.length() );
466 row_map.put( key, val );
// A later row with the same name replaces the earlier row's map (HashMap.put semantics).
469 map.put( name, row_map );
// Heuristic shortening of a name. The name is split at single spaces and the first two
// tokens are inspected, in this priority order, for
//   '_' together with '|'   (token 0, then token 1),
//   '_' together with '.'   (token 0, then token 1),
//   '_' alone               (token 0, then token 1).
// The "indexOf( ... ) > 0" tests require the character to occur after the first position
// of the token. Names with fewer than two tokens take the first branch.
// NOTE(review): presumably each branch returns the token it matched (and the whole name
// for fewer than two tokens) -- confirm against the branch bodies.
474 private static String processNameIntelligently( final String name ) {
475 final String[] s = name.split( " " );
476 if ( s.length < 2 ) {
479 else if ( ( s[ 0 ].indexOf( "_" ) > 0 ) && ( s[ 0 ].indexOf( "|" ) > 0 ) ) {
482 else if ( ( s[ 1 ].indexOf( "_" ) > 0 ) && ( s[ 1 ].indexOf( "|" ) > 0 ) ) {
485 else if ( ( s[ 0 ].indexOf( "_" ) > 0 ) && ( s[ 0 ].indexOf( "." ) > 0 ) ) {
488 else if ( ( s[ 1 ].indexOf( "_" ) > 0 ) && ( s[ 1 ].indexOf( "." ) > 0 ) ) {
491 else if ( s[ 0 ].indexOf( "_" ) > 0 ) {
494 else if ( s[ 1 ].indexOf( "_" ) > 0 ) {
// Returns processNameIntelligently( name ) followed by a suffix. The suffix is
// " similarity=" plus the trimmed text that follows the (case-insensitively located)
// phrase "similar to" in name; it stays empty if it is never assigned.
// The offset 10 is the length of "similar to".
// NOTE(review): toLowerCase() without a Locale uses the default locale (e.g. under a
// Turkish locale "SIMILAR TO" does not lower-case to "similar to"); consider
// toLowerCase( Locale.ROOT ).
502 private static String processSimilarTo( final String name ) {
503 final int i = name.toLowerCase().indexOf( "similar to" );
504 String similar_to = "";
506 similar_to = " similarity=" + name.substring( i + 10 ).trim();
508 final String pi = processNameIntelligently( name );
509 return pi + similar_to;
// Replaces characters in s one at a time: spaces and commas become '_', round and
// square opening brackets become '{', and the matching closing brackets become '}'.
512 private static String sanitize( String s ) {
513 s = s.replace( ' ', '_' );
514 s = s.replace( '(', '{' );
515 s = s.replace( ')', '}' );
516 s = s.replace( '[', '{' );
517 s = s.replace( ']', '}' );
518 s = s.replace( ',', '_' );
// Selects which piece of node data the single-value decorate() methods write the mapped
// value to: the node name, a sequence annotation description, the sequence's domain
// architecture, the taxonomy code, the taxonomy scientific name, or the sequence name.
522 public static enum FIELD {
523 NODE_NAME, SEQUENCE_ANNOTATION_DESC, DOMAIN_STRUCTURE, TAXONOMY_CODE, TAXONOMY_SCIENTIFIC_NAME, SEQUENCE_NAME;