2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.tools;
29 import java.io.IOException;
30 import java.util.HashMap;
32 import java.util.regex.Matcher;
34 import org.forester.io.parsers.nhx.NHXFormatException;
35 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
36 import org.forester.io.parsers.util.ParserUtils;
37 import org.forester.phylogeny.Phylogeny;
38 import org.forester.phylogeny.PhylogenyNode;
39 import org.forester.phylogeny.data.Accession;
40 import org.forester.phylogeny.data.Annotation;
41 import org.forester.phylogeny.data.DomainArchitecture;
42 import org.forester.phylogeny.data.Identifier;
43 import org.forester.phylogeny.data.Sequence;
44 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
45 import org.forester.sequence.MolecularSequence.TYPE;
46 import org.forester.util.BasicTable;
47 import org.forester.util.BasicTableParser;
48 import org.forester.util.ForesterUtil;
/**
 * Static helpers that write externally supplied data (taxonomy, sequence and
 * name information) onto the nodes of a {@code Phylogeny}, looked up by node name.
 */
50 public final class PhylogenyDecorator {
// Keys recognized in the per-node value maps handled by
// decorate( Phylogeny, Map<String, Map<String, String>>, boolean ); they are also the
// KEY part of the "KEY:value" cells read by parseMappingTable:
//   NODE_NAME                           -> replaces the node name
//   SEQ_ACCESSION + SEQ_ACCESSION_SOURCE -> sequence accession (only when both are present)
//   SEQ_ANNOTATION_DESC                 -> description of a new sequence annotation
//   SEQ_ANNOTATION_REF                  -> constructor argument of a new sequence annotation
//   SEQ_MOL_SEQ / SEQ_NAME / SEQ_SYMBOL -> molecular sequence / name / symbol of the sequence
//   TAXONOMY_CODE / TAXONOMY_SN / TAXONOMY_CN -> taxonomy code / scientific name / common name
//   TAXONOMY_ID [+ TAXONOMY_ID_PROVIDER] -> taxonomy identifier, with or without the second argument
//   TAXONOMY_SYN                        -> added to the taxonomy synonyms
52 final private static String TP_NODE_NAME = "NODE_NAME";
53 final private static String TP_SEQ_ACCESSION = "SEQ_ACCESSION";
54 final private static String TP_SEQ_ACCESSION_SOURCE = "SEQ_ACCESSION_SOURCE";
55 final private static String TP_SEQ_ANNOTATION_DESC = "SEQ_ANNOTATION_DESC";
56 final private static String TP_SEQ_ANNOTATION_REF = "SEQ_ANNOTATION_REF";
57 final private static String TP_SEQ_MOL_SEQ = "SEQ_MOL_SEQ";
58 final private static String TP_SEQ_NAME = "SEQ_NAME";
59 final private static String TP_SEQ_SYMBOL = "SEQ_SYMBOL";
60 final private static String TP_TAXONOMY_CN = "TAXONOMY_CN";
61 // From evoruby/lib/evo/apps/tseq_taxonomy_processor.rb:
62 final private static String TP_TAXONOMY_CODE = "TAXONOMY_CODE";
63 final private static String TP_TAXONOMY_ID = "TAXONOMY_ID";
64 final private static String TP_TAXONOMY_ID_PROVIDER = "TAXONOMY_ID_PROVIDER";
65 final private static String TP_TAXONOMY_SN = "TAXONOMY_SN";
66 final private static String TP_TAXONOMY_SYN = "TAXONOMY_SYN";
// Private constructor: the class is final and is used only through its static methods.
68 private PhylogenyDecorator() {
/**
 * Decorates the named nodes of a phylogeny with taxonomy and sequence data
 * taken from a per-node key/value table.
 * <p>
 * Nodes are visited in post-order. For every node whose non-empty name is a
 * key of {@code map} (with a non-null value map), each recognized
 * {@code TP_*} key found in that value map is copied onto the node; keys
 * that are not checked below are ignored.
 *
 * @param phylogeny
 *            the phylogeny whose nodes are modified in place
 * @param map
 *            maps node name to a (key -> value) map for that node
 * @param picky
 *            NOTE(review): presumably decides whether a node name missing
 *            from {@code map} raises the exception below -- confirm
 * @throws IllegalArgumentException
 *             with the message "... not found in name map"
 * @throws PhyloXmlDataFormatException
 *             declared; presumably raised by the data setters -- confirm
 */
72 public static void decorate( final Phylogeny phylogeny,
73 final Map<String, Map<String, String>> map,
74 final boolean picky ) throws IllegalArgumentException, PhyloXmlDataFormatException {
75 for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
76 final PhylogenyNode node = iter.next();
77 final String name = node.getName();
78 if ( !ForesterUtil.isEmpty( name ) ) {
79 if ( map.containsKey( name ) ) {
80 final Map<String, String> new_values = map.get( name );
81 if ( new_values != null ) {
82 if ( new_values.containsKey( TP_TAXONOMY_CODE ) ) {
83 ForesterUtil.ensurePresenceOfTaxonomy( node );
84 node.getNodeData().getTaxonomy().setTaxonomyCode( new_values.get( TP_TAXONOMY_CODE ) );
// The identifier is built from id and provider when both keys are present,
// otherwise from the id alone.
86 if ( new_values.containsKey( TP_TAXONOMY_ID )
87 && new_values.containsKey( TP_TAXONOMY_ID_PROVIDER ) ) {
88 ForesterUtil.ensurePresenceOfTaxonomy( node );
91 .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ),
92 new_values.get( TP_TAXONOMY_ID_PROVIDER ) ) );
94 else if ( new_values.containsKey( TP_TAXONOMY_ID ) ) {
95 ForesterUtil.ensurePresenceOfTaxonomy( node );
96 node.getNodeData().getTaxonomy()
97 .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ) ) );
99 if ( new_values.containsKey( TP_TAXONOMY_SN ) ) {
100 ForesterUtil.ensurePresenceOfTaxonomy( node );
101 node.getNodeData().getTaxonomy().setScientificName( new_values.get( TP_TAXONOMY_SN ) );
103 if ( new_values.containsKey( TP_TAXONOMY_CN ) ) {
104 ForesterUtil.ensurePresenceOfTaxonomy( node );
105 node.getNodeData().getTaxonomy().setCommonName( new_values.get( TP_TAXONOMY_CN ) );
// The synonym is added to the taxonomy's existing synonym collection.
107 if ( new_values.containsKey( TP_TAXONOMY_SYN ) ) {
108 ForesterUtil.ensurePresenceOfTaxonomy( node );
109 node.getNodeData().getTaxonomy().getSynonyms().add( new_values.get( TP_TAXONOMY_SYN ) );
// The accession is set only when both the accession and its source key are present.
111 if ( new_values.containsKey( TP_SEQ_ACCESSION )
112 && new_values.containsKey( TP_SEQ_ACCESSION_SOURCE ) ) {
113 ForesterUtil.ensurePresenceOfSequence( node );
116 .setAccession( new Accession( new_values.get( TP_SEQ_ACCESSION ),
117 new_values.get( TP_SEQ_ACCESSION_SOURCE ) ) );
// Description and reference each add their own Annotation object to the sequence.
119 if ( new_values.containsKey( TP_SEQ_ANNOTATION_DESC ) ) {
120 ForesterUtil.ensurePresenceOfSequence( node );
121 final Annotation ann = new Annotation();
122 ann.setDesc( new_values.get( TP_SEQ_ANNOTATION_DESC ) );
123 node.getNodeData().getSequence().addAnnotation( ann );
125 if ( new_values.containsKey( TP_SEQ_ANNOTATION_REF ) ) {
126 ForesterUtil.ensurePresenceOfSequence( node );
127 final Annotation ann = new Annotation( new_values.get( TP_SEQ_ANNOTATION_REF ) );
128 node.getNodeData().getSequence().addAnnotation( ann );
130 if ( new_values.containsKey( TP_SEQ_SYMBOL ) ) {
131 ForesterUtil.ensurePresenceOfSequence( node );
132 node.getNodeData().getSequence().setSymbol( new_values.get( TP_SEQ_SYMBOL ) );
134 if ( new_values.containsKey( TP_SEQ_NAME ) ) {
135 ForesterUtil.ensurePresenceOfSequence( node );
136 node.getNodeData().getSequence().setName( new_values.get( TP_SEQ_NAME ) );
138 if ( new_values.containsKey( TP_SEQ_MOL_SEQ ) ) {
139 ForesterUtil.ensurePresenceOfSequence( node );
140 node.getNodeData().getSequence().setMolecularSequence( new_values.get( TP_SEQ_MOL_SEQ ) );
// The rename comes last; every lookup above used the name read at the top of the loop.
142 if ( new_values.containsKey( TP_NODE_NAME ) ) {
143 node.setName( new_values.get( TP_NODE_NAME ) );
145 } // if ( new_values != null )
146 } // if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) )
148 throw new IllegalArgumentException( "\"" + name + "\" not found in name map" );
/**
 * Convenience overload that forwards the caller's options to the more
 * general {@code PhylogenyDecorator.decorate(...)} overload and returns
 * that call's result.
 * <p>
 * NOTE(review): {@code NHXFormatException} is declared here although the
 * overload taking {@code intermediate_map} declares only
 * {@code IllegalArgumentException} and {@code PhyloXmlDataFormatException}
 * -- confirm whether the extra declaration is still needed.
 */
154 public static String decorate( final Phylogeny phylogeny,
155 final Map<String, String> map,
157 final boolean extract_bracketed_scientific_name,
158 final boolean extract_bracketed_tax_code,
160 final boolean cut_name_after_space,
161 final boolean trim_after_tilde,
162 final boolean verbose ) throws IllegalArgumentException, NHXFormatException,
163 PhyloXmlDataFormatException {
// Pure delegation: no work is done in this overload itself.
164 return PhylogenyDecorator.decorate( phylogeny,
167 extract_bracketed_scientific_name,
168 extract_bracketed_tax_code,
171 cut_name_after_space,
182 * maps names (in phylogeny) to new values if intermediate_map is
183 * null; otherwise maps intermediate values to new values
186 * @param intermediate_map
187 * maps name (in phylogeny) to an intermediate value
188 * @throws IllegalArgumentException
189 * @throws PhyloXmlDataFormatException
// Writes one kind of data (selected by 'field') onto the nodes of 'phylogeny',
// visiting them in post-order. The value for a node is looked up in 'map' by the
// node name, or by the intermediate value when 'intermediate_map' is non-null.
// Returns a summary string of the form
// "updated x/y external nodes, updated x/y internal nodes".
// Throws IllegalArgumentException for contradictory options, an empty map,
// unusable node names, names that map to an empty value, and names missing from the map.
191 public static String decorate( final Phylogeny phylogeny,
192 final Map<String, String> map,
194 final boolean extract_bracketed_scientific_name,
195 final boolean extract_bracketed_tax_code,
197 final Map<String, String> intermediate_map,
198 final boolean cut_name_after_space,
199 final boolean trim_after_tilde,
200 final boolean verbose ) throws IllegalArgumentException, PhyloXmlDataFormatException {
// Extracting a bracketed scientific name and also writing the scientific-name
// field would set the same data item twice, so that combination is rejected.
201 if ( extract_bracketed_scientific_name && ( field == FIELD.TAXONOMY_SCIENTIFIC_NAME ) ) {
202 throw new IllegalArgumentException( "attempt to extract bracketed scientific name together with data field pointing to scientific name" );
204 if ( map.isEmpty() ) {
205 throw new IllegalArgumentException( "map is empty" );
208 int ext_nodes_updated = 0;
210 int int_nodes_updated = 0;
211 for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
212 final PhylogenyNode node = iter.next();
213 if ( node.isExternal() ) {
219 String name = node.getName();
220 if ( picky && node.isExternal() && ForesterUtil.isEmpty( name ) ) {
221 throw new IllegalArgumentException( "external node with no name present" );
223 String tilde_annotation = null;
224 final String orig_name = name;
// Only a '~' at index 1 or later is trimmed; tilde_annotation keeps the '~' itself.
// NOTE(review): name is dereferenced here even when picky is false -- confirm that
// getName() never returns null.
225 if ( trim_after_tilde && ( name.indexOf( '~' ) > 0 ) ) {
226 final int ti = name.indexOf( '~' );
227 tilde_annotation = name.substring( ti );
228 name = name.substring( 0, ti );
229 if ( node.isExternal() && ForesterUtil.isEmpty( name ) ) {
230 throw new IllegalArgumentException( "external node with illegal name: " + orig_name );
233 if ( !ForesterUtil.isEmpty( name ) ) {
// With an intermediate map, the node name is first translated to the key used in 'map'.
234 if ( intermediate_map != null ) {
235 name = PhylogenyDecorator.extractIntermediate( intermediate_map, name, verbose );
237 if ( ( field == FIELD.MOL_SEQ ) && !map.containsKey( name ) ) {
240 if ( map.containsKey( name ) ) {
// NOTE(review): String.replaceAll takes a Java regex, in which '/' is an ordinary
// character, so "/\\s+/" only matches whitespace enclosed in slashes. If the intent
// is to collapse runs of whitespace, the pattern should be "\\s+" -- confirm.
// A null value stored under 'name' would fail at trim().
241 String new_value = map.get( name ).trim().replaceAll( "/\\s+/", " " );
242 if ( !ForesterUtil.isEmpty( new_value ) ) {
243 if ( node.isExternal() ) {
// Bracket handling: a trailing "[...]" scientific name takes precedence; the
// taxonomy-code extraction is only considered in the else branch.
249 if ( extract_bracketed_scientific_name && new_value.endsWith( "]" ) ) {
250 new_value = extractBracketedScientificNames( node, new_value );
252 else if ( extract_bracketed_tax_code ) {
253 if ( ParserUtils.TAXOMONY_CODE_PATTERN_BRACKETED.matcher( new_value ).find() ) {
254 new_value = extractBracketedTaxCodes( node, new_value );
257 throw new IllegalArgumentException( " could not get taxonomy from \"" + new_value
264 System.out.println( name + ": " + new_value );
266 if ( !node.getNodeData().isHasSequence() ) {
267 node.getNodeData().setSequence( new Sequence() );
269 node.getNodeData().getSequence().setMolecularSequence( new_value );
// A sequence type is recorded only when one can be guessed from the sequence text.
270 final TYPE type = ForesterUtil.guessMolecularSequenceType( new_value );
271 if ( type != null ) {
272 if ( type == TYPE.AA ) {
273 node.getNodeData().getSequence().setType( "protein" );
275 else if ( type == TYPE.DNA ) {
276 node.getNodeData().getSequence().setType( "dna" );
278 else if ( type == TYPE.RNA ) {
279 node.getNodeData().getSequence().setType( "rna" );
283 case SEQUENCE_ANNOTATION_DESC:
285 System.out.println( name + ": " + new_value );
287 if ( !node.getNodeData().isHasSequence() ) {
288 node.getNodeData().setSequence( new Sequence() );
// The value becomes the description of a new annotation added to the sequence.
290 final Annotation annotation = new Annotation();
291 annotation.setDesc( new_value );
292 node.getNodeData().getSequence().addAnnotation( annotation );
294 case DOMAIN_STRUCTURE:
296 System.out.println( name + ": " + new_value );
298 if ( !node.getNodeData().isHasSequence() ) {
299 node.getNodeData().setSequence( new Sequence() );
301 node.getNodeData().getSequence()
302 .setDomainArchitecture( new DomainArchitecture( new_value ) );
306 System.out.println( name + ": " + new_value );
308 ForesterUtil.ensurePresenceOfTaxonomy( node );
309 node.getNodeData().getTaxonomy().setTaxonomyCode( new_value );
311 case TAXONOMY_SCIENTIFIC_NAME:
313 System.out.println( name + ": " + new_value );
315 ForesterUtil.ensurePresenceOfTaxonomy( node );
316 node.getNodeData().getTaxonomy().setScientificName( new_value );
// The text cut off at '~' is appended again before the value is stored as sequence name.
319 if ( trim_after_tilde ) {
320 new_value = addTildeAnnotation( tilde_annotation, new_value );
323 System.out.println( name + ": " + new_value );
325 if ( !node.getNodeData().isHasSequence() ) {
326 node.getNodeData().setSequence( new Sequence() );
328 node.getNodeData().getSequence().setName( new_value );
332 System.out.print( name + " -> " );
// Renaming the node: optionally cut the new name at its first space, then
// re-append the tilde annotation.
334 if ( cut_name_after_space ) {
336 System.out.print( new_value + " -> " );
338 new_value = PhylogenyDecorator.deleteAtFirstSpace( new_value );
340 if ( trim_after_tilde ) {
341 new_value = addTildeAnnotation( tilde_annotation, new_value );
344 System.out.println( new_value );
346 node.setName( new_value );
349 throw new RuntimeException( "unknown field \"" + field + "\"" );
353 throw new IllegalArgumentException( "node name \"" + name + "\" maps to empty value" );
357 throw new IllegalArgumentException( "node name \"" + name + "\" not found in map" );
// Summary of how many external and internal nodes were updated out of those seen.
361 return "updated " + ext_nodes_updated + "/" + ext_nodes + " external nodes, updated " + int_nodes_updated + "/"
362 + int_nodes + " internal nodes";
365 public static Map<String, Map<String, String>> parseMappingTable( final File mapping_table_file )
367 final Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
368 BasicTable<String> mapping_table = null;
369 mapping_table = BasicTableParser.parse( mapping_table_file, '\t', false, false );
370 for( int row = 0; row < mapping_table.getNumberOfRows(); ++row ) {
371 final Map<String, String> row_map = new HashMap<String, String>();
373 for( int col = 0; col < mapping_table.getNumberOfColumns(); ++col ) {
374 final String table_cell = mapping_table.getValue( col, row );
378 else if ( table_cell != null ) {
379 final String key = table_cell.substring( 0, table_cell.indexOf( ':' ) );
380 final String val = table_cell.substring( table_cell.indexOf( ':' ) + 1, table_cell.length() );
381 row_map.put( key, val );
384 map.put( name, row_map );
389 private final static String addTildeAnnotation( final String tilde_annotation, final String new_value ) {
390 if ( ForesterUtil.isEmpty( tilde_annotation ) ) {
393 return new_value + tilde_annotation;
396 private static String deleteAtFirstSpace( final String name ) {
397 final int first_space = name.indexOf( " " );
398 if ( first_space > 1 ) {
399 return name.substring( 0, first_space ).trim();
404 private static String extractBracketedScientificNames( final PhylogenyNode node, final String new_value ) {
405 final int i = new_value.lastIndexOf( "[" );
406 final String scientific_name = new_value.substring( i + 1, new_value.length() - 1 );
407 ForesterUtil.ensurePresenceOfTaxonomy( node );
408 node.getNodeData().getTaxonomy().setScientificName( scientific_name );
409 return new_value.substring( 0, i - 1 ).trim();
412 private static String extractBracketedTaxCodes( final PhylogenyNode node, final String new_value ) {
413 final StringBuilder sb = new StringBuilder();
414 sb.append( new_value );
415 final String tc = extractBracketedTaxCodes( sb );
416 if ( !ForesterUtil.isEmpty( tc ) ) {
417 ForesterUtil.ensurePresenceOfTaxonomy( node );
419 node.getNodeData().getTaxonomy().setTaxonomyCode( tc );
421 catch ( final PhyloXmlDataFormatException e ) {
422 throw new IllegalArgumentException( "illegal format for taxonomy code: " + tc );
424 return sb.toString().trim();
/**
 * Matches {@code ParserUtils.TAXOMONY_CODE_PATTERN_BRACKETED} against
 * {@code sb} and removes the matched code from the builder in place.
 * <p>
 * Capture group 1 is taken as the taxonomy code. The deleted range is group
 * 1 widened by one character on each side (presumably the enclosing
 * brackets -- confirm against the pattern).
 * <p>
 * NOTE(review): the caller assigns the result to its taxonomy-code variable
 * and tests it with {@code ForesterUtil.isEmpty}, so the no-match result is
 * presumably null or empty -- confirm.
 *
 * @param sb
 *            the text to search; modified when a code is found
 */
429 private static String extractBracketedTaxCodes( final StringBuilder sb ) {
430 final Matcher m = ParserUtils.TAXOMONY_CODE_PATTERN_BRACKETED.matcher( sb );
432 final String tc = m.group( 1 );
// start( 1 ) - 1 and end( 1 ) + 1 extend the deletion by one character before and after the group.
433 sb.delete( m.start( 1 ) - 1, m.end( 1 ) + 1 );
439 private static String extractIntermediate( final Map<String, String> intermediate_map,
441 final boolean verbose ) {
442 String new_name = null;
444 System.out.print( name + " => " );
446 if ( intermediate_map.containsKey( name ) ) {
447 new_name = intermediate_map.get( name );
448 if ( ForesterUtil.isEmpty( new_name ) ) {
449 throw new IllegalArgumentException( "\"" + name + "\" maps to null or empty string in secondary map" );
453 throw new IllegalArgumentException( "\"" + name + "\" not found in name secondary map" );
456 System.out.println( new_name + " " );
/**
 * Selects which data item of a node the single-field
 * {@code decorate(...)} overloads write.
 */
461 public static enum FIELD {
// The value is added to the node's sequence as the description of a new annotation.
465 SEQUENCE_ANNOTATION_DESC,
// The value becomes the scientific name of the node's taxonomy.
468 TAXONOMY_SCIENTIFIC_NAME;