in progress
[jalview.git] / forester / java / src / org / forester / tools / PhylogenyDecorator.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
25
26 package org.forester.tools;
27
28 import java.io.File;
29 import java.io.IOException;
30 import java.util.HashMap;
31 import java.util.Map;
32 import java.util.regex.Matcher;
33
34 import org.forester.io.parsers.nhx.NHXFormatException;
35 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
36 import org.forester.io.parsers.util.ParserUtils;
37 import org.forester.phylogeny.Phylogeny;
38 import org.forester.phylogeny.PhylogenyNode;
39 import org.forester.phylogeny.data.Accession;
40 import org.forester.phylogeny.data.Annotation;
41 import org.forester.phylogeny.data.DomainArchitecture;
42 import org.forester.phylogeny.data.Identifier;
43 import org.forester.phylogeny.data.Sequence;
44 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
45 import org.forester.util.BasicTable;
46 import org.forester.util.BasicTableParser;
47 import org.forester.util.ForesterUtil;
48
49 public final class PhylogenyDecorator {
50
    // Keys looked up in the per-node value maps handed to
    // decorate( Phylogeny, Map<String, Map<String, String>>, boolean ).
    // parseMappingTable( File ) builds such maps from table cells, using the
    // text before the first ':' of a cell as the key.
    final private static String TP_NODE_NAME            = "NODE_NAME";
    final private static String TP_SEQ_ACCESSION        = "SEQ_ACCESSION";
    final private static String TP_SEQ_ACCESSION_SOURCE = "SEQ_ACCESSION_SOURCE";
    final private static String TP_SEQ_ANNOTATION_DESC  = "SEQ_ANNOTATION_DESC";
    final private static String TP_SEQ_ANNOTATION_REF   = "SEQ_ANNOTATION_REF";
    final private static String TP_SEQ_MOL_SEQ          = "SEQ_MOL_SEQ";
    final private static String TP_SEQ_NAME             = "SEQ_NAME";
    final private static String TP_SEQ_SYMBOL           = "SEQ_SYMBOL";
    final private static String TP_TAXONOMY_CN          = "TAXONOMY_CN";
    // From evoruby/lib/evo/apps/tseq_taxonomy_processor.rb:
    final private static String TP_TAXONOMY_CODE        = "TAXONOMY_CODE";
    final private static String TP_TAXONOMY_ID          = "TAXONOMY_ID";
    final private static String TP_TAXONOMY_ID_PROVIDER = "TAXONOMY_ID_PROVIDER";
    final private static String TP_TAXONOMY_SN          = "TAXONOMY_SN";
    final private static String TP_TAXONOMY_SYN         = "TAXONOMY_SYN";
66
    /**
     * Private constructor: all functionality of this class is exposed through
     * static methods, so instances are never created.
     */
    private PhylogenyDecorator() {
        // Not needed.
    }
70
71     public static void decorate( final Phylogeny phylogeny,
72                                  final Map<String, Map<String, String>> map,
73                                  final boolean picky ) throws IllegalArgumentException, PhyloXmlDataFormatException {
74         for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
75             final PhylogenyNode node = iter.next();
76             final String name = node.getName();
77             if ( !ForesterUtil.isEmpty( name ) ) {
78                 if ( map.containsKey( name ) ) {
79                     final Map<String, String> new_values = map.get( name );
80                     if ( new_values != null ) {
81                         if ( new_values.containsKey( TP_TAXONOMY_CODE ) ) {
82                             ForesterUtil.ensurePresenceOfTaxonomy( node );
83                             node.getNodeData().getTaxonomy().setTaxonomyCode( new_values.get( TP_TAXONOMY_CODE ) );
84                         }
85                         if ( new_values.containsKey( TP_TAXONOMY_ID )
86                                 && new_values.containsKey( TP_TAXONOMY_ID_PROVIDER ) ) {
87                             ForesterUtil.ensurePresenceOfTaxonomy( node );
88                             node.getNodeData()
89                                     .getTaxonomy()
90                                     .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ),
91                                                                     new_values.get( TP_TAXONOMY_ID_PROVIDER ) ) );
92                         }
93                         else if ( new_values.containsKey( TP_TAXONOMY_ID ) ) {
94                             ForesterUtil.ensurePresenceOfTaxonomy( node );
95                             node.getNodeData().getTaxonomy()
96                                     .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ) ) );
97                         }
98                         if ( new_values.containsKey( TP_TAXONOMY_SN ) ) {
99                             ForesterUtil.ensurePresenceOfTaxonomy( node );
100                             node.getNodeData().getTaxonomy().setScientificName( new_values.get( TP_TAXONOMY_SN ) );
101                         }
102                         if ( new_values.containsKey( TP_TAXONOMY_CN ) ) {
103                             ForesterUtil.ensurePresenceOfTaxonomy( node );
104                             node.getNodeData().getTaxonomy().setCommonName( new_values.get( TP_TAXONOMY_CN ) );
105                         }
106                         if ( new_values.containsKey( TP_TAXONOMY_SYN ) ) {
107                             ForesterUtil.ensurePresenceOfTaxonomy( node );
108                             node.getNodeData().getTaxonomy().getSynonyms().add( new_values.get( TP_TAXONOMY_SYN ) );
109                         }
110                         if ( new_values.containsKey( TP_SEQ_ACCESSION )
111                                 && new_values.containsKey( TP_SEQ_ACCESSION_SOURCE ) ) {
112                             ForesterUtil.ensurePresenceOfSequence( node );
113                             node.getNodeData()
114                                     .getSequence()
115                                     .setAccession( new Accession( new_values.get( TP_SEQ_ACCESSION ),
116                                                                   new_values.get( TP_SEQ_ACCESSION_SOURCE ) ) );
117                         }
118                         if ( new_values.containsKey( TP_SEQ_ANNOTATION_DESC ) ) {
119                             ForesterUtil.ensurePresenceOfSequence( node );
120                             final Annotation ann = new Annotation();
121                             ann.setDesc( new_values.get( TP_SEQ_ANNOTATION_DESC ) );
122                             node.getNodeData().getSequence().addAnnotation( ann );
123                         }
124                         if ( new_values.containsKey( TP_SEQ_ANNOTATION_REF ) ) {
125                             ForesterUtil.ensurePresenceOfSequence( node );
126                             final Annotation ann = new Annotation( new_values.get( TP_SEQ_ANNOTATION_REF ) );
127                             node.getNodeData().getSequence().addAnnotation( ann );
128                         }
129                         if ( new_values.containsKey( TP_SEQ_SYMBOL ) ) {
130                             ForesterUtil.ensurePresenceOfSequence( node );
131                             node.getNodeData().getSequence().setSymbol( new_values.get( TP_SEQ_SYMBOL ) );
132                         }
133                         if ( new_values.containsKey( TP_SEQ_NAME ) ) {
134                             ForesterUtil.ensurePresenceOfSequence( node );
135                             node.getNodeData().getSequence().setName( new_values.get( TP_SEQ_NAME ) );
136                         }
137                         if ( new_values.containsKey( TP_SEQ_MOL_SEQ ) ) {
138                             ForesterUtil.ensurePresenceOfSequence( node );
139                             node.getNodeData().getSequence().setMolecularSequence( new_values.get( TP_SEQ_MOL_SEQ ) );
140                         }
141                         if ( new_values.containsKey( TP_NODE_NAME ) ) {
142                             node.setName( new_values.get( TP_NODE_NAME ) );
143                         }
144                     } // if ( new_values != null ) 
145                 } // if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) )
146                 else if ( picky ) {
147                     throw new IllegalArgumentException( "\"" + name + "\" not found in name map" );
148                 }
149             }
150         }
151     }
152
153     public static String decorate( final Phylogeny phylogeny,
154                                    final Map<String, String> map,
155                                    final FIELD field,
156                                    final boolean extract_bracketed_scientific_name,
157                                    final boolean extract_bracketed_tax_code,
158                                    final boolean picky,
159                                    final boolean cut_name_after_space,
160                                    final boolean trim_after_tilde,
161                                    final boolean verbose ) throws IllegalArgumentException, NHXFormatException,
162             PhyloXmlDataFormatException {
163         return PhylogenyDecorator.decorate( phylogeny,
164                                             map,
165                                             field,
166                                             extract_bracketed_scientific_name,
167                                             extract_bracketed_tax_code,
168                                             picky,
169                                             null,
170                                             cut_name_after_space,
171                                             trim_after_tilde,
172                                             verbose );
173     }
174
175     /**
176      * 
177      * 
178      * 
179      * @param phylogeny
180      * @param map
181      *            maps names (in phylogeny) to new values if intermediate_map is
182      *            null otherwise maps intermediate value to new value
183      * @param field
184      * @param picky
185      * @param intermediate_map
186      *            maps name (in phylogeny) to a intermediate value
187      * @throws IllegalArgumentException
188      * @throws PhyloXmlDataFormatException 
189      */
190     public static String decorate( final Phylogeny phylogeny,
191                                    final Map<String, String> map,
192                                    final FIELD field,
193                                    final boolean extract_bracketed_scientific_name,
194                                    final boolean extract_bracketed_tax_code,
195                                    final boolean picky,
196                                    final Map<String, String> intermediate_map,
197                                    final boolean cut_name_after_space,
198                                    final boolean trim_after_tilde,
199                                    final boolean verbose ) throws IllegalArgumentException, PhyloXmlDataFormatException {
200         if ( extract_bracketed_scientific_name && ( field == FIELD.TAXONOMY_SCIENTIFIC_NAME ) ) {
201             throw new IllegalArgumentException( "attempt to extract bracketed scientific name together with data field pointing to scientific name" );
202         }
203         if ( map.isEmpty() ) {
204             throw new IllegalArgumentException( "map is empty" );
205         }
206         if ( picky && ( map.size() < phylogeny.getNumberOfExternalNodes() ) ) {
207             throw new IllegalArgumentException( "map contains less entries than the tree has external nodes" );
208         }
209         int ext_nodes = 0;
210         int ext_nodes_updated = 0;
211         int int_nodes = 0;
212         int int_nodes_updated = 0;
213         for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
214             final PhylogenyNode node = iter.next();
215             if ( node.isExternal() ) {
216                 ++ext_nodes;
217             }
218             else {
219                 ++int_nodes;
220             }
221             String name = node.getName();
222             if ( picky && node.isExternal() && ForesterUtil.isEmpty( name ) ) {
223                 throw new IllegalArgumentException( "external node with no name present" );
224             }
225             String tilde_annotation = null;
226             if ( trim_after_tilde && ( name.indexOf( '~' ) > 0 ) ) {
227                 final int ti = name.indexOf( '~' );
228                 final String orig = name;
229                 tilde_annotation = name.substring( ti );
230                 name = name.substring( 0, ti );
231                 if ( node.isExternal() && ForesterUtil.isEmpty( name ) ) {
232                     throw new IllegalArgumentException( "external node with illegal name: " + orig );
233                 }
234             }
235             if ( !ForesterUtil.isEmpty( name ) ) {
236                 if ( intermediate_map != null ) {
237                     name = PhylogenyDecorator.extractIntermediate( intermediate_map, name, verbose );
238                 }
239                 if ( map.containsKey( name ) ) {
240                     String new_value = map.get( name ).trim().replaceAll( "/\\s+/", " " );
241                     if ( !ForesterUtil.isEmpty( new_value ) ) {
242                         if ( node.isExternal() ) {
243                             ++ext_nodes_updated;
244                         }
245                         else {
246                             ++int_nodes_updated;
247                         }
248                         if ( extract_bracketed_scientific_name && new_value.endsWith( "]" ) ) {
249                             new_value = extractBracketedScientificNames( node, new_value );
250                         }
251                         else if ( extract_bracketed_tax_code ) {
252                             if ( ParserUtils.TAXOMONY_CODE_PATTERN_BRACKETED.matcher( new_value ).find() ) {
253                                 new_value = extractBracketedTaxCodes( node, new_value );
254                             }
255                             else if ( picky ) {
256                                 throw new IllegalArgumentException( " could not get taxonomy from \"" + new_value
257                                         + "\"" );
258                             }
259                         }
260                         switch ( field ) {
261                             case MOL_SEQ:
262                                 if ( verbose ) {
263                                     System.out.println( name + ": " + new_value );
264                                 }
265                                 if ( !node.getNodeData().isHasSequence() ) {
266                                     node.getNodeData().setSequence( new Sequence() );
267                                 }
268                                 node.getNodeData().getSequence().setMolecularSequence( new_value );
269                                 break;
270                             case SEQUENCE_ANNOTATION_DESC:
271                                 if ( verbose ) {
272                                     System.out.println( name + ": " + new_value );
273                                 }
274                                 if ( !node.getNodeData().isHasSequence() ) {
275                                     node.getNodeData().setSequence( new Sequence() );
276                                 }
277                                 final Annotation annotation = new Annotation();
278                                 annotation.setDesc( new_value );
279                                 node.getNodeData().getSequence().addAnnotation( annotation );
280                                 break;
281                             case DOMAIN_STRUCTURE:
282                                 if ( verbose ) {
283                                     System.out.println( name + ": " + new_value );
284                                 }
285                                 if ( !node.getNodeData().isHasSequence() ) {
286                                     node.getNodeData().setSequence( new Sequence() );
287                                 }
288                                 node.getNodeData().getSequence()
289                                         .setDomainArchitecture( new DomainArchitecture( new_value ) );
290                                 break;
291                             case TAXONOMY_CODE:
292                                 if ( verbose ) {
293                                     System.out.println( name + ": " + new_value );
294                                 }
295                                 ForesterUtil.ensurePresenceOfTaxonomy( node );
296                                 node.getNodeData().getTaxonomy().setTaxonomyCode( new_value );
297                                 break;
298                             case TAXONOMY_SCIENTIFIC_NAME:
299                                 if ( verbose ) {
300                                     System.out.println( name + ": " + new_value );
301                                 }
302                                 ForesterUtil.ensurePresenceOfTaxonomy( node );
303                                 node.getNodeData().getTaxonomy().setScientificName( new_value );
304                                 break;
305                             case SEQUENCE_NAME:
306                                 if ( trim_after_tilde ) {
307                                     new_value = addTildeAnnotation( tilde_annotation, new_value );
308                                 }
309                                 if ( verbose ) {
310                                     System.out.println( name + ": " + new_value );
311                                 }
312                                 if ( !node.getNodeData().isHasSequence() ) {
313                                     node.getNodeData().setSequence( new Sequence() );
314                                 }
315                                 node.getNodeData().getSequence().setName( new_value );
316                                 break;
317                             case NODE_NAME:
318                                 if ( verbose ) {
319                                     System.out.print( name + " -> " );
320                                 }
321                                 if ( cut_name_after_space ) {
322                                     if ( verbose ) {
323                                         System.out.print( new_value + " -> " );
324                                     }
325                                     new_value = PhylogenyDecorator.deleteAtFirstSpace( new_value );
326                                 }
327                                 if ( trim_after_tilde ) {
328                                     new_value = addTildeAnnotation( tilde_annotation, new_value );
329                                 }
330                                 if ( verbose ) {
331                                     System.out.println( new_value );
332                                 }
333                                 node.setName( new_value );
334                                 break;
335                             default:
336                                 throw new RuntimeException( "unknown field \"" + field + "\"" );
337                         }
338                     }
339                     else {
340                         throw new IllegalArgumentException( "node name \"" + name + "\" maps to empty value" );
341                     }
342                 }
343                 else if ( picky ) {
344                     throw new IllegalArgumentException( "node name \"" + name + "\" not found in map" );
345                 }
346             }
347         }
348         return "updated " + ext_nodes_updated + "/" + ext_nodes + " external nodes, updated " + int_nodes_updated + "/"
349                 + int_nodes + " internal nodes";
350     }
351
352     public static Map<String, Map<String, String>> parseMappingTable( final File mapping_table_file )
353             throws IOException {
354         final Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
355         BasicTable<String> mapping_table = null;
356         mapping_table = BasicTableParser.parse( mapping_table_file, '\t', false, false );
357         for( int row = 0; row < mapping_table.getNumberOfRows(); ++row ) {
358             final Map<String, String> row_map = new HashMap<String, String>();
359             String name = null;
360             for( int col = 0; col < mapping_table.getNumberOfColumns(); ++col ) {
361                 final String table_cell = mapping_table.getValue( col, row );
362                 if ( col == 0 ) {
363                     name = table_cell;
364                 }
365                 else if ( table_cell != null ) {
366                     final String key = table_cell.substring( 0, table_cell.indexOf( ':' ) );
367                     final String val = table_cell.substring( table_cell.indexOf( ':' ) + 1, table_cell.length() );
368                     row_map.put( key, val );
369                 }
370             }
371             map.put( name, row_map );
372         }
373         return map;
374     }
375
376     private final static String addTildeAnnotation( final String tilde_annotation, final String new_value ) {
377         if ( ForesterUtil.isEmpty( tilde_annotation ) ) {
378             return new_value;
379         }
380         return new_value + tilde_annotation;
381     }
382
383     private static String deleteAtFirstSpace( final String name ) {
384         final int first_space = name.indexOf( " " );
385         if ( first_space > 1 ) {
386             return name.substring( 0, first_space ).trim();
387         }
388         return name;
389     }
390
391     private static String extractBracketedScientificNames( final PhylogenyNode node, final String new_value ) {
392         final int i = new_value.lastIndexOf( "[" );
393         final String scientific_name = new_value.substring( i + 1, new_value.length() - 1 );
394         ForesterUtil.ensurePresenceOfTaxonomy( node );
395         node.getNodeData().getTaxonomy().setScientificName( scientific_name );
396         return new_value.substring( 0, i - 1 ).trim();
397     }
398
399     private static String extractBracketedTaxCodes( final PhylogenyNode node, final String new_value ) {
400         final StringBuilder sb = new StringBuilder();
401         sb.append( new_value );
402         final String tc = extractBracketedTaxCodes( sb );
403         if ( !ForesterUtil.isEmpty( tc ) ) {
404             ForesterUtil.ensurePresenceOfTaxonomy( node );
405             try {
406                 node.getNodeData().getTaxonomy().setTaxonomyCode( tc );
407             }
408             catch ( final PhyloXmlDataFormatException e ) {
409                 throw new IllegalArgumentException( "illegal format for taxonomy code: " + tc );
410             }
411             return sb.toString().trim();
412         }
413         return new_value;
414     }
415
416     private static String extractBracketedTaxCodes( final StringBuilder sb ) {
417         final Matcher m = ParserUtils.TAXOMONY_CODE_PATTERN_BRACKETED.matcher( sb );
418         if ( m.find() ) {
419             final String tc = m.group( 1 );
420             sb.delete( m.start( 1 ) - 1, m.end( 1 ) + 1 );
421             return tc;
422         }
423         return null;
424     }
425
426     private static String extractIntermediate( final Map<String, String> intermediate_map,
427                                                final String name,
428                                                final boolean verbose ) {
429         String new_name = null;
430         if ( verbose ) {
431             System.out.print( name + " => " );
432         }
433         if ( intermediate_map.containsKey( name ) ) {
434             new_name = intermediate_map.get( name );
435             if ( ForesterUtil.isEmpty( new_name ) ) {
436                 throw new IllegalArgumentException( "\"" + name + "\" maps to null or empty string in secondary map" );
437             }
438         }
439         else {
440             throw new IllegalArgumentException( "\"" + name + "\" not found in name secondary map" );
441         }
442         if ( verbose ) {
443             System.out.println( new_name + "  " );
444         }
445         return new_name;
446     }
447
    /**
     * Selects the node datum which the single-value {@code decorate} methods
     * of this class write new values to.
     */
    public static enum FIELD {
        // The new value is used to create a DomainArchitecture for the node's sequence.
        DOMAIN_STRUCTURE,
        // The new value is set as molecular sequence of the node's sequence.
        MOL_SEQ,
        // The new value replaces the name of the node.
        NODE_NAME,
        // The new value becomes the description of an annotation added to the node's sequence.
        SEQUENCE_ANNOTATION_DESC,
        // The new value is set as name of the node's sequence.
        SEQUENCE_NAME,
        // The new value is set as taxonomy code of the node's taxonomy.
        TAXONOMY_CODE,
        // The new value is set as scientific name of the node's taxonomy.
        TAXONOMY_SCIENTIFIC_NAME;
    }
457 }