8aa59d844ed953524e9dd00321917e11b0a8d5f7
[jalview.git] / forester / java / src / org / forester / tools / PhylogenyDecorator.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
25
26 package org.forester.tools;
27
28 import java.io.File;
29 import java.io.IOException;
30 import java.util.HashMap;
31 import java.util.Map;
32 import java.util.regex.Matcher;
33
34 import org.forester.io.parsers.nhx.NHXFormatException;
35 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
36 import org.forester.io.parsers.util.ParserUtils;
37 import org.forester.phylogeny.Phylogeny;
38 import org.forester.phylogeny.PhylogenyNode;
39 import org.forester.phylogeny.data.Accession;
40 import org.forester.phylogeny.data.Annotation;
41 import org.forester.phylogeny.data.DomainArchitecture;
42 import org.forester.phylogeny.data.Identifier;
43 import org.forester.phylogeny.data.Sequence;
44 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
45 import org.forester.sequence.MolecularSequence.TYPE;
46 import org.forester.util.BasicTable;
47 import org.forester.util.BasicTableParser;
48 import org.forester.util.ForesterUtil;
49
50 public final class PhylogenyDecorator {
51
52     final private static String TP_NODE_NAME            = "NODE_NAME";
53     final private static String TP_SEQ_ACCESSION        = "SEQ_ACCESSION";
54     final private static String TP_SEQ_ACCESSION_SOURCE = "SEQ_ACCESSION_SOURCE";
55     final private static String TP_SEQ_ANNOTATION_DESC  = "SEQ_ANNOTATION_DESC";
56     final private static String TP_SEQ_ANNOTATION_REF   = "SEQ_ANNOTATION_REF";
57     final private static String TP_SEQ_MOL_SEQ          = "SEQ_MOL_SEQ";
58     final private static String TP_SEQ_NAME             = "SEQ_NAME";
59     final private static String TP_SEQ_SYMBOL           = "SEQ_SYMBOL";
60     final private static String TP_TAXONOMY_CN          = "TAXONOMY_CN";
61     // From evoruby/lib/evo/apps/tseq_taxonomy_processor.rb:
62     final private static String TP_TAXONOMY_CODE        = "TAXONOMY_CODE";
63     final private static String TP_TAXONOMY_ID          = "TAXONOMY_ID";
64     final private static String TP_TAXONOMY_ID_PROVIDER = "TAXONOMY_ID_PROVIDER";
65     final private static String TP_TAXONOMY_SN          = "TAXONOMY_SN";
66     final private static String TP_TAXONOMY_SYN         = "TAXONOMY_SYN";
67
68     private PhylogenyDecorator() {
69         // Not needed.
70     }
71
72     public static void decorate( final Phylogeny phylogeny,
73                                  final Map<String, Map<String, String>> map,
74                                  final boolean picky ) throws IllegalArgumentException, PhyloXmlDataFormatException {
75         for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
76             final PhylogenyNode node = iter.next();
77             final String name = node.getName();
78             if ( !ForesterUtil.isEmpty( name ) ) {
79                 if ( map.containsKey( name ) ) {
80                     final Map<String, String> new_values = map.get( name );
81                     if ( new_values != null ) {
82                         if ( new_values.containsKey( TP_TAXONOMY_CODE ) ) {
83                             ForesterUtil.ensurePresenceOfTaxonomy( node );
84                             node.getNodeData().getTaxonomy().setTaxonomyCode( new_values.get( TP_TAXONOMY_CODE ) );
85                         }
86                         if ( new_values.containsKey( TP_TAXONOMY_ID )
87                                 && new_values.containsKey( TP_TAXONOMY_ID_PROVIDER ) ) {
88                             ForesterUtil.ensurePresenceOfTaxonomy( node );
89                             node.getNodeData()
90                             .getTaxonomy()
91                             .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ),
92                                                             new_values.get( TP_TAXONOMY_ID_PROVIDER ) ) );
93                         }
94                         else if ( new_values.containsKey( TP_TAXONOMY_ID ) ) {
95                             ForesterUtil.ensurePresenceOfTaxonomy( node );
96                             node.getNodeData().getTaxonomy()
97                             .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ) ) );
98                         }
99                         if ( new_values.containsKey( TP_TAXONOMY_SN ) ) {
100                             ForesterUtil.ensurePresenceOfTaxonomy( node );
101                             node.getNodeData().getTaxonomy().setScientificName( new_values.get( TP_TAXONOMY_SN ) );
102                         }
103                         if ( new_values.containsKey( TP_TAXONOMY_CN ) ) {
104                             ForesterUtil.ensurePresenceOfTaxonomy( node );
105                             node.getNodeData().getTaxonomy().setCommonName( new_values.get( TP_TAXONOMY_CN ) );
106                         }
107                         if ( new_values.containsKey( TP_TAXONOMY_SYN ) ) {
108                             ForesterUtil.ensurePresenceOfTaxonomy( node );
109                             node.getNodeData().getTaxonomy().getSynonyms().add( new_values.get( TP_TAXONOMY_SYN ) );
110                         }
111                         if ( new_values.containsKey( TP_SEQ_ACCESSION )
112                                 && new_values.containsKey( TP_SEQ_ACCESSION_SOURCE ) ) {
113                             ForesterUtil.ensurePresenceOfSequence( node );
114                             node.getNodeData()
115                             .getSequence()
116                             .setAccession( new Accession( new_values.get( TP_SEQ_ACCESSION ),
117                                                           new_values.get( TP_SEQ_ACCESSION_SOURCE ) ) );
118                         }
119                         if ( new_values.containsKey( TP_SEQ_ANNOTATION_DESC ) ) {
120                             ForesterUtil.ensurePresenceOfSequence( node );
121                             final Annotation ann = new Annotation();
122                             ann.setDesc( new_values.get( TP_SEQ_ANNOTATION_DESC ) );
123                             node.getNodeData().getSequence().addAnnotation( ann );
124                         }
125                         if ( new_values.containsKey( TP_SEQ_ANNOTATION_REF ) ) {
126                             ForesterUtil.ensurePresenceOfSequence( node );
127                             final Annotation ann = new Annotation( new_values.get( TP_SEQ_ANNOTATION_REF ) );
128                             node.getNodeData().getSequence().addAnnotation( ann );
129                         }
130                         if ( new_values.containsKey( TP_SEQ_SYMBOL ) ) {
131                             ForesterUtil.ensurePresenceOfSequence( node );
132                             node.getNodeData().getSequence().setSymbol( new_values.get( TP_SEQ_SYMBOL ) );
133                         }
134                         if ( new_values.containsKey( TP_SEQ_NAME ) ) {
135                             ForesterUtil.ensurePresenceOfSequence( node );
136                             node.getNodeData().getSequence().setName( new_values.get( TP_SEQ_NAME ) );
137                         }
138                         if ( new_values.containsKey( TP_SEQ_MOL_SEQ ) ) {
139                             ForesterUtil.ensurePresenceOfSequence( node );
140                             node.getNodeData().getSequence().setMolecularSequence( new_values.get( TP_SEQ_MOL_SEQ ) );
141                         }
142                         if ( new_values.containsKey( TP_NODE_NAME ) ) {
143                             node.setName( new_values.get( TP_NODE_NAME ) );
144                         }
145                     } // if ( new_values != null )
146                 } // if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) )
147                 else if ( picky ) {
148                     throw new IllegalArgumentException( "\"" + name + "\" not found in name map" );
149                 }
150             }
151         }
152     }
153
154     public static String decorate( final Phylogeny phylogeny,
155                                    final Map<String, String> map,
156                                    final FIELD field,
157                                    final boolean extract_bracketed_scientific_name,
158                                    final boolean extract_bracketed_tax_code,
159                                    final boolean picky,
160                                    final boolean cut_name_after_space,
161                                    final boolean trim_after_tilde,
162                                    final boolean verbose ) throws IllegalArgumentException, NHXFormatException,
163                                    PhyloXmlDataFormatException {
164         return PhylogenyDecorator.decorate( phylogeny,
165                                             map,
166                                             field,
167                                             extract_bracketed_scientific_name,
168                                             extract_bracketed_tax_code,
169                                             picky,
170                                             null,
171                                             cut_name_after_space,
172                                             trim_after_tilde,
173                                             verbose );
174     }
175
176     /**
177      *
178      *
179      *
180      * @param phylogeny
181      * @param map
182      *            maps names (in phylogeny) to new values if intermediate_map is
183      *            null otherwise maps intermediate value to new value
184      * @param field
185      * @param picky
186      * @param intermediate_map
187      *            maps name (in phylogeny) to a intermediate value
188      * @throws IllegalArgumentException
189      * @throws PhyloXmlDataFormatException
190      */
191     public static String decorate( final Phylogeny phylogeny,
192                                    final Map<String, String> map,
193                                    final FIELD field,
194                                    final boolean extract_bracketed_scientific_name,
195                                    final boolean extract_bracketed_tax_code,
196                                    final boolean picky,
197                                    final Map<String, String> intermediate_map,
198                                    final boolean cut_name_after_space,
199                                    final boolean trim_after_tilde,
200                                    final boolean verbose ) throws IllegalArgumentException, PhyloXmlDataFormatException {
201         if ( extract_bracketed_scientific_name && ( field == FIELD.TAXONOMY_SCIENTIFIC_NAME ) ) {
202             throw new IllegalArgumentException( "attempt to extract bracketed scientific name together with data field pointing to scientific name" );
203         }
204         if ( map.isEmpty() ) {
205             throw new IllegalArgumentException( "map is empty" );
206         }
207         int ext_nodes = 0;
208         int ext_nodes_updated = 0;
209         int int_nodes = 0;
210         int int_nodes_updated = 0;
211         for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
212             final PhylogenyNode node = iter.next();
213             if ( node.isExternal() ) {
214                 ++ext_nodes;
215             }
216             else {
217                 ++int_nodes;
218             }
219             String name = node.getName();
220             if ( picky && node.isExternal() && ForesterUtil.isEmpty( name ) ) {
221                 throw new IllegalArgumentException( "external node with no name present" );
222             }
223             String tilde_annotation = null;
224             final String orig_name = name;
225             if ( trim_after_tilde && ( name.indexOf( '~' ) > 0 ) ) {
226                 final int ti = name.indexOf( '~' );
227                 tilde_annotation = name.substring( ti );
228                 name = name.substring( 0, ti );
229                 if ( node.isExternal() && ForesterUtil.isEmpty( name ) ) {
230                     throw new IllegalArgumentException( "external node with illegal name: " + orig_name );
231                 }
232             }
233             if ( !ForesterUtil.isEmpty( name ) ) {
234                 if ( intermediate_map != null ) {
235                     name = PhylogenyDecorator.extractIntermediate( intermediate_map, name, verbose );
236                 }
237                 if ( ( field == FIELD.MOL_SEQ ) && !map.containsKey( name ) ) {
238                     name = orig_name;
239                 }
240                 if ( map.containsKey( name ) ) {
241                     String new_value = map.get( name ).trim().replaceAll( "/\\s+/", " " );
242                     if ( !ForesterUtil.isEmpty( new_value ) ) {
243                         if ( node.isExternal() ) {
244                             ++ext_nodes_updated;
245                         }
246                         else {
247                             ++int_nodes_updated;
248                         }
249                         if ( extract_bracketed_scientific_name && new_value.endsWith( "]" ) ) {
250                             new_value = extractBracketedScientificNames( node, new_value );
251                         }
252                         else if ( extract_bracketed_tax_code ) {
253                             if ( ParserUtils.TAXOMONY_CODE_PATTERN_BRACKETED.matcher( new_value ).find() ) {
254                                 new_value = extractBracketedTaxCodes( node, new_value );
255                             }
256                             else if ( picky ) {
257                                 throw new IllegalArgumentException( " could not get taxonomy from \"" + new_value
258                                                                     + "\"" );
259                             }
260                         }
261                         switch ( field ) {
262                             case MOL_SEQ:
263                                 if ( verbose ) {
264                                     System.out.println( name + ": " + new_value );
265                                 }
266                                 if ( !node.getNodeData().isHasSequence() ) {
267                                     node.getNodeData().setSequence( new Sequence() );
268                                 }
269                                 node.getNodeData().getSequence().setMolecularSequence( new_value );
270                                 final TYPE type = ForesterUtil.guessMolecularSequenceType( new_value );
271                                 if ( type != null ) {
272                                     if ( type == TYPE.AA ) {
273                                         node.getNodeData().getSequence().setType( "protein" );
274                                     }
275                                     else if ( type == TYPE.DNA ) {
276                                         node.getNodeData().getSequence().setType( "dna" );
277                                     }
278                                     else if ( type == TYPE.RNA ) {
279                                         node.getNodeData().getSequence().setType( "rna" );
280                                     }
281                                 }
282                                 break;
283                             case SEQUENCE_ANNOTATION_DESC:
284                                 if ( verbose ) {
285                                     System.out.println( name + ": " + new_value );
286                                 }
287                                 if ( !node.getNodeData().isHasSequence() ) {
288                                     node.getNodeData().setSequence( new Sequence() );
289                                 }
290                                 final Annotation annotation = new Annotation();
291                                 annotation.setDesc( new_value );
292                                 node.getNodeData().getSequence().addAnnotation( annotation );
293                                 break;
294                             case DOMAIN_STRUCTURE:
295                                 if ( verbose ) {
296                                     System.out.println( name + ": " + new_value );
297                                 }
298                                 if ( !node.getNodeData().isHasSequence() ) {
299                                     node.getNodeData().setSequence( new Sequence() );
300                                 }
301                                 node.getNodeData().getSequence()
302                                 .setDomainArchitecture( new DomainArchitecture( new_value ) );
303                                 break;
304                             case TAXONOMY_CODE:
305                                 if ( verbose ) {
306                                     System.out.println( name + ": " + new_value );
307                                 }
308                                 ForesterUtil.ensurePresenceOfTaxonomy( node );
309                                 node.getNodeData().getTaxonomy().setTaxonomyCode( new_value );
310                                 break;
311                             case TAXONOMY_SCIENTIFIC_NAME:
312                                 if ( verbose ) {
313                                     System.out.println( name + ": " + new_value );
314                                 }
315                                 ForesterUtil.ensurePresenceOfTaxonomy( node );
316                                 node.getNodeData().getTaxonomy().setScientificName( new_value );
317                                 break;
318                             case SEQUENCE_NAME:
319                                 if ( trim_after_tilde ) {
320                                     new_value = addTildeAnnotation( tilde_annotation, new_value );
321                                 }
322                                 if ( verbose ) {
323                                     System.out.println( name + ": " + new_value );
324                                 }
325                                 if ( !node.getNodeData().isHasSequence() ) {
326                                     node.getNodeData().setSequence( new Sequence() );
327                                 }
328                                 node.getNodeData().getSequence().setName( new_value );
329                                 break;
330                             case NODE_NAME:
331                                 if ( verbose ) {
332                                     System.out.print( name + " -> " );
333                                 }
334                                 if ( cut_name_after_space ) {
335                                     if ( verbose ) {
336                                         System.out.print( new_value + " -> " );
337                                     }
338                                     new_value = PhylogenyDecorator.deleteAtFirstSpace( new_value );
339                                 }
340                                 if ( trim_after_tilde ) {
341                                     new_value = addTildeAnnotation( tilde_annotation, new_value );
342                                 }
343                                 if ( verbose ) {
344                                     System.out.println( new_value );
345                                 }
346                                 node.setName( new_value );
347                                 break;
348                             default:
349                                 throw new RuntimeException( "unknown field \"" + field + "\"" );
350                         }
351                     }
352                     else {
353                         throw new IllegalArgumentException( "node name \"" + name + "\" maps to empty value" );
354                     }
355                 }
356                 else if ( picky ) {
357                     throw new IllegalArgumentException( "node name \"" + name + "\" not found in map" );
358                 }
359             }
360         }
361         return "updated " + ext_nodes_updated + "/" + ext_nodes + " external nodes, updated " + int_nodes_updated + "/"
362         + int_nodes + " internal nodes";
363     }
364
365     public static Map<String, Map<String, String>> parseMappingTable( final File mapping_table_file )
366             throws IOException {
367         final Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
368         BasicTable<String> mapping_table = null;
369         mapping_table = BasicTableParser.parse( mapping_table_file, '\t', false, false );
370         for( int row = 0; row < mapping_table.getNumberOfRows(); ++row ) {
371             final Map<String, String> row_map = new HashMap<String, String>();
372             String name = null;
373             for( int col = 0; col < mapping_table.getNumberOfColumns(); ++col ) {
374                 final String table_cell = mapping_table.getValue( col, row );
375                 if ( col == 0 ) {
376                     name = table_cell;
377                 }
378                 else if ( table_cell != null ) {
379                     final String key = table_cell.substring( 0, table_cell.indexOf( ':' ) );
380                     final String val = table_cell.substring( table_cell.indexOf( ':' ) + 1, table_cell.length() );
381                     row_map.put( key, val );
382                 }
383             }
384             map.put( name, row_map );
385         }
386         return map;
387     }
388
389     private final static String addTildeAnnotation( final String tilde_annotation, final String new_value ) {
390         if ( ForesterUtil.isEmpty( tilde_annotation ) ) {
391             return new_value;
392         }
393         return new_value + tilde_annotation;
394     }
395
396     private static String deleteAtFirstSpace( final String name ) {
397         final int first_space = name.indexOf( " " );
398         if ( first_space > 1 ) {
399             return name.substring( 0, first_space ).trim();
400         }
401         return name;
402     }
403
404     private static String extractBracketedScientificNames( final PhylogenyNode node, final String new_value ) {
405         final int i = new_value.lastIndexOf( "[" );
406         final String scientific_name = new_value.substring( i + 1, new_value.length() - 1 );
407         ForesterUtil.ensurePresenceOfTaxonomy( node );
408         node.getNodeData().getTaxonomy().setScientificName( scientific_name );
409         return new_value.substring( 0, i - 1 ).trim();
410     }
411
412     private static String extractBracketedTaxCodes( final PhylogenyNode node, final String new_value ) {
413         final StringBuilder sb = new StringBuilder();
414         sb.append( new_value );
415         final String tc = extractBracketedTaxCodes( sb );
416         if ( !ForesterUtil.isEmpty( tc ) ) {
417             ForesterUtil.ensurePresenceOfTaxonomy( node );
418             try {
419                 node.getNodeData().getTaxonomy().setTaxonomyCode( tc );
420             }
421             catch ( final PhyloXmlDataFormatException e ) {
422                 throw new IllegalArgumentException( "illegal format for taxonomy code: " + tc );
423             }
424             return sb.toString().trim();
425         }
426         return new_value;
427     }
428
429     private static String extractBracketedTaxCodes( final StringBuilder sb ) {
430         final Matcher m = ParserUtils.TAXOMONY_CODE_PATTERN_BRACKETED.matcher( sb );
431         if ( m.find() ) {
432             final String tc = m.group( 1 );
433             sb.delete( m.start( 1 ) - 1, m.end( 1 ) + 1 );
434             return tc;
435         }
436         return null;
437     }
438
439     private static String extractIntermediate( final Map<String, String> intermediate_map,
440                                                final String name,
441                                                final boolean verbose ) {
442         String new_name = null;
443         if ( verbose ) {
444             System.out.print( name + " => " );
445         }
446         if ( intermediate_map.containsKey( name ) ) {
447             new_name = intermediate_map.get( name );
448             if ( ForesterUtil.isEmpty( new_name ) ) {
449                 throw new IllegalArgumentException( "\"" + name + "\" maps to null or empty string in secondary map" );
450             }
451         }
452         else {
453             throw new IllegalArgumentException( "\"" + name + "\" not found in name secondary map" );
454         }
455         if ( verbose ) {
456             System.out.println( new_name + "  " );
457         }
458         return new_name;
459     }
460
461     public static enum FIELD {
462         DOMAIN_STRUCTURE,
463         MOL_SEQ,
464         NODE_NAME,
465         SEQUENCE_ANNOTATION_DESC,
466         SEQUENCE_NAME,
467         TAXONOMY_CODE,
468         TAXONOMY_SCIENTIFIC_NAME;
469     }
470 }