086e51a5de2252d7ac633084082c089e652610f7
[jalview.git] / forester / java / src / org / forester / tools / PhylogenyDecorator.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // All rights reserved
8 // 
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 // 
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
25
26 package org.forester.tools;
27
28 import java.io.File;
29 import java.io.IOException;
30 import java.util.HashMap;
31 import java.util.Map;
32 import java.util.regex.Matcher;
33 import java.util.regex.Pattern;
34
35 import org.forester.io.parsers.nhx.NHXFormatException;
36 import org.forester.phylogeny.Phylogeny;
37 import org.forester.phylogeny.PhylogenyNode;
38 import org.forester.phylogeny.data.Accession;
39 import org.forester.phylogeny.data.Annotation;
40 import org.forester.phylogeny.data.DomainArchitecture;
41 import org.forester.phylogeny.data.Identifier;
42 import org.forester.phylogeny.data.Sequence;
43 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
44 import org.forester.util.BasicTable;
45 import org.forester.util.BasicTableParser;
46 import org.forester.util.ForesterUtil;
47
48 public final class PhylogenyDecorator {
49
50     // From evoruby/lib/evo/apps/tseq_taxonomy_processor.rb:
51     final private static String  TP_TAXONOMY_CODE                   = "TAXONOMY_CODE";
52     final private static String  TP_TAXONOMY_ID                     = "TAXONOMY_ID";
53     final private static String  TP_TAXONOMY_ID_PROVIDER            = "TAXONOMY_ID_PROVIDER";
54     final private static String  TP_TAXONOMY_SN                     = "TAXONOMY_SN";
55     final private static String  TP_TAXONOMY_CN                     = "TAXONOMY_CN";
56     final private static String  TP_TAXONOMY_SYN                    = "TAXONOMY_SYN";
57     final private static String  TP_SEQ_SYMBOL                      = "SEQ_SYMBOL";
58     final private static String  TP_SEQ_ACCESSION                   = "SEQ_ACCESSION";
59     final private static String  TP_SEQ_ACCESSION_SOURCE            = "SEQ_ACCESSION_SOURCE";
60     final private static String  TP_SEQ_ANNOTATION_DESC             = "SEQ_ANNOTATION_DESC";
61     final private static String  TP_SEQ_ANNOTATION_REF              = "SEQ_ANNOTATION_REF";
62     final private static String  TP_SEQ_MOL_SEQ                     = "SEQ_MOL_SEQ";
63     final private static String  TP_SEQ_NAME                        = "SEQ_NAME";
64     final private static String  TP_NODE_NAME                       = "NODE_NAME";
65     final private static Pattern NODENAME_SEQNUMBER_TAXDOMAINNUMBER = Pattern
66                                                                             .compile( "^([a-fA-Z0-9]{1,5})_([A-Z0-9]{2,4}[A-Z])(\\d{1,4})$" );
67     public final static boolean  SANITIZE                           = false;
68     public final static boolean  VERBOSE                            = true;
69
70     private PhylogenyDecorator() {
71         // Not needed.
72     }
73
74     public static void decorate( final Phylogeny phylogeny,
75                                  final Map<String, Map<String, String>> map,
76                                  final boolean picky,
77                                  final int numbers_of_chars_allowed_to_remove_if_not_found_in_map )
78             throws IllegalArgumentException {
79         for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
80             final PhylogenyNode node = iter.next();
81             final String name = node.getName();
82             if ( !ForesterUtil.isEmpty( name ) ) {
83                 if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) {
84                     Map<String, String> new_values = map.get( name );
85                     int x = 0;
86                     while ( ( new_values == null ) && ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 )
87                             && ( x <= numbers_of_chars_allowed_to_remove_if_not_found_in_map ) ) {
88                         new_values = map.get( name.substring( 0, name.length() - x ) );
89                         ++x;
90                     }
91                     if ( new_values != null ) {
92                         if ( new_values.containsKey( TP_TAXONOMY_CODE ) ) {
93                             ForesterUtil.ensurePresenceOfTaxonomy( node );
94                             node.getNodeData().getTaxonomy().setTaxonomyCode( new_values.get( TP_TAXONOMY_CODE ) );
95                         }
96                         if ( new_values.containsKey( TP_TAXONOMY_ID )
97                                 && new_values.containsKey( TP_TAXONOMY_ID_PROVIDER ) ) {
98                             ForesterUtil.ensurePresenceOfTaxonomy( node );
99                             node.getNodeData().getTaxonomy().setIdentifier( new Identifier( new_values
100                                     .get( TP_TAXONOMY_ID ), new_values.get( TP_TAXONOMY_ID_PROVIDER ) ) );
101                         }
102                         else if ( new_values.containsKey( TP_TAXONOMY_ID ) ) {
103                             ForesterUtil.ensurePresenceOfTaxonomy( node );
104                             node.getNodeData().getTaxonomy().setIdentifier( new Identifier( new_values
105                                     .get( TP_TAXONOMY_ID ) ) );
106                         }
107                         if ( new_values.containsKey( TP_TAXONOMY_SN ) ) {
108                             ForesterUtil.ensurePresenceOfTaxonomy( node );
109                             node.getNodeData().getTaxonomy().setScientificName( new_values.get( TP_TAXONOMY_SN ) );
110                         }
111                         if ( new_values.containsKey( TP_TAXONOMY_CN ) ) {
112                             ForesterUtil.ensurePresenceOfTaxonomy( node );
113                             node.getNodeData().getTaxonomy().setCommonName( new_values.get( TP_TAXONOMY_CN ) );
114                         }
115                         if ( new_values.containsKey( TP_TAXONOMY_SYN ) ) {
116                             ForesterUtil.ensurePresenceOfTaxonomy( node );
117                             node.getNodeData().getTaxonomy().getSynonyms().add( new_values.get( TP_TAXONOMY_SYN ) );
118                         }
119                         if ( new_values.containsKey( TP_SEQ_ACCESSION )
120                                 && new_values.containsKey( TP_SEQ_ACCESSION_SOURCE ) ) {
121                             ForesterUtil.ensurePresenceOfSequence( node );
122                             node.getNodeData().getSequence().setAccession( new Accession( new_values
123                                     .get( TP_SEQ_ACCESSION ), new_values.get( TP_SEQ_ACCESSION_SOURCE ) ) );
124                         }
125                         if ( new_values.containsKey( TP_SEQ_ANNOTATION_DESC ) ) {
126                             ForesterUtil.ensurePresenceOfSequence( node );
127                             final Annotation ann = new Annotation( "?" );
128                             ann.setDesc( new_values.get( TP_SEQ_ANNOTATION_DESC ) );
129                             node.getNodeData().getSequence().addAnnotation( ann );
130                         }
131                         if ( new_values.containsKey( TP_SEQ_ANNOTATION_REF ) ) {
132                             ForesterUtil.ensurePresenceOfSequence( node );
133                             final Annotation ann = new Annotation( new_values.get( TP_SEQ_ANNOTATION_REF ) );
134                             node.getNodeData().getSequence().addAnnotation( ann );
135                         }
136                         if ( new_values.containsKey( TP_SEQ_SYMBOL ) ) {
137                             ForesterUtil.ensurePresenceOfSequence( node );
138                             node.getNodeData().getSequence().setSymbol( new_values.get( TP_SEQ_SYMBOL ) );
139                         }
140                         if ( new_values.containsKey( TP_SEQ_NAME ) ) {
141                             ForesterUtil.ensurePresenceOfSequence( node );
142                             node.getNodeData().getSequence().setName( new_values.get( TP_SEQ_NAME ) );
143                         }
144                         if ( new_values.containsKey( TP_SEQ_MOL_SEQ ) ) {
145                             ForesterUtil.ensurePresenceOfSequence( node );
146                             node.getNodeData().getSequence().setMolecularSequence( new_values.get( TP_SEQ_MOL_SEQ ) );
147                         }
148                         if ( new_values.containsKey( TP_NODE_NAME ) ) {
149                             node.setName( new_values.get( TP_NODE_NAME ) );
150                         }
151                     }
152                 }
153                 else if ( picky ) {
154                     throw new IllegalArgumentException( "\"" + name + "\" not found in name map" );
155                 }
156             }
157         }
158     }
159
160     /**
161      * 
162      * 
163      * 
164      * 
165      * 
166      * @param phylogeny
167      * @param map
168      *            maps names (in phylogeny) to new values
169      * @param field
170      * @param picky
171      * @throws IllegalArgumentException
172      * @throws NHXFormatException
173      */
174     public static void decorate( final Phylogeny phylogeny,
175                                  final Map<String, String> map,
176                                  final FIELD field,
177                                  final boolean extract_bracketed_scientific_name,
178                                  final boolean picky,
179                                  final boolean cut_name_after_space,
180                                  final boolean process_name_intelligently,
181                                  final boolean process_similar_to,
182                                  final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
183                                  final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException,
184             NHXFormatException {
185         PhylogenyDecorator.decorate( phylogeny,
186                                      map,
187                                      field,
188                                      extract_bracketed_scientific_name,
189                                      picky,
190                                      null,
191                                      cut_name_after_space,
192                                      process_name_intelligently,
193                                      process_similar_to,
194                                      numbers_of_chars_allowed_to_remove_if_not_found_in_map,
195                                      move_domain_numbers_at_end_to_middle );
196     }
197
198     /**
199      * 
200      * 
201      * 
202      * @param phylogeny
203      * @param map
204      *            maps names (in phylogeny) to new values if intermediate_map is
205      *            null otherwise maps intermediate value to new value
206      * @param field
207      * @param picky
208      * @param intermediate_map
209      *            maps name (in phylogeny) to a intermediate value
210      * @throws IllegalArgumentException
211      */
212     public static void decorate( final Phylogeny phylogeny,
213                                  final Map<String, String> map,
214                                  final FIELD field,
215                                  final boolean extract_bracketed_scientific_name,
216                                  final boolean picky,
217                                  final Map<String, String> intermediate_map,
218                                  final boolean cut_name_after_space,
219                                  final boolean process_name_intelligently,
220                                  final boolean process_similar_to,
221                                  final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
222                                  final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException {
223         if ( extract_bracketed_scientific_name && ( field == FIELD.TAXONOMY_SCIENTIFIC_NAME ) ) {
224             throw new IllegalArgumentException( "Attempt to extract bracketed scientific name together with data field pointing to scientific name" );
225         }
226         for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
227             final PhylogenyNode node = iter.next();
228             String name = node.getName();
229             if ( !ForesterUtil.isEmpty( name ) ) {
230                 if ( intermediate_map != null ) {
231                     name = PhylogenyDecorator.extractIntermediate( intermediate_map, name );
232                 }
233                 if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) {
234                     String new_value = map.get( name );
235                     int x = 0;
236                     while ( ( new_value == null ) && ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 )
237                             && ( x <= numbers_of_chars_allowed_to_remove_if_not_found_in_map ) ) {
238                         new_value = map.get( name.substring( 0, name.length() - x ) );
239                         ++x;
240                     }
241                     if ( new_value != null ) {
242                         new_value = new_value.trim();
243                         new_value.replaceAll( "/\\s+/", " " );
244                         if ( extract_bracketed_scientific_name && new_value.endsWith( "]" ) ) {
245                             extractBracketedScientificNames( node, new_value );
246                         }
247                         switch ( field ) {
248                             case SEQUENCE_ANNOTATION_DESC:
249                                 if ( PhylogenyDecorator.VERBOSE ) {
250                                     System.out.println( name + ": " + new_value );
251                                 }
252                                 if ( !node.getNodeData().isHasSequence() ) {
253                                     node.getNodeData().setSequence( new Sequence() );
254                                 }
255                                 final Annotation annotation = new Annotation( "?" );
256                                 annotation.setDesc( new_value );
257                                 node.getNodeData().getSequence().addAnnotation( annotation );
258                                 break;
259                             case DOMAIN_STRUCTURE:
260                                 if ( PhylogenyDecorator.VERBOSE ) {
261                                     System.out.println( name + ": " + new_value );
262                                 }
263                                 if ( !node.getNodeData().isHasSequence() ) {
264                                     node.getNodeData().setSequence( new Sequence() );
265                                 }
266                                 node.getNodeData().getSequence()
267                                         .setDomainArchitecture( new DomainArchitecture( new_value ) );
268                                 break;
269                             case TAXONOMY_CODE:
270                                 if ( PhylogenyDecorator.VERBOSE ) {
271                                     System.out.println( name + ": " + new_value );
272                                 }
273                                 ForesterUtil.ensurePresenceOfTaxonomy( node );
274                                 node.getNodeData().getTaxonomy().setTaxonomyCode( new_value );
275                                 break;
276                             case TAXONOMY_SCIENTIFIC_NAME:
277                                 if ( PhylogenyDecorator.VERBOSE ) {
278                                     System.out.println( name + ": " + new_value );
279                                 }
280                                 ForesterUtil.ensurePresenceOfTaxonomy( node );
281                                 node.getNodeData().getTaxonomy().setScientificName( new_value );
282                                 break;
283                             case SEQUENCE_NAME:
284                                 if ( PhylogenyDecorator.VERBOSE ) {
285                                     System.out.println( name + ": " + new_value );
286                                 }
287                                 if ( !node.getNodeData().isHasSequence() ) {
288                                     node.getNodeData().setSequence( new Sequence() );
289                                 }
290                                 node.getNodeData().getSequence().setName( new_value );
291                                 break;
292                             case NODE_NAME:
293                                 if ( PhylogenyDecorator.VERBOSE ) {
294                                     System.out.print( name + " -> " );
295                                 }
296                                 if ( cut_name_after_space ) {
297                                     if ( PhylogenyDecorator.VERBOSE ) {
298                                         System.out.print( new_value + " -> " );
299                                     }
300                                     new_value = PhylogenyDecorator.deleteAtFirstSpace( new_value );
301                                 }
302                                 else if ( process_name_intelligently ) {
303                                     if ( PhylogenyDecorator.VERBOSE ) {
304                                         System.out.print( new_value + " -> " );
305                                     }
306                                     new_value = PhylogenyDecorator.processNameIntelligently( new_value );
307                                 }
308                                 else if ( process_similar_to ) {
309                                     if ( PhylogenyDecorator.VERBOSE ) {
310                                         System.out.print( new_value + " -> " );
311                                     }
312                                     new_value = PhylogenyDecorator.processSimilarTo( new_value );
313                                 }
314                                 if ( PhylogenyDecorator.SANITIZE ) {
315                                     new_value = PhylogenyDecorator.sanitize( new_value );
316                                 }
317                                 if ( PhylogenyDecorator.VERBOSE ) {
318                                     System.out.println( new_value );
319                                 }
320                                 node.setName( new_value );
321                                 break;
322                             default:
323                                 throw new RuntimeException( "unknown field \"" + field + "\"" );
324                         }
325                         if ( move_domain_numbers_at_end_to_middle && ( field != FIELD.NODE_NAME ) ) {
326                             node.setName( moveDomainNumbersAtEnd( node.getName() ) );
327                         }
328                     }
329                 }
330                 else if ( picky ) {
331                     throw new IllegalArgumentException( "\"" + name + "\" not found in name map" );
332                 }
333             }
334         }
335     }
336
337     public static void decorate( final Phylogeny[] phylogenies,
338                                  final Map<String, Map<String, String>> map,
339                                  final boolean picky,
340                                  final int numbers_of_chars_allowed_to_remove_if_not_found_in_map )
341             throws IllegalArgumentException, NHXFormatException {
342         for( int i = 0; i < phylogenies.length; ++i ) {
343             PhylogenyDecorator.decorate( phylogenies[ i ],
344                                          map,
345                                          picky,
346                                          numbers_of_chars_allowed_to_remove_if_not_found_in_map );
347         }
348     }
349
350     public static void decorate( final Phylogeny[] phylogenies,
351                                  final Map<String, String> map,
352                                  final FIELD field,
353                                  final boolean extract_bracketed_scientific_name,
354                                  final boolean picky,
355                                  final boolean cut_name_after_space,
356                                  final boolean process_name_intelligently,
357                                  final boolean process_similar_to,
358                                  final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
359                                  final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException,
360             NHXFormatException {
361         for( int i = 0; i < phylogenies.length; ++i ) {
362             PhylogenyDecorator.decorate( phylogenies[ i ],
363                                          map,
364                                          field,
365                                          extract_bracketed_scientific_name,
366                                          picky,
367                                          cut_name_after_space,
368                                          process_name_intelligently,
369                                          process_similar_to,
370                                          numbers_of_chars_allowed_to_remove_if_not_found_in_map,
371                                          move_domain_numbers_at_end_to_middle );
372         }
373     }
374
375     public static void decorate( final Phylogeny[] phylogenies,
376                                  final Map<String, String> map,
377                                  final FIELD field,
378                                  final boolean extract_bracketed_scientific_name,
379                                  final boolean picky,
380                                  final Map<String, String> intermediate_map,
381                                  final boolean cut_name_after_space,
382                                  final boolean process_name_intelligently,
383                                  final boolean process_similar_to,
384                                  final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
385                                  final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException,
386             NHXFormatException {
387         for( int i = 0; i < phylogenies.length; ++i ) {
388             PhylogenyDecorator.decorate( phylogenies[ i ],
389                                          map,
390                                          field,
391                                          extract_bracketed_scientific_name,
392                                          picky,
393                                          intermediate_map,
394                                          cut_name_after_space,
395                                          process_name_intelligently,
396                                          process_similar_to,
397                                          numbers_of_chars_allowed_to_remove_if_not_found_in_map,
398                                          move_domain_numbers_at_end_to_middle );
399         }
400     }
401
402     private static String deleteAtFirstSpace( final String name ) {
403         final int first_space = name.indexOf( " " );
404         if ( first_space > 1 ) {
405             return name.substring( 0, first_space ).trim();
406         }
407         return name;
408     }
409
410     private static void extractBracketedScientificNames( final PhylogenyNode node, final String new_value ) {
411         final int i = new_value.lastIndexOf( "[" );
412         final String scientific_name = new_value.substring( i + 1, new_value.length() - 1 );
413         ForesterUtil.ensurePresenceOfTaxonomy( node );
414         node.getNodeData().getTaxonomy().setScientificName( scientific_name );
415     }
416
417     private static String extractIntermediate( final Map<String, String> intermediate_map, final String name ) {
418         String new_name = null;
419         if ( PhylogenyDecorator.VERBOSE ) {
420             System.out.print( name + " => " );
421         }
422         if ( intermediate_map.containsKey( name ) ) {
423             new_name = intermediate_map.get( name );
424             if ( ForesterUtil.isEmpty( new_name ) ) {
425                 throw new IllegalArgumentException( "\"" + name + "\" maps to null or empty string in secondary map" );
426             }
427         }
428         else {
429             throw new IllegalArgumentException( "\"" + name + "\" not found in name secondary map" );
430         }
431         if ( PhylogenyDecorator.VERBOSE ) {
432             System.out.println( new_name + "  " );
433         }
434         return new_name;
435     }
436
437     private static String moveDomainNumbersAtEnd( final String node_name ) {
438         final Matcher m = NODENAME_SEQNUMBER_TAXDOMAINNUMBER.matcher( node_name );
439         if ( m.matches() ) {
440             final String seq_number = m.group( 1 );
441             final String tax = m.group( 2 );
442             final String domain_number = m.group( 3 );
443             return seq_number + "_[" + domain_number + "]_" + tax;
444         }
445         else {
446             return node_name;
447         }
448     }
449
450     public static Map<String, Map<String, String>> parseMappingTable( final File mapping_table_file )
451             throws IOException {
452         final Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
453         BasicTable<String> mapping_table = null;
454         mapping_table = BasicTableParser.parse( mapping_table_file, "\t", false );
455         for( int row = 0; row < mapping_table.getNumberOfRows(); ++row ) {
456             final Map<String, String> row_map = new HashMap<String, String>();
457             String name = null;
458             for( int col = 0; col < mapping_table.getNumberOfColumns(); ++col ) {
459                 final String table_cell = mapping_table.getValue( col, row );
460                 if ( col == 0 ) {
461                     name = table_cell;
462                 }
463                 else if ( table_cell != null ) {
464                     final String key = table_cell.substring( 0, table_cell.indexOf( ':' ) );
465                     final String val = table_cell.substring( table_cell.indexOf( ':' ) + 1, table_cell.length() );
466                     row_map.put( key, val );
467                 }
468             }
469             map.put( name, row_map );
470         }
471         return map;
472     }
473
474     private static String processNameIntelligently( final String name ) {
475         final String[] s = name.split( " " );
476         if ( s.length < 2 ) {
477             return name;
478         }
479         else if ( ( s[ 0 ].indexOf( "_" ) > 0 ) && ( s[ 0 ].indexOf( "|" ) > 0 ) ) {
480             return s[ 0 ];
481         }
482         else if ( ( s[ 1 ].indexOf( "_" ) > 0 ) && ( s[ 1 ].indexOf( "|" ) > 0 ) ) {
483             return s[ 1 ];
484         }
485         else if ( ( s[ 0 ].indexOf( "_" ) > 0 ) && ( s[ 0 ].indexOf( "." ) > 0 ) ) {
486             return s[ 0 ];
487         }
488         else if ( ( s[ 1 ].indexOf( "_" ) > 0 ) && ( s[ 1 ].indexOf( "." ) > 0 ) ) {
489             return s[ 1 ];
490         }
491         else if ( s[ 0 ].indexOf( "_" ) > 0 ) {
492             return s[ 0 ];
493         }
494         else if ( s[ 1 ].indexOf( "_" ) > 0 ) {
495             return s[ 1 ];
496         }
497         else {
498             return s[ 0 ];
499         }
500     }
501
502     private static String processSimilarTo( final String name ) {
503         final int i = name.toLowerCase().indexOf( "similar to" );
504         String similar_to = "";
505         if ( i >= 0 ) {
506             similar_to = " similarity=" + name.substring( i + 10 ).trim();
507         }
508         final String pi = processNameIntelligently( name );
509         return pi + similar_to;
510     }
511
512     private static String sanitize( String s ) {
513         s = s.replace( ' ', '_' );
514         s = s.replace( '(', '{' );
515         s = s.replace( ')', '}' );
516         s = s.replace( '[', '{' );
517         s = s.replace( ']', '}' );
518         s = s.replace( ',', '_' );
519         return s;
520     }
521
522     public static enum FIELD {
523         NODE_NAME, SEQUENCE_ANNOTATION_DESC, DOMAIN_STRUCTURE, TAXONOMY_CODE, TAXONOMY_SCIENTIFIC_NAME, SEQUENCE_NAME;
524     }
525 }