inprogress
[jalview.git] / forester / java / src / org / forester / tools / PhylogenyDecorator.java
index ebc1a6a..9d865e1 100644 (file)
@@ -21,7 +21,7 @@
 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 //
 // Contact: phylosoft @ gmail . com
-// WWW: www.phylosoft.org/forester
+// WWW: https://sites.google.com/site/cmzmasek/home/software/forester
 
 package org.forester.tools;
 
@@ -30,9 +30,10 @@ import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 import org.forester.io.parsers.nhx.NHXFormatException;
+import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
+import org.forester.io.parsers.util.ParserUtils;
 import org.forester.phylogeny.Phylogeny;
 import org.forester.phylogeny.PhylogenyNode;
 import org.forester.phylogeny.data.Accession;
@@ -48,24 +49,22 @@ import org.forester.util.ForesterUtil;
 public final class PhylogenyDecorator {
 
     // From evoruby/lib/evo/apps/tseq_taxonomy_processor.rb:
-    final private static String  TP_TAXONOMY_CODE                   = "TAXONOMY_CODE";
-    final private static String  TP_TAXONOMY_ID                     = "TAXONOMY_ID";
-    final private static String  TP_TAXONOMY_ID_PROVIDER            = "TAXONOMY_ID_PROVIDER";
-    final private static String  TP_TAXONOMY_SN                     = "TAXONOMY_SN";
-    final private static String  TP_TAXONOMY_CN                     = "TAXONOMY_CN";
-    final private static String  TP_TAXONOMY_SYN                    = "TAXONOMY_SYN";
-    final private static String  TP_SEQ_SYMBOL                      = "SEQ_SYMBOL";
-    final private static String  TP_SEQ_ACCESSION                   = "SEQ_ACCESSION";
-    final private static String  TP_SEQ_ACCESSION_SOURCE            = "SEQ_ACCESSION_SOURCE";
-    final private static String  TP_SEQ_ANNOTATION_DESC             = "SEQ_ANNOTATION_DESC";
-    final private static String  TP_SEQ_ANNOTATION_REF              = "SEQ_ANNOTATION_REF";
-    final private static String  TP_SEQ_MOL_SEQ                     = "SEQ_MOL_SEQ";
-    final private static String  TP_SEQ_NAME                        = "SEQ_NAME";
-    final private static String  TP_NODE_NAME                       = "NODE_NAME";
-    final private static Pattern NODENAME_SEQNUMBER_TAXDOMAINNUMBER = Pattern
-                                                                            .compile( "^([a-fA-Z0-9]{1,5})_([A-Z0-9]{2,4}[A-Z])(\\d{1,4})$" );
-    public final static boolean  SANITIZE                           = false;
-    public final static boolean  VERBOSE                            = true;
+    // Keys looked up in the per-node value maps passed to decorate(...),
+    // e.g. new_values.containsKey( TP_NODE_NAME ).
+    private static final String TP_TAXONOMY_CODE        = "TAXONOMY_CODE";
+    private static final String TP_TAXONOMY_ID          = "TAXONOMY_ID";
+    private static final String TP_TAXONOMY_ID_PROVIDER = "TAXONOMY_ID_PROVIDER";
+    private static final String TP_TAXONOMY_SN          = "TAXONOMY_SN";
+    private static final String TP_TAXONOMY_CN          = "TAXONOMY_CN";
+    private static final String TP_TAXONOMY_SYN         = "TAXONOMY_SYN";
+    private static final String TP_SEQ_SYMBOL           = "SEQ_SYMBOL";
+    private static final String TP_SEQ_ACCESSION        = "SEQ_ACCESSION";
+    private static final String TP_SEQ_ACCESSION_SOURCE = "SEQ_ACCESSION_SOURCE";
+    private static final String TP_SEQ_ANNOTATION_DESC  = "SEQ_ANNOTATION_DESC";
+    private static final String TP_SEQ_ANNOTATION_REF   = "SEQ_ANNOTATION_REF";
+    private static final String TP_SEQ_MOL_SEQ          = "SEQ_MOL_SEQ";
+    private static final String TP_SEQ_NAME             = "SEQ_NAME";
+    private static final String TP_NODE_NAME            = "NODE_NAME";
+    // When true, new values are run through sanitize(...) before being applied.
+    public static final boolean SANITIZE                = false;
+    // When true, applied values are echoed to System.out.
+    public static final boolean VERBOSE                 = true;
 
     private PhylogenyDecorator() {
         // Not needed.
@@ -75,7 +74,7 @@ public final class PhylogenyDecorator {
                                  final Map<String, Map<String, String>> map,
                                  final boolean picky,
                                  final int numbers_of_chars_allowed_to_remove_if_not_found_in_map )
-            throws IllegalArgumentException {
+            throws IllegalArgumentException, PhyloXmlDataFormatException {
         for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
             final PhylogenyNode node = iter.next();
             final String name = node.getName();
@@ -128,7 +127,7 @@ public final class PhylogenyDecorator {
                         }
                         if ( new_values.containsKey( TP_SEQ_ANNOTATION_DESC ) ) {
                             ForesterUtil.ensurePresenceOfSequence( node );
-                            final Annotation ann = new Annotation( "?" );
+                            final Annotation ann = new Annotation();
                             ann.setDesc( new_values.get( TP_SEQ_ANNOTATION_DESC ) );
                             node.getNodeData().getSequence().addAnnotation( ann );
                         }
@@ -152,8 +151,8 @@ public final class PhylogenyDecorator {
                         if ( new_values.containsKey( TP_NODE_NAME ) ) {
                             node.setName( new_values.get( TP_NODE_NAME ) );
                         }
-                    }
-                }
+                    } // if ( new_values != null ) 
+                } // if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) )
                 else if ( picky ) {
                     throw new IllegalArgumentException( "\"" + name + "\" not found in name map" );
                 }
@@ -161,42 +160,30 @@ public final class PhylogenyDecorator {
         }
     }
 
-    /**
-     * 
-     * 
-     * 
-     * 
-     * 
-     * @param phylogeny
-     * @param map
-     *            maps names (in phylogeny) to new values
-     * @param field
-     * @param picky
-     * @throws IllegalArgumentException
-     * @throws NHXFormatException
-     */
+    /**
+     * Decorates the nodes of a single phylogeny from a direct name-to-value map.
+     * <p>
+     * Thin convenience overload: every argument is forwarded unchanged to the
+     * overload that additionally accepts an intermediate map, and
+     * <code>null</code> is passed for that intermediate map.
+     *
+     * @param phylogeny
+     *            the phylogeny whose nodes are decorated
+     * @param map
+     *            maps names (in phylogeny) to new values
+     * @param field
+     *            forwarded as-is (NOTE(review): presumably selects which node
+     *            data receives the new value -- confirm against the delegate)
+     * @param extract_bracketed_scientific_name
+     *            forwarded as-is
+     * @param extract_bracketed_tax_code
+     *            forwarded as-is
+     * @param picky
+     *            forwarded as-is
+     * @param cut_name_after_space
+     *            forwarded as-is
+     * @param process_name_intelligently
+     *            forwarded as-is
+     * @param process_similar_to
+     *            forwarded as-is
+     * @param numbers_of_chars_allowed_to_remove_if_not_found_in_map
+     *            forwarded as-is
+     * @param trim_after_tilde
+     *            forwarded as-is
+     * @throws IllegalArgumentException
+     * @throws NHXFormatException
+     * @throws PhyloXmlDataFormatException
+     */
     public static void decorate( final Phylogeny phylogeny,
                                  final Map<String, String> map,
                                  final FIELD field,
                                  final boolean extract_bracketed_scientific_name,
+                                 final boolean extract_bracketed_tax_code,
                                  final boolean picky,
                                  final boolean cut_name_after_space,
                                  final boolean process_name_intelligently,
                                  final boolean process_similar_to,
                                  final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
-                                 final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException,
-            NHXFormatException {
+                                 final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
+            PhyloXmlDataFormatException {
         PhylogenyDecorator.decorate( phylogeny,
                                      map,
                                      field,
                                      extract_bracketed_scientific_name,
+                                     extract_bracketed_tax_code,
                                      picky,
                                      null,
                                      cut_name_after_space,
                                      process_name_intelligently,
                                      process_similar_to,
                                      numbers_of_chars_allowed_to_remove_if_not_found_in_map,
-                                     move_domain_numbers_at_end_to_middle );
+                                     trim_after_tilde );
     }
 
     /**
@@ -212,24 +199,36 @@ public final class PhylogenyDecorator {
      * @param intermediate_map
      *            maps name (in phylogeny) to a intermediate value
      * @throws IllegalArgumentException
+     * @throws PhyloXmlDataFormatException 
      */
     public static void decorate( final Phylogeny phylogeny,
                                  final Map<String, String> map,
                                  final FIELD field,
                                  final boolean extract_bracketed_scientific_name,
+                                 final boolean extract_bracketed_tax_code,
                                  final boolean picky,
                                  final Map<String, String> intermediate_map,
                                  final boolean cut_name_after_space,
                                  final boolean process_name_intelligently,
                                  final boolean process_similar_to,
                                  final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
-                                 final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException {
+                                 final boolean trim_after_tilde ) throws IllegalArgumentException,
+            PhyloXmlDataFormatException {
         if ( extract_bracketed_scientific_name && ( field == FIELD.TAXONOMY_SCIENTIFIC_NAME ) ) {
-            throw new IllegalArgumentException( "Attempt to extract bracketed scientific name together with data field pointing to scientific name" );
+            throw new IllegalArgumentException( "attempt to extract bracketed scientific name together with data field pointing to scientific name" );
+        }
+        if ( map.isEmpty() ) {
+            throw new IllegalArgumentException( "map is empty" );
         }
         for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
             final PhylogenyNode node = iter.next();
             String name = node.getName();
+            String tilde_annotation = null;
+            if ( trim_after_tilde && ( name.indexOf( '~' ) > 0 ) ) {
+                final int ti = name.indexOf( '~' );
+                tilde_annotation = name.substring( ti );
+                name = name.substring( 0, ti );
+            }
             if ( !ForesterUtil.isEmpty( name ) ) {
                 if ( intermediate_map != null ) {
                     name = PhylogenyDecorator.extractIntermediate( intermediate_map, name );
@@ -246,7 +245,16 @@ public final class PhylogenyDecorator {
                         new_value = new_value.trim();
                         new_value.replaceAll( "/\\s+/", " " );
                         if ( extract_bracketed_scientific_name && new_value.endsWith( "]" ) ) {
-                            extractBracketedScientificNames( node, new_value );
+                            new_value = extractBracketedScientificNames( node, new_value );
+                        }
+                        else if ( extract_bracketed_tax_code ) {
+                            if ( ParserUtils.TAXOMONY_CODE_PATTERN_4.matcher( new_value ).find() ) {
+                                new_value = extractBracketedTaxCodes( node, new_value );
+                            }
+                            else if ( picky ) {
+                                throw new IllegalArgumentException( " could not get taxonomy from \"" + new_value
+                                        + "\"" );
+                            }
                         }
                         switch ( field ) {
                             case SEQUENCE_ANNOTATION_DESC:
@@ -285,6 +293,9 @@ public final class PhylogenyDecorator {
                                 node.getNodeData().getTaxonomy().setScientificName( new_value );
                                 break;
                             case SEQUENCE_NAME:
+                                if ( trim_after_tilde ) {
+                                    new_value = addTildeAnnotation( tilde_annotation, new_value );
+                                }
                                 if ( PhylogenyDecorator.VERBOSE ) {
                                     System.out.println( name + ": " + new_value );
                                 }
@@ -318,6 +329,9 @@ public final class PhylogenyDecorator {
                                 if ( PhylogenyDecorator.SANITIZE ) {
                                     new_value = PhylogenyDecorator.sanitize( new_value );
                                 }
+                                if ( trim_after_tilde ) {
+                                    new_value = addTildeAnnotation( tilde_annotation, new_value );
+                                }
                                 if ( PhylogenyDecorator.VERBOSE ) {
                                     System.out.println( new_value );
                                 }
@@ -326,9 +340,6 @@ public final class PhylogenyDecorator {
                             default:
                                 throw new RuntimeException( "unknown field \"" + field + "\"" );
                         }
-                        if ( move_domain_numbers_at_end_to_middle && ( field != FIELD.NODE_NAME ) ) {
-                            node.setName( moveDomainNumbersAtEnd( node.getName() ) );
-                        }
                     }
                 }
                 else if ( picky ) {
@@ -338,16 +349,21 @@ public final class PhylogenyDecorator {
         }
     }
 
+    /**
+     * Appends a previously cut-off tilde annotation to a new value.
+     *
+     * @param tilde_annotation
+     *            the text to append; when ForesterUtil.isEmpty reports it as
+     *            empty, nothing is appended
+     * @param new_value
+     *            the value to extend
+     * @return new_value, followed by tilde_annotation when one is present
+     */
+    private final static String addTildeAnnotation( final String tilde_annotation, final String new_value ) {
+        final boolean nothing_to_append = ForesterUtil.isEmpty( tilde_annotation );
+        return nothing_to_append ? new_value : new_value + tilde_annotation;
+    }
+
+    /**
+     * Applies the single-phylogeny decorate overload taking a map of
+     * per-name value maps to every element of the given array, in array order,
+     * passing the same map, picky flag and character allowance each time.
+     *
+     * @param phylogenies
+     *            the phylogenies to decorate, one after another
+     * @param map
+     *            forwarded unchanged for every phylogeny
+     * @param picky
+     *            forwarded unchanged for every phylogeny
+     * @param numbers_of_chars_allowed_to_remove_if_not_found_in_map
+     *            forwarded unchanged for every phylogeny
+     * @throws IllegalArgumentException
+     * @throws NHXFormatException
+     * @throws PhyloXmlDataFormatException
+     */
     public static void decorate( final Phylogeny[] phylogenies,
                                  final Map<String, Map<String, String>> map,
                                  final boolean picky,
                                  final int numbers_of_chars_allowed_to_remove_if_not_found_in_map )
-            throws IllegalArgumentException, NHXFormatException {
-        for( int i = 0; i < phylogenies.length; ++i ) {
-            PhylogenyDecorator.decorate( phylogenies[ i ],
-                                         map,
-                                         picky,
-                                         numbers_of_chars_allowed_to_remove_if_not_found_in_map );
+            throws IllegalArgumentException, NHXFormatException, PhyloXmlDataFormatException {
+        for( final Phylogeny phylogenie : phylogenies ) {
+            PhylogenyDecorator
+                    .decorate( phylogenie, map, picky, numbers_of_chars_allowed_to_remove_if_not_found_in_map );
         }
     }
 
@@ -355,24 +371,26 @@ public final class PhylogenyDecorator {
                                  final Map<String, String> map,
                                  final FIELD field,
                                  final boolean extract_bracketed_scientific_name,
+                                 final boolean extract_bracketed_tax_code,
                                  final boolean picky,
                                  final boolean cut_name_after_space,
                                  final boolean process_name_intelligently,
                                  final boolean process_similar_to,
                                  final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
-                                 final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException,
-            NHXFormatException {
-        for( int i = 0; i < phylogenies.length; ++i ) {
-            PhylogenyDecorator.decorate( phylogenies[ i ],
+                                 final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
+            PhyloXmlDataFormatException {
+        for( final Phylogeny phylogenie : phylogenies ) {
+            PhylogenyDecorator.decorate( phylogenie,
                                          map,
                                          field,
                                          extract_bracketed_scientific_name,
+                                         extract_bracketed_tax_code,
                                          picky,
                                          cut_name_after_space,
                                          process_name_intelligently,
                                          process_similar_to,
                                          numbers_of_chars_allowed_to_remove_if_not_found_in_map,
-                                         move_domain_numbers_at_end_to_middle );
+                                         trim_after_tilde );
         }
     }
 
@@ -380,27 +398,53 @@ public final class PhylogenyDecorator {
                                  final Map<String, String> map,
                                  final FIELD field,
                                  final boolean extract_bracketed_scientific_name,
+                                 final boolean extract_bracketed_tax_code,
                                  final boolean picky,
                                  final Map<String, String> intermediate_map,
                                  final boolean cut_name_after_space,
                                  final boolean process_name_intelligently,
                                  final boolean process_similar_to,
                                  final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
-                                 final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException,
-            NHXFormatException {
-        for( int i = 0; i < phylogenies.length; ++i ) {
-            PhylogenyDecorator.decorate( phylogenies[ i ],
+                                 final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
+            PhyloXmlDataFormatException {
+        for( final Phylogeny phylogenie : phylogenies ) {
+            PhylogenyDecorator.decorate( phylogenie,
                                          map,
                                          field,
                                          extract_bracketed_scientific_name,
+                                         extract_bracketed_tax_code,
                                          picky,
                                          intermediate_map,
                                          cut_name_after_space,
                                          process_name_intelligently,
                                          process_similar_to,
                                          numbers_of_chars_allowed_to_remove_if_not_found_in_map,
-                                         move_domain_numbers_at_end_to_middle );
+                                         trim_after_tilde );
+        }
+    }
+
+    /**
+     * Reads a tab-separated mapping table into a map of per-name value maps.
+     * <p>
+     * For every row, the cell in column 0 is used as the name. Every other
+     * non-null cell must have the form <code>KEY:value</code>; it is split at
+     * its first ':' and stored in that row's map. Null cells are skipped. If
+     * two rows carry the same name, the later row replaces the earlier one.
+     *
+     * @param mapping_table_file
+     *            the tab-separated file to read
+     * @return a map from the first-column name of each row to that row's
+     *         key/value pairs
+     * @throws IOException
+     *             if the table cannot be parsed, or if a non-null cell outside
+     *             column 0 contains no ':'
+     */
+    public static Map<String, Map<String, String>> parseMappingTable( final File mapping_table_file )
+            throws IOException {
+        final Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
+        final BasicTable<String> mapping_table = BasicTableParser.parse( mapping_table_file, '\t', false, false );
+        for( int row = 0; row < mapping_table.getNumberOfRows(); ++row ) {
+            final Map<String, String> row_map = new HashMap<String, String>();
+            String name = null;
+            for( int col = 0; col < mapping_table.getNumberOfColumns(); ++col ) {
+                final String table_cell = mapping_table.getValue( col, row );
+                if ( col == 0 ) {
+                    name = table_cell;
+                }
+                else if ( table_cell != null ) {
+                    final int colon = table_cell.indexOf( ':' );
+                    if ( colon < 0 ) {
+                        // Without this check substring( 0, -1 ) would fail with an
+                        // uninformative StringIndexOutOfBoundsException.
+                        throw new IOException( "illegal format in mapping table [" + mapping_table_file + "], row "
+                                + row + ", column " + col + ": expected \"KEY:value\" but found \"" + table_cell
+                                + "\"" );
+                    }
+                    final String key = table_cell.substring( 0, colon );
+                    final String val = table_cell.substring( colon + 1 );
+                    row_map.put( key, val );
+                }
+            }
+            map.put( name, row_map );
         }
+        return map;
     }
 
     private static String deleteAtFirstSpace( final String name ) {
@@ -411,11 +455,28 @@ public final class PhylogenyDecorator {
         return name;
     }
 
+    /**
+     * Moves a trailing bracketed scientific name, e.g. the
+     * <code>Homo sapiens</code> in <code>"some protein [Homo sapiens]"</code>,
+     * from a value into the taxonomy of the given node.
+     * <p>
+     * The caller is expected to pass a value that ends with ']'. The text
+     * between the last '[' and the final character becomes the scientific
+     * name. If the value contains no '[', neither the node nor the value is
+     * changed.
+     *
+     * @param node
+     *            the node whose taxonomy receives the scientific name; a
+     *            taxonomy is created first via ForesterUtil.ensurePresenceOfTaxonomy
+     * @param new_value
+     *            the value to extract from
+     * @return the text in front of the last '[', trimmed, or new_value
+     *         unchanged when it contains no '['
+     */
-    private static void extractBracketedScientificNames( final PhylogenyNode node, final String new_value ) {
+    private static String extractBracketedScientificNames( final PhylogenyNode node, final String new_value ) {
         final int i = new_value.lastIndexOf( "[" );
+        if ( i < 0 ) {
+            // No opening bracket: nothing to extract, and the substring calls
+            // below would otherwise store a bogus name and then throw.
+            return new_value;
+        }
         final String scientific_name = new_value.substring( i + 1, new_value.length() - 1 );
         ForesterUtil.ensurePresenceOfTaxonomy( node );
         node.getNodeData().getTaxonomy().setScientificName( scientific_name );
+        // Cut exactly at the bracket; trim() removes a separating blank if there
+        // is one, so no character is lost when there is none and a value that
+        // starts with '[' yields an empty string instead of throwing.
+        return new_value.substring( 0, i ).trim();
+    }
+
+    /**
+     * Sets the taxonomy code of the given node from the first match of
+     * ParserUtils.TAXOMONY_CODE_PATTERN_4 in a value.
+     * <p>
+     * Capture group 1 of the first match is used as the code; if the pattern
+     * does not match, the placeholder <code>"?"</code> is used instead.
+     *
+     * @param node
+     *            the node whose taxonomy receives the code; a taxonomy is
+     *            created first via ForesterUtil.ensurePresenceOfTaxonomy
+     * @param new_value
+     *            the value to search
+     * @return new_value, currently unchanged (removing the code from the
+     *         value is still marked as TODO below)
+     * @throws IllegalArgumentException
+     *             if the taxonomy rejects the code; the original
+     *             PhyloXmlDataFormatException is attached as the cause
+     */
+    private static String extractBracketedTaxCodes( final PhylogenyNode node, final String new_value ) {
+        final Matcher m = ParserUtils.TAXOMONY_CODE_PATTERN_4.matcher( new_value );
+        String tc = "?";
+        if ( m.find() ) {
+            tc = m.group( 1 );
+        }
+        ForesterUtil.ensurePresenceOfTaxonomy( node );
+        try {
+            node.getNodeData().getTaxonomy().setTaxonomyCode( tc );
+        }
+        catch ( final PhyloXmlDataFormatException e ) {
+            // Chain the parser exception so its diagnostic is not lost.
+            throw new IllegalArgumentException( "illegal format for taxonomy code: " + tc, e );
+        }
+        return new_value; //TODO //FIXME
     }
 
     private static String extractIntermediate( final Map<String, String> intermediate_map, final String name ) {
@@ -438,43 +499,6 @@ public final class PhylogenyDecorator {
         return new_name;
     }
 
-    private static String moveDomainNumbersAtEnd( final String node_name ) {
-        final Matcher m = NODENAME_SEQNUMBER_TAXDOMAINNUMBER.matcher( node_name );
-        if ( m.matches() ) {
-            final String seq_number = m.group( 1 );
-            final String tax = m.group( 2 );
-            final String domain_number = m.group( 3 );
-            return seq_number + "_[" + domain_number + "]_" + tax;
-        }
-        else {
-            return node_name;
-        }
-    }
-
-    public static Map<String, Map<String, String>> parseMappingTable( final File mapping_table_file )
-            throws IOException {
-        final Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
-        BasicTable<String> mapping_table = null;
-        mapping_table = BasicTableParser.parse( mapping_table_file, "\t", false );
-        for( int row = 0; row < mapping_table.getNumberOfRows(); ++row ) {
-            final Map<String, String> row_map = new HashMap<String, String>();
-            String name = null;
-            for( int col = 0; col < mapping_table.getNumberOfColumns(); ++col ) {
-                final String table_cell = mapping_table.getValue( col, row );
-                if ( col == 0 ) {
-                    name = table_cell;
-                }
-                else if ( table_cell != null ) {
-                    final String key = table_cell.substring( 0, table_cell.indexOf( ':' ) );
-                    final String val = table_cell.substring( table_cell.indexOf( ':' ) + 1, table_cell.length() );
-                    row_map.put( key, val );
-                }
-            }
-            map.put( name, row_map );
-        }
-        return map;
-    }
-
     private static String processNameIntelligently( final String name ) {
         final String[] s = name.split( " " );
         if ( s.length < 2 ) {