rio - gsdir work...
[jalview.git] / forester / java / src / org / forester / tools / PhylogenyDecorator.java
index f01701a..7e70444 100644 (file)
@@ -29,10 +29,8 @@ import java.io.File;
 import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;
-import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import org.forester.archaeopteryx.AptxUtil;
 import org.forester.io.parsers.nhx.NHXFormatException;
 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
 import org.forester.phylogeny.Phylogeny;
@@ -68,6 +66,7 @@ public final class PhylogenyDecorator {
                                                                             .compile( "^([a-fA-Z0-9]{1,5})_([A-Z0-9]{2,4}[A-Z])(\\d{1,4})$" );
     public final static boolean  SANITIZE                           = false;
     public final static boolean  VERBOSE                            = true;
+    private static final boolean CUT                                = true;
 
     private PhylogenyDecorator() {
         // Not needed.
@@ -92,63 +91,63 @@ public final class PhylogenyDecorator {
                     }
                     if ( new_values != null ) {
                         if ( new_values.containsKey( TP_TAXONOMY_CODE ) ) {
-                            AptxUtil.ensurePresenceOfTaxonomy( node );
+                            ForesterUtil.ensurePresenceOfTaxonomy( node );
                             node.getNodeData().getTaxonomy().setTaxonomyCode( new_values.get( TP_TAXONOMY_CODE ) );
                         }
                         if ( new_values.containsKey( TP_TAXONOMY_ID )
                                 && new_values.containsKey( TP_TAXONOMY_ID_PROVIDER ) ) {
-                            AptxUtil.ensurePresenceOfTaxonomy( node );
+                            ForesterUtil.ensurePresenceOfTaxonomy( node );
                             node.getNodeData()
                                     .getTaxonomy()
                                     .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ),
                                                                     new_values.get( TP_TAXONOMY_ID_PROVIDER ) ) );
                         }
                         else if ( new_values.containsKey( TP_TAXONOMY_ID ) ) {
-                            AptxUtil.ensurePresenceOfTaxonomy( node );
+                            ForesterUtil.ensurePresenceOfTaxonomy( node );
                             node.getNodeData().getTaxonomy()
                                     .setIdentifier( new Identifier( new_values.get( TP_TAXONOMY_ID ) ) );
                         }
                         if ( new_values.containsKey( TP_TAXONOMY_SN ) ) {
-                            AptxUtil.ensurePresenceOfTaxonomy( node );
+                            ForesterUtil.ensurePresenceOfTaxonomy( node );
                             node.getNodeData().getTaxonomy().setScientificName( new_values.get( TP_TAXONOMY_SN ) );
                         }
                         if ( new_values.containsKey( TP_TAXONOMY_CN ) ) {
-                            AptxUtil.ensurePresenceOfTaxonomy( node );
+                            ForesterUtil.ensurePresenceOfTaxonomy( node );
                             node.getNodeData().getTaxonomy().setCommonName( new_values.get( TP_TAXONOMY_CN ) );
                         }
                         if ( new_values.containsKey( TP_TAXONOMY_SYN ) ) {
-                            AptxUtil.ensurePresenceOfTaxonomy( node );
+                            ForesterUtil.ensurePresenceOfTaxonomy( node );
                             node.getNodeData().getTaxonomy().getSynonyms().add( new_values.get( TP_TAXONOMY_SYN ) );
                         }
                         if ( new_values.containsKey( TP_SEQ_ACCESSION )
                                 && new_values.containsKey( TP_SEQ_ACCESSION_SOURCE ) ) {
-                            AptxUtil.ensurePresenceOfSequence( node );
+                            ForesterUtil.ensurePresenceOfSequence( node );
                             node.getNodeData()
                                     .getSequence()
                                     .setAccession( new Accession( new_values.get( TP_SEQ_ACCESSION ),
                                                                   new_values.get( TP_SEQ_ACCESSION_SOURCE ) ) );
                         }
                         if ( new_values.containsKey( TP_SEQ_ANNOTATION_DESC ) ) {
-                            AptxUtil.ensurePresenceOfSequence( node );
-                            final Annotation ann = new Annotation( "?" );
+                            ForesterUtil.ensurePresenceOfSequence( node );
+                            final Annotation ann = new Annotation();
                             ann.setDesc( new_values.get( TP_SEQ_ANNOTATION_DESC ) );
                             node.getNodeData().getSequence().addAnnotation( ann );
                         }
                         if ( new_values.containsKey( TP_SEQ_ANNOTATION_REF ) ) {
-                            AptxUtil.ensurePresenceOfSequence( node );
+                            ForesterUtil.ensurePresenceOfSequence( node );
                             final Annotation ann = new Annotation( new_values.get( TP_SEQ_ANNOTATION_REF ) );
                             node.getNodeData().getSequence().addAnnotation( ann );
                         }
                         if ( new_values.containsKey( TP_SEQ_SYMBOL ) ) {
-                            AptxUtil.ensurePresenceOfSequence( node );
+                            ForesterUtil.ensurePresenceOfSequence( node );
                             node.getNodeData().getSequence().setSymbol( new_values.get( TP_SEQ_SYMBOL ) );
                         }
                         if ( new_values.containsKey( TP_SEQ_NAME ) ) {
-                            AptxUtil.ensurePresenceOfSequence( node );
+                            ForesterUtil.ensurePresenceOfSequence( node );
                             node.getNodeData().getSequence().setName( new_values.get( TP_SEQ_NAME ) );
                         }
                         if ( new_values.containsKey( TP_SEQ_MOL_SEQ ) ) {
-                            AptxUtil.ensurePresenceOfSequence( node );
+                            ForesterUtil.ensurePresenceOfSequence( node );
                             node.getNodeData().getSequence().setMolecularSequence( new_values.get( TP_SEQ_MOL_SEQ ) );
                         }
                         if ( new_values.containsKey( TP_NODE_NAME ) ) {
@@ -157,7 +156,6 @@ public final class PhylogenyDecorator {
                     } // if ( new_values != null ) 
                 } // if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) )
                 else if ( picky ) {
-                    System.out.println( map.toString() );
                     throw new IllegalArgumentException( "\"" + name + "\" not found in name map" );
                 }
             }
@@ -183,24 +181,26 @@ public final class PhylogenyDecorator {
                                  final Map<String, String> map,
                                  final FIELD field,
                                  final boolean extract_bracketed_scientific_name,
+                                 final boolean extract_bracketed_tax_code,
                                  final boolean picky,
                                  final boolean cut_name_after_space,
                                  final boolean process_name_intelligently,
                                  final boolean process_similar_to,
                                  final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
-                                 final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException,
-            NHXFormatException, PhyloXmlDataFormatException {
+                                 final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
+            PhyloXmlDataFormatException {
         PhylogenyDecorator.decorate( phylogeny,
                                      map,
                                      field,
                                      extract_bracketed_scientific_name,
+                                     extract_bracketed_tax_code,
                                      picky,
                                      null,
                                      cut_name_after_space,
                                      process_name_intelligently,
                                      process_similar_to,
                                      numbers_of_chars_allowed_to_remove_if_not_found_in_map,
-                                     move_domain_numbers_at_end_to_middle );
+                                     trim_after_tilde );
     }
 
     /**
@@ -222,24 +222,46 @@ public final class PhylogenyDecorator {
                                  final Map<String, String> map,
                                  final FIELD field,
                                  final boolean extract_bracketed_scientific_name,
+                                 final boolean extract_bracketed_tax_code,
                                  final boolean picky,
                                  final Map<String, String> intermediate_map,
                                  final boolean cut_name_after_space,
                                  final boolean process_name_intelligently,
                                  final boolean process_similar_to,
                                  final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
-                                 final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException,
+                                 final boolean trim_after_tilde ) throws IllegalArgumentException,
             PhyloXmlDataFormatException {
         if ( extract_bracketed_scientific_name && ( field == FIELD.TAXONOMY_SCIENTIFIC_NAME ) ) {
-            throw new IllegalArgumentException( "Attempt to extract bracketed scientific name together with data field pointing to scientific name" );
+            throw new IllegalArgumentException( "attempt to extract bracketed scientific name together with data field pointing to scientific name" );
         }
         for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) {
             final PhylogenyNode node = iter.next();
             String name = node.getName();
+            if ( trim_after_tilde && ( name.indexOf( '~' ) > 0 ) ) {
+                name = name.substring( 0, name.indexOf( '~' ) );
+            }
             if ( !ForesterUtil.isEmpty( name ) ) {
                 if ( intermediate_map != null ) {
                     name = PhylogenyDecorator.extractIntermediate( intermediate_map, name );
                 }
+                // int space_index = name.indexOf( " " );
+                //                if ( CUT && space_index > 0 ) {
+                //                    int y = name.lastIndexOf( "|" );
+                //                    name = name.substring( y + 1, space_index );
+                //                }
+                //                String new_value = null;
+                //                for( String key : map.keySet() ) {
+                //                    if ( key.indexOf( name ) >= 0 ) {
+                //                        if ( new_value == null ) {
+                //                            new_value = map.get( key );
+                //                        }
+                //                        else {
+                //                            System.out.println( name + " is not unique" );
+                //                            System.exit( -1 );
+                //                        }
+                //                    }
+                //                }
+                // if ( new_value != null ) {
                 if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) {
                     String new_value = map.get( name );
                     int x = 0;
@@ -252,7 +274,10 @@ public final class PhylogenyDecorator {
                         new_value = new_value.trim();
                         new_value.replaceAll( "/\\s+/", " " );
                         if ( extract_bracketed_scientific_name && new_value.endsWith( "]" ) ) {
-                            extractBracketedScientificNames( node, new_value );
+                            new_value = extractBracketedScientificNames( node, new_value );
+                        }
+                        else if ( extract_bracketed_tax_code && new_value.endsWith( "]" ) ) {
+                            new_value = extractBracketedTaxCodes( node, new_value );
                         }
                         switch ( field ) {
                             case SEQUENCE_ANNOTATION_DESC:
@@ -280,14 +305,14 @@ public final class PhylogenyDecorator {
                                 if ( PhylogenyDecorator.VERBOSE ) {
                                     System.out.println( name + ": " + new_value );
                                 }
-                                AptxUtil.ensurePresenceOfTaxonomy( node );
+                                ForesterUtil.ensurePresenceOfTaxonomy( node );
                                 node.getNodeData().getTaxonomy().setTaxonomyCode( new_value );
                                 break;
                             case TAXONOMY_SCIENTIFIC_NAME:
                                 if ( PhylogenyDecorator.VERBOSE ) {
                                     System.out.println( name + ": " + new_value );
                                 }
-                                AptxUtil.ensurePresenceOfTaxonomy( node );
+                                ForesterUtil.ensurePresenceOfTaxonomy( node );
                                 node.getNodeData().getTaxonomy().setScientificName( new_value );
                                 break;
                             case SEQUENCE_NAME:
@@ -332,9 +357,6 @@ public final class PhylogenyDecorator {
                             default:
                                 throw new RuntimeException( "unknown field \"" + field + "\"" );
                         }
-                        if ( move_domain_numbers_at_end_to_middle && ( field != FIELD.NODE_NAME ) ) {
-                            node.setName( moveDomainNumbersAtEnd( node.getName() ) );
-                        }
                     }
                 }
                 else if ( picky ) {
@@ -349,11 +371,9 @@ public final class PhylogenyDecorator {
                                  final boolean picky,
                                  final int numbers_of_chars_allowed_to_remove_if_not_found_in_map )
             throws IllegalArgumentException, NHXFormatException, PhyloXmlDataFormatException {
-        for( int i = 0; i < phylogenies.length; ++i ) {
-            PhylogenyDecorator.decorate( phylogenies[ i ],
-                                         map,
-                                         picky,
-                                         numbers_of_chars_allowed_to_remove_if_not_found_in_map );
+        for( final Phylogeny phylogenie : phylogenies ) {
+            PhylogenyDecorator
+                    .decorate( phylogenie, map, picky, numbers_of_chars_allowed_to_remove_if_not_found_in_map );
         }
     }
 
@@ -361,24 +381,26 @@ public final class PhylogenyDecorator {
                                  final Map<String, String> map,
                                  final FIELD field,
                                  final boolean extract_bracketed_scientific_name,
+                                 final boolean extract_bracketed_tax_code,
                                  final boolean picky,
                                  final boolean cut_name_after_space,
                                  final boolean process_name_intelligently,
                                  final boolean process_similar_to,
                                  final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
-                                 final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException,
-            NHXFormatException, PhyloXmlDataFormatException {
-        for( int i = 0; i < phylogenies.length; ++i ) {
-            PhylogenyDecorator.decorate( phylogenies[ i ],
+                                 final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
+            PhyloXmlDataFormatException {
+        for( final Phylogeny phylogenie : phylogenies ) {
+            PhylogenyDecorator.decorate( phylogenie,
                                          map,
                                          field,
                                          extract_bracketed_scientific_name,
+                                         extract_bracketed_tax_code,
                                          picky,
                                          cut_name_after_space,
                                          process_name_intelligently,
                                          process_similar_to,
                                          numbers_of_chars_allowed_to_remove_if_not_found_in_map,
-                                         move_domain_numbers_at_end_to_middle );
+                                         trim_after_tilde );
         }
     }
 
@@ -386,26 +408,28 @@ public final class PhylogenyDecorator {
                                  final Map<String, String> map,
                                  final FIELD field,
                                  final boolean extract_bracketed_scientific_name,
+                                 final boolean extract_bracketed_tax_code,
                                  final boolean picky,
                                  final Map<String, String> intermediate_map,
                                  final boolean cut_name_after_space,
                                  final boolean process_name_intelligently,
                                  final boolean process_similar_to,
                                  final int numbers_of_chars_allowed_to_remove_if_not_found_in_map,
-                                 final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException,
-            NHXFormatException, PhyloXmlDataFormatException {
-        for( int i = 0; i < phylogenies.length; ++i ) {
-            PhylogenyDecorator.decorate( phylogenies[ i ],
+                                 final boolean trim_after_tilde ) throws IllegalArgumentException, NHXFormatException,
+            PhyloXmlDataFormatException {
+        for( final Phylogeny phylogenie : phylogenies ) {
+            PhylogenyDecorator.decorate( phylogenie,
                                          map,
                                          field,
                                          extract_bracketed_scientific_name,
+                                         extract_bracketed_tax_code,
                                          picky,
                                          intermediate_map,
                                          cut_name_after_space,
                                          process_name_intelligently,
                                          process_similar_to,
                                          numbers_of_chars_allowed_to_remove_if_not_found_in_map,
-                                         move_domain_numbers_at_end_to_middle );
+                                         trim_after_tilde );
         }
     }
 
@@ -417,11 +441,55 @@ public final class PhylogenyDecorator {
         return name;
     }
 
-    private static void extractBracketedScientificNames( final PhylogenyNode node, final String new_value ) {
+    /**
+     * Moves a trailing bracketed scientific name, as in "some protein [Homo sapiens]", out of a value: the
+     * text between the last "[" and the final character is set as the scientific name of the node's
+     * taxonomy, and the text in front of that "[" is returned.
+     *
+     * @param node      the node whose taxonomy receives the scientific name
+     * @param new_value the value to process; the caller only passes values ending with "]"
+     * @return the trimmed text preceding the last "[", or new_value unchanged if it contains no "["
+     */
+    private static String extractBracketedScientificNames( final PhylogenyNode node, final String new_value ) {
         final int i = new_value.lastIndexOf( "[" );
+        if ( i < 0 ) {
+            // No opening bracket, so there is no bracketed name to extract.
+            return new_value;
+        }
         final String scientific_name = new_value.substring( i + 1, new_value.length() - 1 );
-        AptxUtil.ensurePresenceOfTaxonomy( node );
+        ForesterUtil.ensurePresenceOfTaxonomy( node );
         node.getNodeData().getTaxonomy().setScientificName( scientific_name );
+        // Cut at the bracket itself and let trim() remove any separating blank: cutting at i - 1 dropped a
+        // non-blank character directly in front of "[" and threw for a value starting with "[".
+        return new_value.substring( 0, i ).trim();
+    }
+
+    /**
+     * Moves a trailing bracketed taxonomy code, e.g. "some protein [HUMAN]", out of a value: the text
+     * between the last "[" and the final character is set as the taxonomy code of the node's taxonomy, and
+     * the text in front of that "[" is returned.
+     *
+     * @param node      the node whose taxonomy receives the taxonomy code
+     * @param new_value the value to process; the caller only passes values ending with "]"
+     * @return the trimmed text preceding the last "[", or new_value unchanged if it contains no "["
+     * @throws IllegalArgumentException if setTaxonomyCode rejects the code with a PhyloXmlDataFormatException
+     */
+    private static String extractBracketedTaxCodes( final PhylogenyNode node, final String new_value ) {
+        final int i = new_value.lastIndexOf( "[" );
+        if ( i < 0 ) {
+            // No opening bracket, so there is no bracketed code to extract.
+            return new_value;
+        }
+        final String tc = new_value.substring( i + 1, new_value.length() - 1 );
+        ForesterUtil.ensurePresenceOfTaxonomy( node );
+        try {
+            node.getNodeData().getTaxonomy().setTaxonomyCode( tc );
+        }
+        catch ( final PhyloXmlDataFormatException e ) {
+            // Chain the original exception so the reason for the rejection is not lost.
+            throw new IllegalArgumentException( "illegal format for taxonomy code: " + tc, e );
+        }
+        return new_value.substring( 0, i ).trim();
     }
 
     private static String extractIntermediate( final Map<String, String> intermediate_map, final String name ) {
@@ -444,24 +482,11 @@ public final class PhylogenyDecorator {
         return new_name;
     }
 
-    private static String moveDomainNumbersAtEnd( final String node_name ) {
-        final Matcher m = NODENAME_SEQNUMBER_TAXDOMAINNUMBER.matcher( node_name );
-        if ( m.matches() ) {
-            final String seq_number = m.group( 1 );
-            final String tax = m.group( 2 );
-            final String domain_number = m.group( 3 );
-            return seq_number + "_[" + domain_number + "]_" + tax;
-        }
-        else {
-            return node_name;
-        }
-    }
-
     public static Map<String, Map<String, String>> parseMappingTable( final File mapping_table_file )
             throws IOException {
         final Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
         BasicTable<String> mapping_table = null;
-        mapping_table = BasicTableParser.parse( mapping_table_file, "\t", false );
+        mapping_table = BasicTableParser.parse( mapping_table_file, "\t", false, false );
         for( int row = 0; row < mapping_table.getNumberOfRows(); ++row ) {
             final Map<String, String> row_map = new HashMap<String, String>();
             String name = null;