in progress
authorcmzmasek <cmzmasek@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 26 Dec 2012 02:41:00 +0000 (02:41 +0000)
committercmzmasek <cmzmasek@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 26 Dec 2012 02:41:00 +0000 (02:41 +0000)
forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java
forester/java/src/org/forester/io/parsers/util/ParserUtils.java
forester/java/src/org/forester/test/Test.java

index db00321..95b23cb 100644 (file)
@@ -507,7 +507,7 @@ public final class MainFrameApplication extends MainFrame {
                 moveNodeNamesToSeqNames();
             }
             else if ( o == _extract_tax_code_from_node_names_jmi ) {
-                extractTaxCodeFromNodeNames();
+                extractTaxDataFromNodeNames();
             }
             else if ( o == _gsdi_item ) {
                 if ( isSubtreeDisplayed() ) {
@@ -1137,10 +1137,10 @@ public final class MainFrameApplication extends MainFrame {
         customizeJMenuItem( _move_node_names_to_seq_names_jmi );
         _move_node_names_to_seq_names_jmi.setToolTipText( "To interpret node names as sequence (protein, gene) names" );
         _tools_menu
-                .add( _extract_tax_code_from_node_names_jmi = new JMenuItem( "Extract Taxonomic Codes from Node Names" ) );
+                .add( _extract_tax_code_from_node_names_jmi = new JMenuItem( "Extract Taxonomic Codes or Ids from Node Names" ) );
         customizeJMenuItem( _extract_tax_code_from_node_names_jmi );
         _extract_tax_code_from_node_names_jmi
-                .setToolTipText( "To extract taxonomic codes (mnemonics) from nodes names in the form of 'xyz_ECOLI'" );
+                .setToolTipText( "To extract taxonomic codes (mnemonics) from nodes names in the form of 'xyz_ECOLI', or Uniprot identifiers from nodes names in the form of 'xyz_1234567'" );
         _tools_menu.addSeparator();
         _tools_menu
                 .add( _obtain_detailed_taxonomic_information_jmi = new JMenuItem( OBTAIN_DETAILED_TAXONOMIC_INFORMATION ) );
@@ -1871,24 +1871,58 @@ public final class MainFrameApplication extends MainFrame {
         }
     }
 
-    private void extractTaxCodeFromNodeNames() throws PhyloXmlDataFormatException {
+    private void extractTaxDataFromNodeNames() throws PhyloXmlDataFormatException {
+        final StringBuilder sb = new StringBuilder();
+        final StringBuilder sb_failed = new StringBuilder();
+        int counter = 0;
+        int counter_failed = 0;
         if ( getCurrentTreePanel() != null ) {
             final Phylogeny phy = getCurrentTreePanel().getPhylogeny();
             if ( ( phy != null ) && !phy.isEmpty() ) {
-                final PhylogenyNodeIterator it = phy.iteratorPostorder();
+                final PhylogenyNodeIterator it = phy.iteratorExternalForward();
                 while ( it.hasNext() ) {
                     final PhylogenyNode n = it.next();
                     final String name = n.getName().trim();
                     if ( !ForesterUtil.isEmpty( name ) ) {
+                        final String nt = ParserUtils.extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.YES );
+                        if ( !ForesterUtil.isEmpty( nt ) ) {
+                            if ( counter < 15 ) {
+                                sb.append( name + ": " + nt + "\n" );
+                            }
+                            else if ( counter == 15 ) {
+                                sb.append( "...\n" );
+                            }
+                            counter++;
+                        }
+                        else {
+                            if ( counter_failed < 15 ) {
+                                sb_failed.append( name + "\n" );
+                            }
+                            else if (  counter_failed == 15 ) {
+                                sb_failed.append( "...\n" );
+                            }
+                            counter_failed++;
+                        }
+                    }
+                }
+                if ( counter > 0 ) {
+                    String failed = "";
+                    if ( counter_failed > 0 ) {
+                        failed = "\nDid not extract taxonomic data for "  + counter_failed + " (named) external nodes:\n" + sb_failed;
                         
-                        ParserUtils.extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.YES );
-                        
-                       // final String code = ParserUtils
-                       //         .extractTaxonomyCodeFromNodeName( name, NHXParser.TAXONOMY_EXTRACTION.YES );
-                       // if ( !ForesterUtil.isEmpty( code ) ) {
-                       //     PhylogenyMethods.setTaxonomyCode( n, code );
-                       // }
                     }
+                    JOptionPane.showMessageDialog( this,
+                                                   "Successfully extracted taxonomic data from " + counter
+                                                           + " external nodes:\n" + sb.toString() + failed,
+                                                   "Taxonomic Data Extraction Successfully Completed",
+                                                   JOptionPane.INFORMATION_MESSAGE );
+                }
+                else {
+                    JOptionPane
+                            .showMessageDialog( this,
+                                                "Could not extract any taxonomic data, maybe node names are empty\nor not in the form \"XYZ_CAEEL\", \"XYZ_CAEEL/12-394\", or \"XYZ_1234567\"?",
+                                                "No Taxonomic Data Extracted",
+                                                JOptionPane.WARNING_MESSAGE );
                 }
             }
         }
index db628cf..655ec7b 100644 (file)
@@ -55,18 +55,15 @@ import org.forester.util.ForesterUtil;
 
 public final class ParserUtils {
 
-    final public static Pattern TAXOMONY_CODE_PATTERN_1  = Pattern.compile( "[A-Z0-9]{5}|RAT|PIG|PEA|CAP" );
-    final private static Pattern TAXOMONY_CODE_PATTERN_2  = Pattern
-                                                                  .compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)[^A-Za-z].*" );
-    final private static Pattern TAXOMONY_CODE_PATTERN_PF = Pattern.compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)/\\d+-\\d+" );
-
-    
-    final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_1  = Pattern.compile( "\\d{1,7}" );
-    final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_2  = Pattern
-                                                                  .compile( "(\\d{1,7})[^A-Za-z].*" );
+    final public static Pattern  TAXOMONY_CODE_PATTERN_1        = Pattern.compile( "\\b[A-Z0-9]{5}|RAT|PIG|PEA|CAP\\b" );
+    final private static Pattern TAXOMONY_CODE_PATTERN_2        = Pattern
+                                                                        .compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)[^0-9A-Za-z].*" );
+    final private static Pattern TAXOMONY_CODE_PATTERN_PF       = Pattern
+                                                                        .compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)/\\d+-\\d+" );
+    final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_1  = Pattern.compile( "\\b\\d{1,7}\\b" );
+    final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_2  = Pattern.compile( "(\\d{1,7})[^0-9A-Za-z].*" );
     final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PF = Pattern.compile( "(\\d{1,7})/\\d+-\\d+" );
 
-    
     final public static PhylogenyParser createParserDependingFileContents( final File file,
                                                                            final boolean phyloxml_validate_against_xsd )
             throws FileNotFoundException, IOException {
@@ -258,9 +255,9 @@ public final class ParserUtils {
         }
         return null;
     }
-    
+
     public final static String extractUniprotTaxonomyIdFromNodeName( final String name,
-                                                                final TAXONOMY_EXTRACTION taxonomy_extraction ) {
+                                                                     final TAXONOMY_EXTRACTION taxonomy_extraction ) {
         if ( ( name.indexOf( "_" ) > 0 )
                 && ( ( taxonomy_extraction != TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ) || ( name.indexOf( "/" ) > 4 ) ) ) {
             final String[] s = name.split( "[_\\s]" );
@@ -303,15 +300,18 @@ public final class ParserUtils {
         return readPhylogenies( new File( file_name ) );
     }
 
-    public final static void extractTaxonomyDataFromNodeName( final PhylogenyNode node,
-                                                              final NHXParser.TAXONOMY_EXTRACTION taxonomy_extraction )
+    public final static String extractTaxonomyDataFromNodeName( final PhylogenyNode node,
+                                                                final NHXParser.TAXONOMY_EXTRACTION taxonomy_extraction )
             throws PhyloXmlDataFormatException {
         final String id = extractUniprotTaxonomyIdFromNodeName( node.getName(), taxonomy_extraction );
         if ( !ForesterUtil.isEmpty( id ) ) {
             if ( !node.getNodeData().isHasTaxonomy() ) {
                 node.getNodeData().setTaxonomy( new Taxonomy() );
             }
-            node.getNodeData().getTaxonomy().setIdentifier( new Identifier( id, "uniprot" ) );
+            if ( node.getNodeData().getTaxonomy().getIdentifier() == null || ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getIdentifier().getValue() ) ) {
+                node.getNodeData().getTaxonomy().setIdentifier( new Identifier( id, "uniprot" ) );
+                return id;
+            }
         }
         else {
             final String code = extractTaxonomyCodeFromNodeName( node.getName(), taxonomy_extraction );
@@ -319,8 +319,12 @@ public final class ParserUtils {
                 if ( !node.getNodeData().isHasTaxonomy() ) {
                     node.getNodeData().setTaxonomy( new Taxonomy() );
                 }
-                node.getNodeData().getTaxonomy().setTaxonomyCode( code );
+                if ( ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getTaxonomyCode() ) ) {
+                    node.getNodeData().getTaxonomy().setTaxonomyCode( code );
+                    return code;
+                }
             }
         }
+        return null;
     }
 }
index de9ab0c..4aa0fc2 100644 (file)
@@ -198,7 +198,7 @@ public final class Test {
             System.out.println( "failed." );
             failed++;
         }
-        System.out.print( "Taxonomy extraction: " );
+        System.out.print( "Taxonomy code extraction: " );
         if ( Test.testExtractTaxonomyCodeFromNodeName() ) {
             System.out.println( "OK." );
             succeeded++;
@@ -207,6 +207,15 @@ public final class Test {
             System.out.println( "failed." );
             failed++;
         }
+        System.out.print( "Taxonomy extraction (general): " );
+        if ( Test.testTaxonomyExtraction() ) {
+            System.out.println( "OK." );
+            succeeded++;
+        }
+        else {
+            System.out.println( "failed." );
+            failed++;
+        }
         System.out.print( "Basic node construction and parsing of NHX (node level): " );
         if ( Test.testNHXNodeParsing() ) {
             System.out.println( "OK." );
@@ -225,6 +234,7 @@ public final class Test {
             System.out.println( "failed." );
             failed++;
         }
+       
         System.out.print( "Conversion to NHX (node level): " );
         if ( Test.testNHXconversion() ) {
             System.out.println( "OK." );
@@ -3967,29 +3977,24 @@ public final class Test {
         return true;
     }
 
-    
     private static boolean testNodeRemoval() {
         try {
             final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
             final Phylogeny t0 = factory.create( "((a)b)", new NHXParser() )[ 0 ];
             PhylogenyMethods.removeNode( t0.getNode( "b" ), t0 );
-            
             if ( !t0.toNewHampshire().equals( "(a);" ) ) {
                 return false;
             }
             final Phylogeny t1 = factory.create( "((a:2)b:4)", new NHXParser() )[ 0 ];
             PhylogenyMethods.removeNode( t1.getNode( "b" ), t1 );
-            
             if ( !t1.toNewHampshire().equals( "(a:6.0);" ) ) {
                 return false;
             }
             final Phylogeny t2 = factory.create( "((a,b),c)", new NHXParser() )[ 0 ];
             PhylogenyMethods.removeNode( t2.getNode( "b" ), t2 );
-            
             if ( !t2.toNewHampshire().equals( "((a),c);" ) ) {
                 return false;
             }
-            
         }
         catch ( final Exception e ) {
             e.printStackTrace( System.out );
@@ -3997,7 +4002,7 @@ public final class Test {
         }
         return true;
     }
-    
+
     private static boolean testMidpointrooting() {
         try {
             final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
@@ -4882,6 +4887,81 @@ public final class Test {
         return true;
     }
 
+    private static boolean testTaxonomyExtraction() {
+        try {
+            final PhylogenyNode n0 = PhylogenyNode.createInstanceFromNhxString( "sd_12345678",
+                                                                                NHXParser.TAXONOMY_EXTRACTION.YES );
+            if ( n0.getNodeData().isHasTaxonomy() ) {
+                return false;
+            }
+            final PhylogenyNode n1 = PhylogenyNode.createInstanceFromNhxString( "sd_12345x",
+                                                                                NHXParser.TAXONOMY_EXTRACTION.YES );
+            if ( n1.getNodeData().isHasTaxonomy() ) {
+                System.out.println( n1.toString() );
+                return false;
+            }
+            final PhylogenyNode n2 = PhylogenyNode.createInstanceFromNhxString( "12345",
+                                                                                NHXParser.TAXONOMY_EXTRACTION.YES );
+            if ( !n2.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "12345" ) ) {
+                System.out.println( n2.toString() );
+                return false;
+            }
+            final PhylogenyNode n3 = PhylogenyNode.createInstanceFromNhxString( "blag_12345",
+                                                                                NHXParser.TAXONOMY_EXTRACTION.YES );
+            if ( !n3.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "12345" ) ) {
+                System.out.println( n3.toString() );
+                return false;
+            }
+            final PhylogenyNode n4 = PhylogenyNode.createInstanceFromNhxString( "blag-12345",
+                                                                                NHXParser.TAXONOMY_EXTRACTION.YES );
+            if ( n4.getNodeData().isHasTaxonomy() ) {
+                System.out.println( n4.toString() );
+                return false;
+            }
+            final PhylogenyNode n5 = PhylogenyNode.createInstanceFromNhxString( "12345-blag",
+                                                                                NHXParser.TAXONOMY_EXTRACTION.YES );
+            if ( n5.getNodeData().isHasTaxonomy() ) {
+                System.out.println( n5.toString() );
+                return false;
+            }
+            final PhylogenyNode n6 = PhylogenyNode.createInstanceFromNhxString( "blag-12345-blag",
+                                                                                NHXParser.TAXONOMY_EXTRACTION.YES );
+            if ( n6.getNodeData().isHasTaxonomy() ) {
+                System.out.println( n6.toString() );
+                return false;
+            }
+            final PhylogenyNode n7 = PhylogenyNode.createInstanceFromNhxString( "blag-12345_blag",
+                                                                                NHXParser.TAXONOMY_EXTRACTION.YES );
+            if ( n7.getNodeData().isHasTaxonomy() ) {
+                System.out.println( n7.toString() );
+                return false;
+            }
+            final PhylogenyNode n8 = PhylogenyNode.createInstanceFromNhxString( "blag_12345-blag",
+                                                                                NHXParser.TAXONOMY_EXTRACTION.YES );
+            if ( !n8.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "12345" ) ) {
+                System.out.println( n8.toString() );
+                return false;
+            }
+            final PhylogenyNode n9 = PhylogenyNode.createInstanceFromNhxString( "blag_12345_blag",
+                                                                                NHXParser.TAXONOMY_EXTRACTION.YES );
+            if ( !n9.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "12345" ) ) {
+                System.out.println( n9.toString() );
+                return false;
+            }
+            final PhylogenyNode n10 = PhylogenyNode.createInstanceFromNhxString( "blag_12X45-blag",
+                                                                                NHXParser.TAXONOMY_EXTRACTION.YES );
+            if ( !n10.getNodeData().getTaxonomy().getTaxonomyCode().equals( "12X45" ) ) {
+                System.out.println( n10.toString() );
+                return false;
+            }
+        }
+        catch ( final Exception e ) {
+            e.printStackTrace( System.out );
+            return false;
+        }
+        return true;
+    }
+
     private static boolean testNHXNodeParsing() {
         try {
             final PhylogenyNode n1 = new PhylogenyNode();
@@ -5092,7 +5172,7 @@ public final class Test {
             if ( !e2.getName().equals( "n10_RAT1" ) ) {
                 return false;
             }
-            if ( !PhylogenyMethods.getSpecies( e2 ).equals( "RAT" ) ) {
+            if ( PhylogenyMethods.getSpecies( e2 ).equals( "RAT" ) ) {
                 return false;
             }
             final PhylogenyNode e3 = PhylogenyNode.createInstanceFromNhxString( "n10_RAT~",
@@ -5229,10 +5309,10 @@ public final class Test {
             if ( PhylogenyMethods.getSpecies( n13 ).equals( "12345" ) ) {
                 return false;
             }
-            if ( !n13.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "12345" )  ) {
+            if ( !n13.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "12345" ) ) {
                 return false;
             }
-            if ( !n13.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" )  ) {
+            if ( !n13.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) {
                 return false;
             }
             final PhylogenyNode n14 = PhylogenyNode
@@ -5285,39 +5365,32 @@ public final class Test {
             if ( !isEqual( n18.getBranchData().getConfidence( 0 ).getValue(), 91 ) ) {
                 return false;
             }
-            
-            
-            //
-            final PhylogenyNode n19 = PhylogenyNode
-                    .createInstanceFromNhxString( "blah_1-roejojoej", NHXParser.TAXONOMY_EXTRACTION.YES );
-           
-          
-            if ( !n19.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "1" )  ) {
+            final PhylogenyNode n19 = PhylogenyNode.createInstanceFromNhxString( "blah_1-roejojoej",
+                                                                                 NHXParser.TAXONOMY_EXTRACTION.YES );
+            if ( !n19.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "1" ) ) {
                 return false;
             }
-            if ( !n19.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" )  ) {
+            if ( !n19.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) {
                 return false;
             }
-            final PhylogenyNode n30 = PhylogenyNode
-                    .createInstanceFromNhxString( "blah_1234567-roejojoej", NHXParser.TAXONOMY_EXTRACTION.YES );
-           
-          
-            if ( !n30.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "1234567" )  ) {
+            final PhylogenyNode n30 = PhylogenyNode.createInstanceFromNhxString( "blah_1234567-roejojoej",
+                                                                                 NHXParser.TAXONOMY_EXTRACTION.YES );
+            if ( !n30.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "1234567" ) ) {
                 return false;
             }
-            if ( !n30.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" )  ) {
+            if ( !n30.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" ) ) {
                 return false;
             }
-            final PhylogenyNode n31 = PhylogenyNode
-                    .createInstanceFromNhxString( "blah_12345678-roejojoej", NHXParser.TAXONOMY_EXTRACTION.YES );
-           
-          
-            if ( n31.getNodeData().isHasTaxonomy()  ) {
+            final PhylogenyNode n31 = PhylogenyNode.createInstanceFromNhxString( "blah_12345678-roejojoej",
+                                                                                 NHXParser.TAXONOMY_EXTRACTION.YES );
+            if ( n31.getNodeData().isHasTaxonomy() ) {
+                return false;
+            }
+            final PhylogenyNode n32 = PhylogenyNode.createInstanceFromNhxString( "sd_12345678",
+                                                                                 NHXParser.TAXONOMY_EXTRACTION.YES );
+            if ( n32.getNodeData().isHasTaxonomy() ) {
                 return false;
             }
-           // if ( !n31.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" )  ) {
-           //     return false;
-           // }
         }
         catch ( final Exception e ) {
             e.printStackTrace( System.out );