in progress
author cmzmasek <cmzmasek@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Tue, 25 Dec 2012 06:20:08 +0000 (06:20 +0000)
committer cmzmasek <cmzmasek@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Tue, 25 Dec 2012 06:20:08 +0000 (06:20 +0000)
forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java
forester/java/src/org/forester/archaeopteryx/webservices/WebserviceUtil.java
forester/java/src/org/forester/io/parsers/nexus/NexusPhylogeniesParser.java
forester/java/src/org/forester/io/parsers/nhx/NHXParser.java
forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlUtil.java
forester/java/src/org/forester/io/parsers/util/ParserUtils.java
forester/java/src/org/forester/phylogeny/PhylogenyMethods.java
forester/java/src/org/forester/phylogeny/data/Taxonomy.java
forester/java/src/org/forester/test/Test.java
forester/java/src/org/forester/util/ForesterConstants.java

index fbe73f0..db00321 100644 (file)
@@ -1880,11 +1880,14 @@ public final class MainFrameApplication extends MainFrame {
                     final PhylogenyNode n = it.next();
                     final String name = n.getName().trim();
                     if ( !ForesterUtil.isEmpty( name ) ) {
-                        final String code = ParserUtils
-                                .extractTaxonomyCodeFromNodeName( name, NHXParser.TAXONOMY_EXTRACTION.YES );
-                        if ( !ForesterUtil.isEmpty( code ) ) {
-                            PhylogenyMethods.setTaxonomyCode( n, code );
-                        }
+                        
+                        ParserUtils.extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.YES );
+                        
+                       // final String code = ParserUtils
+                       //         .extractTaxonomyCodeFromNodeName( name, NHXParser.TAXONOMY_EXTRACTION.YES );
+                       // if ( !ForesterUtil.isEmpty( code ) ) {
+                       //     PhylogenyMethods.setTaxonomyCode( n, code );
+                       // }
                     }
                 }
             }
index 86ff0d6..5ba492e 100644 (file)
@@ -170,7 +170,7 @@ public final class WebserviceUtil {
             final PhylogenyNode n = it.next();
             if ( n.isExternal() && n.getNodeData().isHasTaxonomy() ) {
                 final String name = n.getNodeData().getTaxonomy().getScientificName();
-                if ( !ForesterUtil.isEmpty( name ) && PhyloXmlUtil.TAXOMONY_CODE_PATTERN.matcher( name ).matches() ) {
+                if ( !ForesterUtil.isEmpty( name ) && PhyloXmlUtil.TAXOMONY_CODE_PATTERN_STRICT.matcher( name ).matches() ) {
                     n.getNodeData().getTaxonomy().setScientificName( "" );
                     n.getNodeData().getTaxonomy().setTaxonomyCode( name );
                 }
index f104de3..5760331 100644 (file)
@@ -256,14 +256,17 @@ public class NexusPhylogeniesParser implements PhylogenyParser {
                     }
                 }
                 if ( !isReplaceUnderscores() && ( ( getTaxonomyExtraction() != TAXONOMY_EXTRACTION.NO ) ) ) {
-                    final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node.getName(),
-                                                                                    getTaxonomyExtraction() );
-                    if ( !ForesterUtil.isEmpty( tax ) ) {
-                        if ( !node.getNodeData().isHasTaxonomy() ) {
-                            node.getNodeData().setTaxonomy( new Taxonomy() );
-                        }
-                        node.getNodeData().getTaxonomy().setTaxonomyCode( tax );
-                    }
+                  
+                    ParserUtils.extractTaxonomyDataFromNodeName( node,  getTaxonomyExtraction() );
+                    
+//                    final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node.getName(),
+//                                                                                    getTaxonomyExtraction() );
+//                    if ( !ForesterUtil.isEmpty( tax ) ) {
+//                        if ( !node.getNodeData().isHasTaxonomy() ) {
+//                            node.getNodeData().setTaxonomy( new Taxonomy() );
+//                        }
+//                        node.getNodeData().getTaxonomy().setTaxonomyCode( tax );
+//                    }
                 }
             }
         }
index 92935cd..09418fa 100644 (file)
@@ -649,14 +649,7 @@ public final class NHXParser implements PhylogenyParser {
                 if ( !s.startsWith( ":" ) ) {
                     node_to_annotate.setName( t.nextToken() );
                     if ( !replace_underscores && ( !is_nhx && ( taxonomy_extraction != TAXONOMY_EXTRACTION.NO ) ) ) {
-                        final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node_to_annotate.getName(),
-                                                                                        taxonomy_extraction );
-                        if ( !ForesterUtil.isEmpty( tax ) ) {
-                            if ( !node_to_annotate.getNodeData().isHasTaxonomy() ) {
-                                node_to_annotate.getNodeData().setTaxonomy( new Taxonomy() );
-                            }
-                            node_to_annotate.getNodeData().getTaxonomy().setTaxonomyCode( tax );
-                        }
+                        ParserUtils.extractTaxonomyDataFromNodeName( node_to_annotate, taxonomy_extraction );
                     }
                 }
                 while ( t.hasMoreTokens() ) {
index 5083c91..0adb7fa 100644 (file)
@@ -31,12 +31,15 @@ import java.util.List;
 import java.util.Set;
 import java.util.regex.Pattern;
 
+import org.forester.io.parsers.util.ParserUtils;
+
 public final class PhyloXmlUtil {
 
     public static final String       OTHER                                      = "other";
     public static final String       UNKNOWN                                    = "unknown";
     public final static Pattern      SEQUENCE_SYMBOL_PATTERN                    = Pattern.compile( "\\S{1,20}" );
-    public final static Pattern      TAXOMONY_CODE_PATTERN                      = Pattern.compile( "[A-Z0-9]{3,5}" );
+    public final static Pattern      TAXOMONY_CODE_PATTERN_STRICT               = ParserUtils.TAXOMONY_CODE_PATTERN_1;
+    public final static Pattern      TAXOMONY_CODE_PATTERN_LAX                  = Pattern.compile( "[A-Z0-9]{3,6}" );
     public final static Pattern      LIT_REF_DOI_PATTERN                        = Pattern
                                                                                         .compile( "[a-zA-Z0-9_\\.]+\\S+" );
     public final static Set<String>  SEQUENCE_TYPES                             = new HashSet<String>();
index 4baf7ce..db628cf 100644 (file)
@@ -42,20 +42,31 @@ import org.forester.io.parsers.PhylogenyParser;
 import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
 import org.forester.io.parsers.nhx.NHXParser;
 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
+import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
 import org.forester.io.parsers.tol.TolParser;
 import org.forester.phylogeny.Phylogeny;
 import org.forester.phylogeny.PhylogenyMethods;
+import org.forester.phylogeny.PhylogenyNode;
+import org.forester.phylogeny.data.Identifier;
+import org.forester.phylogeny.data.Taxonomy;
 import org.forester.util.ForesterConstants;
 import org.forester.util.ForesterUtil;
 
 public final class ParserUtils {
 
-    final private static Pattern TAXOMONY_CODE_PATTERN_1  = Pattern.compile( "[A-Z0-9]{5}|RAT|PIG|PEA|CAP" );
+    final public static Pattern TAXOMONY_CODE_PATTERN_1  = Pattern.compile( "[A-Z0-9]{5}|RAT|PIG|PEA|CAP" );
     final private static Pattern TAXOMONY_CODE_PATTERN_2  = Pattern
                                                                   .compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)[^A-Za-z].*" );
     final private static Pattern TAXOMONY_CODE_PATTERN_PF = Pattern.compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)/\\d+-\\d+" );
 
+    
+    final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_1  = Pattern.compile( "\\d{1,7}" );
+    final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_2  = Pattern
+                                                                  .compile( "(\\d{1,7})[^A-Za-z].*" );
+    final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PF = Pattern.compile( "(\\d{1,7})/\\d+-\\d+" );
+
+    
     final public static PhylogenyParser createParserDependingFileContents( final File file,
                                                                            final boolean phyloxml_validate_against_xsd )
             throws FileNotFoundException, IOException {
@@ -247,6 +258,42 @@ public final class ParserUtils {
         }
         return null;
     }
+    
+    public final static String extractUniprotTaxonomyIdFromNodeName( final String name,
+                                                                final TAXONOMY_EXTRACTION taxonomy_extraction ) {
+        if ( ( name.indexOf( "_" ) > 0 )
+                && ( ( taxonomy_extraction != TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ) || ( name.indexOf( "/" ) > 4 ) ) ) {
+            final String[] s = name.split( "[_\\s]" );
+            if ( s.length > 1 ) {
+                final String str = s[ 1 ];
+                if ( !ForesterUtil.isEmpty( str ) ) {
+                    if ( taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ) {
+                        final Matcher m = TAXOMONY_UNIPROT_ID_PATTERN_PF.matcher( str );
+                        if ( m.matches() ) {
+                            return m.group( 1 );
+                        }
+                    }
+                    else {
+                        final Matcher m1 = TAXOMONY_UNIPROT_ID_PATTERN_1.matcher( str );
+                        if ( m1.matches() ) {
+                            return m1.group();
+                        }
+                        final Matcher m2 = TAXOMONY_UNIPROT_ID_PATTERN_2.matcher( str );
+                        if ( m2.matches() ) {
+                            return m2.group( 1 );
+                        }
+                    }
+                }
+            }
+        }
+        else if ( taxonomy_extraction == TAXONOMY_EXTRACTION.YES ) {
+            final Matcher m1 = TAXOMONY_UNIPROT_ID_PATTERN_1.matcher( name );
+            if ( m1.matches() ) {
+                return name;
+            }
+        }
+        return null;
+    }
 
     public final static Phylogeny[] readPhylogenies( final File file ) throws FileNotFoundException, IOException {
         return PhylogenyMethods.readPhylogenies( ParserUtils.createParserDependingOnFileType( file, true ), file );
@@ -255,4 +302,25 @@ public final class ParserUtils {
     public final static Phylogeny[] readPhylogenies( final String file_name ) throws FileNotFoundException, IOException {
         return readPhylogenies( new File( file_name ) );
     }
+
+    public final static void extractTaxonomyDataFromNodeName( final PhylogenyNode node,
+                                                              final NHXParser.TAXONOMY_EXTRACTION taxonomy_extraction )
+            throws PhyloXmlDataFormatException {
+        final String id = extractUniprotTaxonomyIdFromNodeName( node.getName(), taxonomy_extraction );
+        if ( !ForesterUtil.isEmpty( id ) ) {
+            if ( !node.getNodeData().isHasTaxonomy() ) {
+                node.getNodeData().setTaxonomy( new Taxonomy() );
+            }
+            node.getNodeData().getTaxonomy().setIdentifier( new Identifier( id, "uniprot" ) );
+        }
+        else {
+            final String code = extractTaxonomyCodeFromNodeName( node.getName(), taxonomy_extraction );
+            if ( !ForesterUtil.isEmpty( code ) ) {
+                if ( !node.getNodeData().isHasTaxonomy() ) {
+                    node.getNodeData().setTaxonomy( new Taxonomy() );
+                }
+                node.getNodeData().getTaxonomy().setTaxonomyCode( code );
+            }
+        }
+    }
 }
index d48db8d..b2b57d4 100644 (file)
@@ -408,7 +408,7 @@ public class PhylogenyMethods {
         final ArrayList<PhylogenyNode> to_delete = new ArrayList<PhylogenyNode>();
         for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) {
             final PhylogenyNode n = iter.next();
-            if ( ( !n.isExternal() ) && ( !n.isRoot() ) && ( n.getNumberOfDescendants() == 1 ) ) {
+            if ( ( !n.isExternal() )  && ( n.getNumberOfDescendants() == 1 ) ) {
                 to_delete.add( n );
             }
         }
index 544522b..4f6d146 100644 (file)
@@ -34,10 +34,13 @@ import org.forester.io.parsers.nhx.NHXtags;
 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
 import org.forester.io.parsers.phyloxml.PhyloXmlMapping;
 import org.forester.io.parsers.phyloxml.PhyloXmlUtil;
+import org.forester.util.ForesterConstants;
 import org.forester.util.ForesterUtil;
 
 public class Taxonomy implements PhylogenyData, MultipleUris, Comparable<Taxonomy> {
 
+    
+    
     private String       _scientific_name;
     private String       _common_name;
     private List<String> _synonyms;
@@ -326,9 +329,17 @@ public class Taxonomy implements PhylogenyData, MultipleUris, Comparable<Taxonom
     }
 
     public void setTaxonomyCode( final String taxonomy_code ) throws PhyloXmlDataFormatException {
-        if ( !ForesterUtil.isEmpty( taxonomy_code )
-                && !PhyloXmlUtil.TAXOMONY_CODE_PATTERN.matcher( taxonomy_code ).matches() ) {
-            throw new PhyloXmlDataFormatException( "illegal taxonomy code: [" + taxonomy_code + "]" );
+        if ( ForesterConstants.TAXONOMY_CODE_STRICT ) {
+            if ( !ForesterUtil.isEmpty( taxonomy_code )
+                    && !PhyloXmlUtil.TAXOMONY_CODE_PATTERN_STRICT.matcher( taxonomy_code ).matches() ) {
+                throw new PhyloXmlDataFormatException( "illegal taxonomy code: [" + taxonomy_code + "]" );
+            }
+        }
+        else {
+            if ( !ForesterUtil.isEmpty( taxonomy_code )
+                    && !PhyloXmlUtil.TAXOMONY_CODE_PATTERN_LAX.matcher( taxonomy_code ).matches() ) {
+                throw new PhyloXmlDataFormatException( "illegal taxonomy code: [" + taxonomy_code + "]" );
+            }
         }
         _taxonomy_code = taxonomy_code;
     }
index 04c868e..de9ab0c 100644 (file)
@@ -5226,7 +5226,13 @@ public final class Test {
             if ( !n13.getName().equals( "blah_12345/1-2" ) ) {
                 return false;
             }
-            if ( !PhylogenyMethods.getSpecies( n13 ).equals( "12345" ) ) {
+            if ( PhylogenyMethods.getSpecies( n13 ).equals( "12345" ) ) {
+                return false;
+            }
+            if ( !n13.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "12345" )  ) {
+                return false;
+            }
+            if ( !n13.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" )  ) {
                 return false;
             }
             final PhylogenyNode n14 = PhylogenyNode
@@ -5279,6 +5285,39 @@ public final class Test {
             if ( !isEqual( n18.getBranchData().getConfidence( 0 ).getValue(), 91 ) ) {
                 return false;
             }
+            
+            
+            //
+            final PhylogenyNode n19 = PhylogenyNode
+                    .createInstanceFromNhxString( "blah_1-roejojoej", NHXParser.TAXONOMY_EXTRACTION.YES );
+           
+          
+            if ( !n19.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "1" )  ) {
+                return false;
+            }
+            if ( !n19.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" )  ) {
+                return false;
+            }
+            final PhylogenyNode n30 = PhylogenyNode
+                    .createInstanceFromNhxString( "blah_1234567-roejojoej", NHXParser.TAXONOMY_EXTRACTION.YES );
+           
+          
+            if ( !n30.getNodeData().getTaxonomy().getIdentifier().getValue().equals( "1234567" )  ) {
+                return false;
+            }
+            if ( !n30.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" )  ) {
+                return false;
+            }
+            final PhylogenyNode n31 = PhylogenyNode
+                    .createInstanceFromNhxString( "blah_12345678-roejojoej", NHXParser.TAXONOMY_EXTRACTION.YES );
+           
+          
+            if ( n31.getNodeData().isHasTaxonomy()  ) {
+                return false;
+            }
+           // if ( !n31.getNodeData().getTaxonomy().getIdentifier().getProvider().equals( "uniprot" )  ) {
+           //     return false;
+           // }
         }
         catch ( final Exception e ) {
             e.printStackTrace( System.out );
index a096bf5..f030c91 100644 (file)
@@ -38,7 +38,9 @@ public final class ForesterConstants {
     public final static String  UTF8                        = "UTF-8";
     public final static String  PHYLO_XML_REFERENCE         = "Han MV and Zmasek CM (2009): \"phyloXML: XML for evolutionary biology and comparative genomics\", BMC Bioinformatics 10:356";
     public final static boolean RELEASE                     = false;
-
+    public final static boolean TAXONOMY_CODE_STRICT        = true;
+    
+    
     public enum PhylogeneticTreeFormats {
         NH, NHX, NEXUS, PHYLOXML
     }