From ad896f6e18663a95f887e00ea9c24f73f3d387e6 Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Tue, 25 Dec 2012 06:20:08 +0000 Subject: [PATCH] in progress --- .../archaeopteryx/MainFrameApplication.java | 13 ++-- .../archaeopteryx/webservices/WebserviceUtil.java | 2 +- .../io/parsers/nexus/NexusPhylogeniesParser.java | 19 +++--- .../src/org/forester/io/parsers/nhx/NHXParser.java | 9 +-- .../forester/io/parsers/phyloxml/PhyloXmlUtil.java | 5 +- .../org/forester/io/parsers/util/ParserUtils.java | 70 +++++++++++++++++++- .../org/forester/phylogeny/PhylogenyMethods.java | 2 +- .../src/org/forester/phylogeny/data/Taxonomy.java | 17 ++++- forester/java/src/org/forester/test/Test.java | 41 +++++++++++- .../src/org/forester/util/ForesterConstants.java | 4 +- 10 files changed, 152 insertions(+), 30 deletions(-) diff --git a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java index fbe73f0..db00321 100644 --- a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java +++ b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java @@ -1880,11 +1880,14 @@ public final class MainFrameApplication extends MainFrame { final PhylogenyNode n = it.next(); final String name = n.getName().trim(); if ( !ForesterUtil.isEmpty( name ) ) { - final String code = ParserUtils - .extractTaxonomyCodeFromNodeName( name, NHXParser.TAXONOMY_EXTRACTION.YES ); - if ( !ForesterUtil.isEmpty( code ) ) { - PhylogenyMethods.setTaxonomyCode( n, code ); - } + + ParserUtils.extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.YES ); + + // final String code = ParserUtils + // .extractTaxonomyCodeFromNodeName( name, NHXParser.TAXONOMY_EXTRACTION.YES ); + // if ( !ForesterUtil.isEmpty( code ) ) { + // PhylogenyMethods.setTaxonomyCode( n, code ); + // } } } } diff --git a/forester/java/src/org/forester/archaeopteryx/webservices/WebserviceUtil.java b/forester/java/src/org/forester/archaeopteryx/webservices/WebserviceUtil.java index 86ff0d6..5ba492e 100644 --- a/forester/java/src/org/forester/archaeopteryx/webservices/WebserviceUtil.java +++ b/forester/java/src/org/forester/archaeopteryx/webservices/WebserviceUtil.java @@ -170,7 +170,7 @@ public final class WebserviceUtil { final PhylogenyNode n = it.next(); if ( n.isExternal() && n.getNodeData().isHasTaxonomy() ) { final String name = n.getNodeData().getTaxonomy().getScientificName(); - if ( !ForesterUtil.isEmpty( name ) && PhyloXmlUtil.TAXOMONY_CODE_PATTERN.matcher( name ).matches() ) { + if ( !ForesterUtil.isEmpty( name ) && PhyloXmlUtil.TAXOMONY_CODE_PATTERN_STRICT.matcher( name ).matches() ) { n.getNodeData().getTaxonomy().setScientificName( "" ); n.getNodeData().getTaxonomy().setTaxonomyCode( name ); } diff --git a/forester/java/src/org/forester/io/parsers/nexus/NexusPhylogeniesParser.java b/forester/java/src/org/forester/io/parsers/nexus/NexusPhylogeniesParser.java index f104de3..5760331 100644 --- a/forester/java/src/org/forester/io/parsers/nexus/NexusPhylogeniesParser.java +++ b/forester/java/src/org/forester/io/parsers/nexus/NexusPhylogeniesParser.java @@ -256,14 +256,17 @@ public class NexusPhylogeniesParser implements PhylogenyParser { } } if ( !isReplaceUnderscores() && ( ( getTaxonomyExtraction() != TAXONOMY_EXTRACTION.NO ) ) ) { - final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node.getName(), - getTaxonomyExtraction() ); - if ( !ForesterUtil.isEmpty( tax ) ) { - if ( !node.getNodeData().isHasTaxonomy() ) { - node.getNodeData().setTaxonomy( new Taxonomy() ); - } - node.getNodeData().getTaxonomy().setTaxonomyCode( tax ); - } + + ParserUtils.extractTaxonomyDataFromNodeName( node, getTaxonomyExtraction() ); + +// final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node.getName(), +// getTaxonomyExtraction() ); +// if ( !ForesterUtil.isEmpty( tax ) ) { +// if ( !node.getNodeData().isHasTaxonomy() ) { +// node.getNodeData().setTaxonomy( new Taxonomy() ); +// } +// node.getNodeData().getTaxonomy().setTaxonomyCode( tax ); +// } } } } diff --git a/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java b/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java index 92935cd..09418fa 100644 --- a/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java +++ b/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java @@ -649,14 +649,7 @@ public final class NHXParser implements PhylogenyParser { if ( !s.startsWith( ":" ) ) { node_to_annotate.setName( t.nextToken() ); if ( !replace_underscores && ( !is_nhx && ( taxonomy_extraction != TAXONOMY_EXTRACTION.NO ) ) ) { - final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node_to_annotate.getName(), - taxonomy_extraction ); - if ( !ForesterUtil.isEmpty( tax ) ) { - if ( !node_to_annotate.getNodeData().isHasTaxonomy() ) { - node_to_annotate.getNodeData().setTaxonomy( new Taxonomy() ); - } - node_to_annotate.getNodeData().getTaxonomy().setTaxonomyCode( tax ); - } + ParserUtils.extractTaxonomyDataFromNodeName( node_to_annotate, taxonomy_extraction ); } } while ( t.hasMoreTokens() ) { diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlUtil.java b/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlUtil.java index 5083c91..0adb7fa 100644 --- a/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlUtil.java +++ b/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlUtil.java @@ -31,12 +31,15 @@ import java.util.List; import java.util.Set; import java.util.regex.Pattern; +import org.forester.io.parsers.util.ParserUtils; + public final class PhyloXmlUtil { public static final String OTHER = "other"; public static final String UNKNOWN = "unknown"; public final static Pattern SEQUENCE_SYMBOL_PATTERN = Pattern.compile( "\\S{1,20}" ); - public final static Pattern TAXOMONY_CODE_PATTERN = Pattern.compile( "[A-Z0-9]{3,5}" ); + public final static Pattern TAXOMONY_CODE_PATTERN_STRICT = ParserUtils.TAXOMONY_CODE_PATTERN_1; + public final static Pattern TAXOMONY_CODE_PATTERN_LAX = Pattern.compile( "[A-Z0-9]{3,6}" ); public final static Pattern LIT_REF_DOI_PATTERN = Pattern .compile( "[a-zA-Z0-9_\\.]+\\S+" ); public final static Set SEQUENCE_TYPES = new HashSet(); diff --git a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java index 4baf7ce..db628cf 100644 --- a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java +++ b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java @@ -42,20 +42,31 @@ import org.forester.io.parsers.PhylogenyParser; import org.forester.io.parsers.nexus.NexusPhylogeniesParser; import org.forester.io.parsers.nhx.NHXParser; import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION; +import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; import org.forester.io.parsers.phyloxml.PhyloXmlParser; import org.forester.io.parsers.tol.TolParser; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.data.Taxonomy; import org.forester.util.ForesterConstants; import org.forester.util.ForesterUtil; public final class ParserUtils { - final private static Pattern TAXOMONY_CODE_PATTERN_1 = Pattern.compile( "[A-Z0-9]{5}|RAT|PIG|PEA|CAP" ); + final public static Pattern TAXOMONY_CODE_PATTERN_1 = Pattern.compile( "[A-Z0-9]{5}|RAT|PIG|PEA|CAP" ); final private static Pattern TAXOMONY_CODE_PATTERN_2 = Pattern .compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)[^A-Za-z].*" ); final private static Pattern TAXOMONY_CODE_PATTERN_PF = Pattern.compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)/\\d+-\\d+" ); + + final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_1 = Pattern.compile( "\\d{1,7}" ); + final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_2 = Pattern + .compile( "(\\d{1,7})[^A-Za-z].*" ); + final private static Pattern TAXOMONY_UNIPROT_ID_PATTERN_PF = Pattern.compile( "(\\d{1,7})/\\d+-\\d+" ); + + final public static PhylogenyParser createParserDependingFileContents( final File file, final boolean phyloxml_validate_against_xsd ) throws FileNotFoundException, IOException { @@ -247,6 +258,42 @@ public final class ParserUtils { } return null; } + + public final static String extractUniprotTaxonomyIdFromNodeName( final String name, + final TAXONOMY_EXTRACTION taxonomy_extraction ) { + if ( ( name.indexOf( "_" ) > 0 ) + && ( ( taxonomy_extraction != TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ) || ( name.indexOf( "/" ) > 4 ) ) ) { + final String[] s = name.split( "[_\\s]" ); + if ( s.length > 1 ) { + final String str = s[ 1 ]; + if ( !ForesterUtil.isEmpty( str ) ) { + if ( taxonomy_extraction == TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ) { + final Matcher m = TAXOMONY_UNIPROT_ID_PATTERN_PF.matcher( str ); + if ( m.matches() ) { + return m.group( 1 ); + } + } + else { + final Matcher m1 = TAXOMONY_UNIPROT_ID_PATTERN_1.matcher( str ); + if ( m1.matches() ) { + return m1.group(); + } + final Matcher m2 = TAXOMONY_UNIPROT_ID_PATTERN_2.matcher( str ); + if ( m2.matches() ) { + return m2.group( 1 ); + } + } + } + } + } + else if ( taxonomy_extraction == TAXONOMY_EXTRACTION.YES ) { + final Matcher m1 = TAXOMONY_UNIPROT_ID_PATTERN_1.matcher( name ); + if ( m1.matches() ) { + return name; + } + } + return null; + } public final static Phylogeny[] readPhylogenies( final File file ) throws FileNotFoundException, IOException { return PhylogenyMethods.readPhylogenies( ParserUtils.createParserDependingOnFileType( file, true ), file ); @@ -255,4 +302,25 @@ public final class ParserUtils { public final static Phylogeny[] readPhylogenies( final String file_name ) throws FileNotFoundException, IOException { return readPhylogenies( new File( file_name ) ); } + + public final static void extractTaxonomyDataFromNodeName( final PhylogenyNode node, + final NHXParser.TAXONOMY_EXTRACTION taxonomy_extraction ) + throws PhyloXmlDataFormatException { + final String id = extractUniprotTaxonomyIdFromNodeName( node.getName(), taxonomy_extraction ); + if ( !ForesterUtil.isEmpty( id ) ) { + if ( !node.getNodeData().isHasTaxonomy() ) { + node.getNodeData().setTaxonomy( new Taxonomy() ); + } + node.getNodeData().getTaxonomy().setIdentifier( new Identifier( id, "uniprot" ) ); + } + else { + final String code = extractTaxonomyCodeFromNodeName( node.getName(), taxonomy_extraction ); + if ( !ForesterUtil.isEmpty( code ) ) { + if ( !node.getNodeData().isHasTaxonomy() ) { + node.getNodeData().setTaxonomy( new Taxonomy() ); + } + node.getNodeData().getTaxonomy().setTaxonomyCode( code ); + } + } + } } diff --git a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java index d48db8d..b2b57d4 100644 --- a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java +++ b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java @@ -408,7 +408,7 @@ public class PhylogenyMethods { final ArrayList to_delete = new ArrayList(); for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) { final PhylogenyNode n = iter.next(); - if ( ( !n.isExternal() ) && ( !n.isRoot() ) && ( n.getNumberOfDescendants() == 1 ) ) { + if ( ( !n.isExternal() ) && ( n.getNumberOfDescendants() == 1 ) ) { to_delete.add( n ); } } diff --git a/forester/java/src/org/forester/phylogeny/data/Taxonomy.java b/forester/java/src/org/forester/phylogeny/data/Taxonomy.java index 544522b..4f6d146 100644 --- a/forester/java/src/org/forester/phylogeny/data/Taxonomy.java +++ b/forester/java/src/org/forester/phylogeny/data/Taxonomy.java @@ -34,10 +34,13 @@ import org.forester.io.parsers.nhx.NHXtags; import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; import org.forester.io.parsers.phyloxml.PhyloXmlMapping; import org.forester.io.parsers.phyloxml.PhyloXmlUtil; +import org.forester.util.ForesterConstants; import org.forester.util.ForesterUtil; public class Taxonomy implements PhylogenyData, MultipleUris, Comparable { + + private String _scientific_name; private String _common_name; private List _synonyms; @@ -326,9 +329,17 @@ public class Taxonomy implements PhylogenyData, MultipleUris, Comparable