From 6726c4b54521e8f4f1ca3e293124776a507cbdfa Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Tue, 8 Apr 2014 02:31:55 +0000 Subject: [PATCH] inprogress --- .../src/org/forester/archaeopteryx/AptxUtil.java | 29 ------ .../org/forester/archaeopteryx/UrlTreeReader.java | 7 +- .../archaeopteryx/webservices/WebserviceUtil.java | 106 ++++++++++++++++---- .../webservices/WebservicesManager.java | 2 +- .../io/parsers/nexus/NexusPhylogeniesParser.java | 94 +++++++++++------ .../org/forester/phylogeny/PhylogenyMethods.java | 43 ++++++-- .../src/org/forester/phylogeny/data/Accession.java | 4 +- forester/java/src/org/forester/test/Test.java | 105 +++++++++++++++++++ .../org/forester/util/SequenceAccessionTools.java | 13 +++ 9 files changed, 310 insertions(+), 93 deletions(-) diff --git a/forester/java/src/org/forester/archaeopteryx/AptxUtil.java b/forester/java/src/org/forester/archaeopteryx/AptxUtil.java index 73c2c4f..0e217c4 100644 --- a/forester/java/src/org/forester/archaeopteryx/AptxUtil.java +++ b/forester/java/src/org/forester/archaeopteryx/AptxUtil.java @@ -48,8 +48,6 @@ import java.util.Locale; import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import javax.imageio.IIOImage; import javax.imageio.ImageIO; @@ -71,7 +69,6 @@ import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyMethods.DESCENDANT_SORT_PRIORITY; import org.forester.phylogeny.PhylogenyNode; -import org.forester.phylogeny.data.Accession; import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.factories.PhylogenyFactory; @@ -84,10 +81,6 @@ public final class AptxUtil { private final static String[] AVAILABLE_FONT_FAMILIES_SORTED = GraphicsEnvironment.getLocalGraphicsEnvironment() .getAvailableFontFamilyNames(); - private final static Pattern seq_identifier_pattern_1 = Pattern - .compile( "^([A-Za-z]{2,5})[|=:]([0-9A-Za-z_\\.]{5,40})\\s*$" ); - private final static Pattern seq_identifier_pattern_2 = Pattern - .compile( "^([A-Za-z]{2,5})[|=:]([0-9A-Za-z_\\.]{5,40})[|,; ].*$" ); static { Arrays.sort( AVAILABLE_FONT_FAMILIES_SORTED ); } @@ -191,28 +184,6 @@ public final class AptxUtil { return tax_set; } - public final static Accession obtainSequenceAccessionFromName( final String sequence_name ) { - final String n = sequence_name.trim(); - final Matcher matcher1 = seq_identifier_pattern_1.matcher( n ); - String group1 = ""; - String group2 = ""; - if ( matcher1.matches() ) { - group1 = matcher1.group( 1 ); - group2 = matcher1.group( 2 ); - } - else { - final Matcher matcher2 = seq_identifier_pattern_2.matcher( n ); - if ( matcher2.matches() ) { - group1 = matcher2.group( 1 ); - group2 = matcher2.group( 2 ); - } - } - if ( ForesterUtil.isEmpty( group1 ) || ForesterUtil.isEmpty( group2 ) ) { - return null; - } - return new Accession( group2, group1 ); - } - public final static void printWarningMessage( final String name, final String message ) { System.out.println( "[" + name + "] > " + message ); } diff --git a/forester/java/src/org/forester/archaeopteryx/UrlTreeReader.java b/forester/java/src/org/forester/archaeopteryx/UrlTreeReader.java index dad02df..caf9ad8 100644 --- a/forester/java/src/org/forester/archaeopteryx/UrlTreeReader.java +++ b/forester/java/src/org/forester/archaeopteryx/UrlTreeReader.java @@ -107,7 +107,12 @@ public class UrlTreeReader implements Runnable { parser = new NexusPhylogeniesParser(); ( ( NexusPhylogeniesParser ) parser ).setReplaceUnderscores( true ); break; - case TREEBASE: + case TREEBASE_TREE: + parser = new NexusPhylogeniesParser(); + ( ( NexusPhylogeniesParser ) parser ).setReplaceUnderscores( true ); + ( ( NexusPhylogeniesParser ) parser ).setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.NO ); + break; + case TREEBASE_STUDY: parser = new NexusPhylogeniesParser(); ( ( NexusPhylogeniesParser ) parser ).setReplaceUnderscores( true ); ( ( NexusPhylogeniesParser ) parser ).setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.NO ); diff --git a/forester/java/src/org/forester/archaeopteryx/webservices/WebserviceUtil.java b/forester/java/src/org/forester/archaeopteryx/webservices/WebserviceUtil.java index 18bc84f..5efee81 100644 --- a/forester/java/src/org/forester/archaeopteryx/webservices/WebserviceUtil.java +++ b/forester/java/src/org/forester/archaeopteryx/webservices/WebserviceUtil.java @@ -35,21 +35,25 @@ import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.data.Accession; import org.forester.phylogeny.data.Identifier; import org.forester.phylogeny.data.Sequence; +import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.phylogeny.iterators.PreorderTreeIterator; import org.forester.util.ForesterUtil; +import org.forester.util.SequenceAccessionTools; public final class WebserviceUtil { - public static final String TREE_FAM_INST = "tree_fam"; public static final String PFAM_INST = "pfam"; + public static final String PFAM_NAME = "Pfam"; + public static final String PFAM_SERVER = "http://pfam.janelia.org"; + public static final String TOL_NAME = "Tree of Life"; public static final String TOL_WEBSERVER = "http://tolweb.org/onlinecontributors/app?service=external&page=xml/TreeStructureService&node_id=" + PhylogeniesWebserviceClient.QUERY_PLACEHOLDER; - public static final String TOL_NAME = "Tree of Life"; + public static final String TREE_BASE_DESC = "This data set was downloaded from TreeBASE, a relational database of phylogenetic knowledge. TreeBASE has been supported by the NSF, Harvard University, Yale University, SDSC and UC Davis. Please do not remove this acknowledgment."; + public static final String TREE_BASE_INST = "treebase"; public static final String TREE_BASE_NAME = "TreeBASE"; + public static final String TREE_FAM_INST = "tree_fam"; public static final String TREE_FAM_NAME = "TreeFam"; - public static final String PFAM_NAME = "Pfam"; - public static final String PFAM_SERVER = "http://pfam.janelia.org"; public static List createDefaultClients() { final List clients = new ArrayList(); @@ -65,17 +69,29 @@ public final class WebserviceUtil { "http://tolweb.org", null ) ); clients.add( new BasicPhylogeniesWebserviceClient( TREE_BASE_NAME, + "Read Tree(s) from TreeBASE Study...", + "Use TreeBASE to obtain evolutionary tree(s) from a study", + "Please enter a TreeBASE study (\"S\") identifier (without the \"S\")\n(Examples: 15613, 15632, 14525, 14909)", + WsPhylogenyFormat.TREEBASE_STUDY, + null, + "http://purl.org/phylo/treebase/phylows/study/TB2:S" + + PhylogeniesWebserviceClient.QUERY_PLACEHOLDER + + "?format=nexus", + true, + "http://www.treebase.org", + TREE_BASE_INST ) ); + clients.add( new BasicPhylogeniesWebserviceClient( TREE_BASE_NAME, "Read Tree from TreeBASE...", "Use TreeBASE to obtain a evolutionary tree", - "Please enter a TreeBASE tree identifier\n(Examples: 2654, 825, 4931, 2518, 2406, 4934)", - WsPhylogenyFormat.TREEBASE, + "Please enter a TreeBASE tree (\"Tr\") identifier (without the \"Tr\")\n(Examples: 422, 2654, 825, 4931, 2518, 2406, 4934)", + WsPhylogenyFormat.TREEBASE_TREE, null, "http://purl.org/phylo/treebase/phylows/tree/TB2:Tr" + PhylogeniesWebserviceClient.QUERY_PLACEHOLDER + "?format=nexus", true, "http://www.treebase.org", - null ) ); + TREE_BASE_INST ) ); clients.add( new BasicPhylogeniesWebserviceClient( PFAM_NAME, "Read Gene Tree from Pfam...", "Use Pfam to obtain gene trees for seed alignments", @@ -103,6 +119,23 @@ public final class WebserviceUtil { return clients; } + public static void processInstructions( final PhylogeniesWebserviceClient client, final Phylogeny phylogeny ) + throws PhyloXmlDataFormatException { + if ( client.getProcessingInstructions().equals( WebserviceUtil.TREE_FAM_INST ) ) { + WebserviceUtil.processTreeFamTrees( phylogeny ); + } + else if ( client.getProcessingInstructions().equals( WebserviceUtil.PFAM_INST ) ) { + WebserviceUtil.extractSpTremblAccFromNodeName( phylogeny, "sptrembl" ); + PhylogenyMethods.transferInternalNodeNamesToConfidence( phylogeny, "bootstrap" ); + } + else if ( client.getProcessingInstructions().equals( WebserviceUtil.TREE_BASE_INST ) ) { + if ( PhylogenyMethods.isInternalNamesLookLikeConfidences( phylogeny ) ) { + PhylogenyMethods.transferInternalNodeNamesToConfidence( phylogeny, "" ); + } + WebserviceUtil.processTreeBaseTrees( phylogeny ); + } + } + static void extractSpTremblAccFromNodeName( final Phylogeny phy, final String source ) { final PreorderTreeIterator it = new PreorderTreeIterator( phy ); while ( it.hasNext() ) { @@ -123,15 +156,24 @@ public final class WebserviceUtil { } } - public static void processInstructions( final PhylogeniesWebserviceClient client, final Phylogeny phylogeny ) - throws PhyloXmlDataFormatException { - if ( client.getProcessingInstructions().equals( WebserviceUtil.TREE_FAM_INST ) ) { - - WebserviceUtil.processTreeFamTrees( phylogeny ); + static void processTreeBaseTrees( final Phylogeny phy ) { + phy.setDescription( TREE_BASE_DESC ); + final PhylogenyNodeIterator it = phy.iteratorExternalForward(); + while ( it.hasNext() ) { + final PhylogenyNode n = it.next(); + if ( !ForesterUtil.isEmpty( n.getName() ) ) { + final Accession acc = SequenceAccessionTools.parseAccessorFromString( n.getName() ); + if ( acc != null ) { + if ( !n.getNodeData().isHasSequence() ) { + n.getNodeData().addSequence( new Sequence() ); + } + final Sequence s = n.getNodeData().getSequence(); + if ( s.getAccession() == null ) { + s.setAccession( acc ); + } + } + } } - else if ( client.getProcessingInstructions().equals( WebserviceUtil.PFAM_INST ) ) { - WebserviceUtil.extractSpTremblAccFromNodeName( phylogeny, "sptrembl" ); - PhylogenyMethods.transferInternalNodeNamesToConfidence( phylogeny, "bootstrap" ); } } static void processTreeFamTrees( final Phylogeny phy ) { @@ -140,15 +182,41 @@ public final class WebserviceUtil { final PhylogenyNode n = it.next(); if ( n.isExternal() ) { n.getNodeData().setEvent( null ); + if ( !ForesterUtil.isEmpty( n.getName() ) ) { + final Accession acc = SequenceAccessionTools.parseAccessorFromString( n.getName() ); + if ( acc != null ) { + if ( !n.getNodeData().isHasSequence() ) { + n.getNodeData().addSequence( new Sequence() ); + } + final Sequence s = n.getNodeData().getSequence(); + if ( s.getAccession() == null ) { + s.setAccession( acc ); + } + } + } + } + else { + if ( ( n.getBranchData() != null ) && n.getBranchData().isHasConfidences() + && ( n.getBranchData().getConfidence( 0 ) != null ) ) { + n.getBranchData().getConfidence( 0 ).setType( "bootstrap" ); + } + if ( !ForesterUtil.isEmpty( n.getName() ) ) { + if ( !n.getNodeData().isHasTaxonomy() ) { + n.getNodeData().addTaxonomy( new Taxonomy() ); + } + final Taxonomy t = n.getNodeData().getTaxonomy(); + if ( ForesterUtil.isEmpty( t.getScientificName() ) ) { + t.setScientificName( n.getName() ); + n.setName( "" ); + } + } } - if ( n.getNodeData().isHasTaxonomy() && ( n.getNodeData().getTaxonomy().getIdentifier() != null ) ) { n.getNodeData() .getTaxonomy() - .setIdentifier( new Identifier( n.getNodeData().getTaxonomy().getIdentifier().getValue(), "ncbi" ) ); + .setIdentifier( new Identifier( n.getNodeData().getTaxonomy().getIdentifier().getValue(), + "ncbi" ) ); } } } - - } diff --git a/forester/java/src/org/forester/archaeopteryx/webservices/WebservicesManager.java b/forester/java/src/org/forester/archaeopteryx/webservices/WebservicesManager.java index e60528d..42ad0b1 100644 --- a/forester/java/src/org/forester/archaeopteryx/webservices/WebservicesManager.java +++ b/forester/java/src/org/forester/archaeopteryx/webservices/WebservicesManager.java @@ -58,6 +58,6 @@ public final class WebservicesManager { } public enum WsPhylogenyFormat { - NH, NHX, NEXUS, TOL_XML_RESPONSE, PHYLOXML, NH_EXTRACT_TAXONOMY, PFAM, TREEBASE + NEXUS, NH, NH_EXTRACT_TAXONOMY, NHX, PFAM, PHYLOXML, TOL_XML_RESPONSE, TREEBASE_STUDY, TREEBASE_TREE } } diff --git a/forester/java/src/org/forester/io/parsers/nexus/NexusPhylogeniesParser.java b/forester/java/src/org/forester/io/parsers/nexus/NexusPhylogeniesParser.java index 46f6548..4a25f4d 100644 --- a/forester/java/src/org/forester/io/parsers/nexus/NexusPhylogeniesParser.java +++ b/forester/java/src/org/forester/io/parsers/nexus/NexusPhylogeniesParser.java @@ -51,32 +51,40 @@ import org.forester.util.ForesterUtil; public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, PhylogenyParser { final private static String begin_trees = NexusConstants.BEGIN_TREES.toLowerCase(); + final private static String end = NexusConstants.END.toLowerCase(); + final private static String endblock = "endblock"; + final private static Pattern ROOTEDNESS_PATTERN = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" ); final private static String taxlabels = NexusConstants.TAXLABELS.toLowerCase(); + final private static Pattern TITLE_PATTERN = Pattern.compile( "TITLE.?\\s+([^;]+)", + Pattern.CASE_INSENSITIVE ); final private static String translate = NexusConstants.TRANSLATE.toLowerCase(); final private static String tree = NexusConstants.TREE.toLowerCase(); - final private static String utree = NexusConstants.UTREE.toLowerCase(); - final private static String end = NexusConstants.END.toLowerCase(); - final private static String endblock = "endblock"; final private static Pattern TREE_NAME_PATTERN = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+", Pattern.CASE_INSENSITIVE ); - final private static Pattern ROOTEDNESS_PATTERN = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" ); - private Object _nexus_source; - private List _taxlabels; - private Map _translate_map; - private boolean _replace_underscores = NHXParser.REPLACE_UNDERSCORES_DEFAULT; - private boolean _ignore_quotes_in_nh_data = Constants.NH_PARSING_IGNORE_QUOTES_DEFAULT; - private TAXONOMY_EXTRACTION _taxonomy_extraction = TAXONOMY_EXTRACTION.NO; - private Phylogeny _next; + final private static String utree = NexusConstants.UTREE.toLowerCase(); private BufferedReader _br; - private boolean _in_trees_block; - private StringBuilder _nh; - private String _name; - private StringBuilder _translate_sb; + private boolean _ignore_quotes_in_nh_data = Constants.NH_PARSING_IGNORE_QUOTES_DEFAULT; private boolean _in_taxalabels; private boolean _in_translate; + private boolean _in_tree; + private boolean _in_trees_block; private boolean _is_rooted; + private String _name; + private Phylogeny _next; + private Object _nexus_source; + private StringBuilder _nh; + private boolean _replace_underscores = NHXParser.REPLACE_UNDERSCORES_DEFAULT; private boolean _rooted_info_present; - private boolean _in_tree; + private List _taxlabels; + private TAXONOMY_EXTRACTION _taxonomy_extraction = TAXONOMY_EXTRACTION.NO; + private String _title; + private Map _translate_map; + private StringBuilder _translate_sb; + + @Override + public String getName() { + return "Nexus Phylogenies Parser"; + } @Override public final boolean hasNext() { @@ -110,7 +118,8 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P _translate_map = new HashMap(); _nh = new StringBuilder(); _name = ""; - _translate_sb = new StringBuilder(); + _title = ""; + _translate_sb = null; _next = null; _in_trees_block = false; _in_taxalabels = false; @@ -143,7 +152,8 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P _taxonomy_extraction = taxonomy_extraction; } - private final void createPhylogeny( final String name, + private final void createPhylogeny( final String title, + final String name, final StringBuilder nhx, final boolean rooted_info_present, final boolean is_rooted ) throws IOException { @@ -160,7 +170,19 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P if ( p == null ) { throw new PhylogenyParserException( "failed to create phylogeny" ); } - p.setName( name ); + String myname = null; + if ( !ForesterUtil.isEmpty( title ) && !ForesterUtil.isEmpty( name ) ) { + myname = title.replace( '_', ' ' ).trim() + " (" + name.trim() + ")"; + } + else if ( !ForesterUtil.isEmpty( title ) ) { + myname = title.replace( '_', ' ' ).trim(); + } + else if ( !ForesterUtil.isEmpty( name ) ) { + myname = name.trim(); + } + if ( !ForesterUtil.isEmpty( myname ) ) { + p.setName( myname ); + } if ( rooted_info_present ) { p.setRooted( is_rooted ); } @@ -186,6 +208,11 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P if ( !_replace_underscores && ( ( _taxonomy_extraction != TAXONOMY_EXTRACTION.NO ) ) ) { ParserUtils.extractTaxonomyDataFromNodeName( node, _taxonomy_extraction ); } + else if ( _replace_underscores ) { + if ( !ForesterUtil.isEmpty( node.getName() ) ) { + node.setName( node.getName().replace( '_', ' ' ).trim() ); + } + } } } _next = p; @@ -204,6 +231,7 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P _in_trees_block = true; _in_taxalabels = false; _in_translate = false; + _title = ""; } else if ( line_lc.startsWith( taxlabels ) ) { _in_trees_block = false; @@ -211,20 +239,25 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P _in_translate = false; } else if ( line_lc.startsWith( translate ) ) { + _translate_sb = new StringBuilder(); _in_taxalabels = false; _in_translate = true; } else if ( _in_trees_block ) { - //FIXME TODO need to work on this "title" and "link" - if ( line_lc.startsWith( "title" ) || line_lc.startsWith( "link" ) ) { - // Do nothing. + if ( line_lc.startsWith( "title" ) ) { + final Matcher title_m = TITLE_PATTERN.matcher( line ); + if ( title_m.lookingAt() ) { + _title = title_m.group( 1 ); + } + } + else if ( line_lc.startsWith( "link" ) ) { } else if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) { _in_trees_block = false; _in_tree = false; _in_translate = false; if ( _nh.length() > 0 ) { - createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted ); + createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted ); _nh = new StringBuilder(); _name = ""; _rooted_info_present = false; @@ -238,7 +271,7 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P boolean might = false; if ( _nh.length() > 0 ) { might = true; - createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted ); + createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted ); _nh = new StringBuilder(); _name = ""; _rooted_info_present = false; @@ -271,7 +304,7 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P && !line_lc.startsWith( end ) && !line_lc.startsWith( endblock ) && line_lc.endsWith( ";" ) ) { _in_tree = false; _in_translate = false; - createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted ); + createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted ); _nh = new StringBuilder(); _name = ""; _rooted_info_present = false; @@ -316,7 +349,7 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P } } if ( _nh.length() > 0 ) { - createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted ); + createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted ); if ( _next != null ) { return; } @@ -331,10 +364,10 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P for( final String pair : s.split( "," ) ) { final String[] kv = pair.trim().split( "\\s+" ); if ( ( kv.length < 2 ) || ( kv.length > 3 ) ) { - throw new IOException( "ill-formatted translate values: " + translate_sb ); + throw new IOException( "ill-formatted translate values: " + pair ); } if ( ( kv.length == 3 ) && !kv[ 0 ].toLowerCase().trim().equals( translate ) ) { - throw new IOException( "ill-formatted translate values: " + translate_sb ); + throw new IOException( "ill-formatted translate values: " + pair ); } String key = ""; String value = ""; @@ -356,9 +389,4 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P private final static String removeWhiteSpaceBeforeSemicolon( final String s ) { return s.replaceAll( "\\s+;", ";" ); } - - @Override - public String getName() { - return "Nexus Phylogenies Parser"; - } } diff --git a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java index 1f872cf..c085575 100644 --- a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java +++ b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java @@ -1516,26 +1516,51 @@ public class PhylogenyMethods { } } - final static public void transferInternalNodeNamesToConfidence( final Phylogeny phy, final String confidence_type ) { + final static public boolean isInternalNamesLookLikeConfidences( final Phylogeny phy ) { final PhylogenyNodeIterator it = phy.iteratorPostorder(); while ( it.hasNext() ) { final PhylogenyNode n = it.next(); - if ( !n.isExternal() && !n.getBranchData().isHasConfidences() ) { + if ( !n.isExternal() && !n.isRoot() ) { if ( !ForesterUtil.isEmpty( n.getName() ) ) { - double d = -1.0; + double value = -1; try { - d = Double.parseDouble( n.getName() ); + value = Double.parseDouble( n.getName() ); } - catch ( final Exception e ) { - d = -1.0; + catch ( final NumberFormatException e ) { + return false; } - if ( d >= 0.0 ) { - n.getBranchData().addConfidence( new Confidence( d, confidence_type ) ); - n.setName( "" ); + if ( ( value < 0.0 ) || ( value > 100 ) ) { + return false; } } } } + return true; + } + + final static public void transferInternalNodeNamesToConfidence( final Phylogeny phy, final String confidence_type ) { + final PhylogenyNodeIterator it = phy.iteratorPostorder(); + while ( it.hasNext() ) { + transferInternalNodeNameToConfidence( confidence_type, it.next() ); + } + } + + private static void transferInternalNodeNameToConfidence( final String confidence_type, final PhylogenyNode n ) { + if ( !n.isExternal() && !n.getBranchData().isHasConfidences() ) { + if ( !ForesterUtil.isEmpty( n.getName() ) ) { + double d = -1.0; + try { + d = Double.parseDouble( n.getName() ); + } + catch ( final Exception e ) { + d = -1.0; + } + if ( d >= 0.0 ) { + n.getBranchData().addConfidence( new Confidence( d, confidence_type ) ); + n.setName( "" ); + } + } + } } final static public void transferNodeNameToField( final Phylogeny phy, diff --git a/forester/java/src/org/forester/phylogeny/data/Accession.java b/forester/java/src/org/forester/phylogeny/data/Accession.java index b3d99a0..fe11d8d 100644 --- a/forester/java/src/org/forester/phylogeny/data/Accession.java +++ b/forester/java/src/org/forester/phylogeny/data/Accession.java @@ -40,7 +40,7 @@ public final class Accession implements PhylogenyData, Comparable { final private String _value; public enum Source { - NCBI, REFSEQ, UNIPROT, GI, EMBL, UNKNOWN; + NCBI, REFSEQ, UNIPROT, GI, EMBL, ENSEMBL, UNKNOWN; @Override public String toString() { @@ -55,6 +55,8 @@ public final class Accession implements PhylogenyData, Comparable { return "gi"; case EMBL: return "embl"; + case ENSEMBL: + return "ensembl"; case UNKNOWN: return "unknown"; default: diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index 5103e7d..a7a7c8c 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -6656,6 +6656,35 @@ public final class Test { if ( phylogenies[ 17 ].getNumberOfExternalNodes() != 10 ) { return false; } + final NexusPhylogeniesParser p2 = new NexusPhylogeniesParser(); + phylogenies = null; + phylogenies = factory.create( Test.PATH_TO_TEST_DATA + "S15613.nex", p2 ); + if ( phylogenies.length != 9 ) { + return false; + } + if ( !isEqual( 0.48039661496919533, phylogenies[ 0 ].getNode( "Diadocidia_spinosula" ) + .getDistanceToParent() ) ) { + return false; + } + if ( !isEqual( 0.3959796191512233, phylogenies[ 0 ].getNode( "Diadocidia_stanfordensis" ) + .getDistanceToParent() ) ) { + return false; + } + if ( !phylogenies[ 0 ].getName().equals( "Family Diadocidiidae MLT (Imported_tree_0)" ) ) { + return false; + } + if ( !phylogenies[ 1 ].getName().equals( "Family Diadocidiidae BAT (con_50_majrule)" ) ) { + return false; + } + if ( !phylogenies[ 2 ].getName().equals( "Family Diadocidiidae BAT (con_50_majrule)" ) ) { + return false; + } + if ( !isEqual( 0.065284, phylogenies[ 7 ].getNode( "Bradysia_amoena" ).getDistanceToParent() ) ) { + return false; + } + if ( !isEqual( 0.065284, phylogenies[ 8 ].getNode( "Bradysia_amoena" ).getDistanceToParent() ) ) { + return false; + } } catch ( final Exception e ) { e.printStackTrace( System.out ); @@ -7218,6 +7247,82 @@ public final class Test { if ( phy.isRooted() ) { return false; } + // + final NexusPhylogeniesParser p2 = new NexusPhylogeniesParser(); + p2.setSource( Test.PATH_TO_TEST_DATA + "S15613.nex" ); + // 0 + if ( !p2.hasNext() ) { + return false; + } + phy = p2.next(); + if ( !isEqual( 0.48039661496919533, phy.getNode( "Diadocidia_spinosula" ).getDistanceToParent() ) ) { + return false; + } + if ( !isEqual( 0.3959796191512233, phy.getNode( "Diadocidia_stanfordensis" ).getDistanceToParent() ) ) { + return false; + } + // 1 + if ( !p2.hasNext() ) { + return false; + } + phy = p2.next(); + // 2 + if ( !p2.hasNext() ) { + return false; + } + phy = p2.next(); + // 3 + if ( !p2.hasNext() ) { + return false; + } + phy = p2.next(); + // 4 + if ( !p2.hasNext() ) { + return false; + } + phy = p2.next(); + // 5 + if ( !p2.hasNext() ) { + return false; + } + phy = p2.next(); + // 6 + if ( !p2.hasNext() ) { + return false; + } + phy = p2.next(); + // 7 + if ( !p2.hasNext() ) { + return false; + } + phy = p2.next(); + // 8 + if ( !p2.hasNext() ) { + return false; + } + phy = p2.next(); + if ( !isEqual( 0.065284, phy.getNode( "Bradysia_amoena" ).getDistanceToParent() ) ) { + return false; + } + if ( p2.hasNext() ) { + return false; + } + phy = p2.next(); + if ( phy != null ) { + return false; + } + // 0 + p2.reset(); + if ( !p2.hasNext() ) { + return false; + } + phy = p2.next(); + if ( !isEqual( 0.48039661496919533, phy.getNode( "Diadocidia_spinosula" ).getDistanceToParent() ) ) { + return false; + } + if ( !isEqual( 0.3959796191512233, phy.getNode( "Diadocidia_stanfordensis" ).getDistanceToParent() ) ) { + return false; + } } catch ( final Exception e ) { e.printStackTrace( System.out ); diff --git a/forester/java/src/org/forester/util/SequenceAccessionTools.java b/forester/java/src/org/forester/util/SequenceAccessionTools.java index cce1b2e..5ab000e 100644 --- a/forester/java/src/org/forester/util/SequenceAccessionTools.java +++ b/forester/java/src/org/forester/util/SequenceAccessionTools.java @@ -64,6 +64,7 @@ public final class SequenceAccessionTools { .compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" ); public final static Pattern UNIPROT_KB_PATTERN_2 = Pattern .compile( "(?:\\b|_)(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)(?:\\b|_)" ); + public final static Pattern ENSEMBL_PATTERN = Pattern.compile( "(?:\\b|_)(ENS[A-Z]*[0-9]+)(?:\\b|_)" ); // RefSeq accession numbers can be distinguished from GenBank accessions // by their distinct prefix format of 2 characters followed by an // underscore character ('_'). For example, a RefSeq protein accession is NP_015325. @@ -243,6 +244,10 @@ public final class SequenceAccessionTools { if ( !ForesterUtil.isEmpty( v ) ) { return new Accession( v, Source.GI ); } + v = parseEnsemlAccessorFromString( s ); + if ( !ForesterUtil.isEmpty( v ) ) { + return new Accession( v, Source.ENSEMBL ); + } } return null; } @@ -287,6 +292,14 @@ public final class SequenceAccessionTools { return null; } + public final static String parseEnsemlAccessorFromString( final String s ) { + final Matcher m = ENSEMBL_PATTERN.matcher( s ); + if ( m.find() ) { + return m.group( 1 ); + } + return null; + } + public final static String parseRefSeqAccessorFromString( final String s ) { final Matcher m = REFSEQ_PATTERN.matcher( s ); if ( m.lookingAt() ) { -- 1.7.10.2