X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fphylogeny%2FPhylogenyMethods.java;h=c3fbc95cb6b32731f0ee22c3525775f5cf7a21a9;hb=5cad5dbd4f9e8cf09a123d4ee490cf314c05fd2f;hp=459138beb2900ed34234c3ef77356b99512ea51c;hpb=10297bd8b8a4b4ab198a17a42fc6ff24ae2ed49b;p=jalview.git diff --git a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java index 459138b..c3fbc95 100644 --- a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java +++ b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java @@ -107,7 +107,7 @@ public class PhylogenyMethods { return could_extract; } - public static DescriptiveStatistics calculatBranchLengthStatistics( final Phylogeny phy ) { + public static DescriptiveStatistics calculateBranchLengthStatistics( final Phylogeny phy ) { final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { final PhylogenyNode n = iter.next(); @@ -118,7 +118,7 @@ public class PhylogenyMethods { return stats; } - public static List calculatConfidenceStatistics( final Phylogeny phy ) { + public static List calculateConfidenceStatistics( final Phylogeny phy ) { final List stats = new ArrayList(); for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { final PhylogenyNode n = iter.next(); @@ -133,7 +133,7 @@ public class PhylogenyMethods { if ( !ForesterUtil.isEmpty( stats.get( i ).getDescription() ) ) { if ( !stats.get( i ).getDescription().equalsIgnoreCase( c.getType() ) ) { throw new IllegalArgumentException( "support values in node [" + n.toString() - + "] appear inconsistently ordered" ); + + "] appear inconsistently ordered" ); } } stats.get( i ).setDescription( c.getType() ); @@ -305,7 +305,7 @@ public class PhylogenyMethods { return x; } - public static DescriptiveStatistics calculatNumberOfDescendantsPerNodeStatistics( final Phylogeny phy ) { + public static DescriptiveStatistics calculateNumberOfDescendantsPerNodeStatistics( final Phylogeny phy ) { final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { final PhylogenyNode n = iter.next(); @@ -936,12 +936,46 @@ public class PhylogenyMethods { } } + private static enum NDF { + NodeName( "NN" ), + TaxonomyCode( "TC" ), + TaxonomyCommonName( "CN" ), + TaxonomyScientificName( "TS" ), + TaxonomyIdentifier( "TI" ), + TaxonomySynonym( "SY" ), + SequenceName( "SN" ), + GeneName( "GN" ), + SequenceSymbol( "SS" ), + SequenceAccession( "SA" ), + Domain( "DO" ), + Annotation( "AN" ), + CrossRef( "XR" ), + BinaryCharacter( "BC" ), + MolecularSequence( "MS" ); + + private final String _text; + + NDF( final String text ) { + _text = text; + } + + public static NDF fromString( final String text ) { + for( final NDF n : NDF.values() ) { + if ( text.startsWith( n._text ) ) { + return n; + } + } + return null; + } + } + public static List searchData( final String query, final Phylogeny phy, final boolean case_sensitive, final boolean partial, final boolean regex, - final boolean search_domains ) { + final boolean search_domains, + final double domains_confidence_threshold ) { final List nodes = new ArrayList(); if ( phy.isEmpty() || ( query == null ) ) { return nodes; @@ -949,125 +983,160 @@ public class PhylogenyMethods { if ( ForesterUtil.isEmpty( query ) ) { return nodes; } + String my_query = query; + NDF ndf = null; + if ( ( my_query.length() > 2 ) && ( my_query.indexOf( ":" ) == 2 ) ) { + ndf = NDF.fromString( my_query ); + if ( ndf != null ) { + my_query = my_query.substring( 3 ); + } + } for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { final PhylogenyNode node = iter.next(); boolean match = false; - if ( match( node.getName(), query, case_sensitive, partial, regex ) ) { + if ( ( ( ndf == null ) || ( ndf == NDF.NodeName ) ) + && match( node.getName(), my_query, case_sensitive, partial, regex ) ) { match = true; } - else if ( node.getNodeData().isHasTaxonomy() - && match( node.getNodeData().getTaxonomy().getTaxonomyCode(), query, case_sensitive, partial, regex ) ) { + else if ( ( ( ndf == null ) || ( ndf == NDF.TaxonomyCode ) ) + && node.getNodeData().isHasTaxonomy() + && match( node.getNodeData().getTaxonomy().getTaxonomyCode(), + my_query, + case_sensitive, + partial, + regex ) ) { match = true; } - else if ( node.getNodeData().isHasTaxonomy() - && match( node.getNodeData().getTaxonomy().getCommonName(), query, case_sensitive, partial, regex ) ) { + else if ( ( ( ndf == null ) || ( ndf == NDF.TaxonomyCommonName ) ) + && node.getNodeData().isHasTaxonomy() + && match( node.getNodeData().getTaxonomy().getCommonName(), + my_query, + case_sensitive, + partial, + regex ) ) { match = true; } - else if ( node.getNodeData().isHasTaxonomy() + else if ( ( ( ndf == null ) || ( ndf == NDF.TaxonomyScientificName ) ) + && node.getNodeData().isHasTaxonomy() && match( node.getNodeData().getTaxonomy().getScientificName(), - query, + my_query, case_sensitive, partial, regex ) ) { match = true; } - else if ( node.getNodeData().isHasTaxonomy() + else if ( ( ( ndf == null ) || ( ndf == NDF.TaxonomyIdentifier ) ) + && node.getNodeData().isHasTaxonomy() && ( node.getNodeData().getTaxonomy().getIdentifier() != null ) && match( node.getNodeData().getTaxonomy().getIdentifier().getValue(), - query, + my_query, case_sensitive, partial, regex ) ) { match = true; } - else if ( node.getNodeData().isHasTaxonomy() && !node.getNodeData().getTaxonomy().getSynonyms().isEmpty() ) { + else if ( ( ( ndf == null ) || ( ndf == NDF.TaxonomySynonym ) ) && node.getNodeData().isHasTaxonomy() + && !node.getNodeData().getTaxonomy().getSynonyms().isEmpty() ) { final List syns = node.getNodeData().getTaxonomy().getSynonyms(); I: for( final String syn : syns ) { - if ( match( syn, query, case_sensitive, partial, regex ) ) { + if ( match( syn, my_query, case_sensitive, partial, regex ) ) { match = true; break I; } } } - if ( !match && node.getNodeData().isHasSequence() - && match( node.getNodeData().getSequence().getName(), query, case_sensitive, partial, regex ) ) { + if ( !match && ( ( ndf == null ) || ( ndf == NDF.SequenceName ) ) && node.getNodeData().isHasSequence() + && match( node.getNodeData().getSequence().getName(), my_query, case_sensitive, partial, regex ) ) { match = true; } - if ( !match && node.getNodeData().isHasSequence() - && match( node.getNodeData().getSequence().getGeneName(), query, case_sensitive, partial, regex ) ) { + if ( !match && ( ( ndf == null ) || ( ndf == NDF.GeneName ) ) && node.getNodeData().isHasSequence() + && match( node.getNodeData().getSequence().getGeneName(), my_query, case_sensitive, partial, regex ) ) { match = true; } - if ( !match && node.getNodeData().isHasSequence() - && match( node.getNodeData().getSequence().getSymbol(), query, case_sensitive, partial, regex ) ) { + if ( !match && ( ( ndf == null ) || ( ndf == NDF.SequenceSymbol ) ) && node.getNodeData().isHasSequence() + && match( node.getNodeData().getSequence().getSymbol(), my_query, case_sensitive, partial, regex ) ) { match = true; } if ( !match + && ( ( ndf == null ) || ( ndf == NDF.SequenceAccession ) ) && node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null ) && match( node.getNodeData().getSequence().getAccession().getValue(), - query, + my_query, case_sensitive, partial, regex ) ) { match = true; } - if ( search_domains && !match && node.getNodeData().isHasSequence() + if ( !match && ( ( ( ndf == null ) && search_domains ) || ( ndf == NDF.Domain ) ) + && node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getDomainArchitecture() != null ) ) { final DomainArchitecture da = node.getNodeData().getSequence().getDomainArchitecture(); I: for( int i = 0; i < da.getNumberOfDomains(); ++i ) { - if ( match( da.getDomain( i ).getName(), query, case_sensitive, partial, regex ) ) { + if ( ( da.getDomain( i ).getConfidence() <= domains_confidence_threshold ) + && ( match( da.getDomain( i ).getName(), my_query, case_sensitive, partial, regex ) ) ) { match = true; break I; } } } - if ( !match && node.getNodeData().isHasSequence() + if ( !match && ( ( ndf == null ) || ( ndf == NDF.Annotation ) ) && node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAnnotations() != null ) ) { for( final Annotation ann : node.getNodeData().getSequence().getAnnotations() ) { - if ( match( ann.getDesc(), query, case_sensitive, partial, regex ) ) { + if ( match( ann.getDesc(), my_query, case_sensitive, partial, regex ) ) { match = true; break; } - if ( match( ann.getRef(), query, case_sensitive, partial, regex ) ) { + if ( match( ann.getRef(), my_query, case_sensitive, partial, regex ) ) { match = true; break; } } } - if ( !match && node.getNodeData().isHasSequence() + if ( !match && ( ( ndf == null ) || ( ndf == NDF.CrossRef ) ) && node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getCrossReferences() != null ) ) { for( final Accession x : node.getNodeData().getSequence().getCrossReferences() ) { - if ( match( x.getComment(), query, case_sensitive, partial, regex ) ) { + if ( match( x.getComment(), my_query, case_sensitive, partial, regex ) ) { match = true; break; } - if ( match( x.getSource(), query, case_sensitive, partial, regex ) ) { + if ( match( x.getSource(), my_query, case_sensitive, partial, regex ) ) { match = true; break; } - if ( match( x.getValue(), query, case_sensitive, partial, regex ) ) { + if ( match( x.getValue(), my_query, case_sensitive, partial, regex ) ) { match = true; break; } } } - // - if ( !match && ( node.getNodeData().getBinaryCharacters() != null ) ) { + if ( !match && ( ( ndf == null ) || ( ndf == NDF.BinaryCharacter ) ) + && ( node.getNodeData().getBinaryCharacters() != null ) ) { Iterator it = node.getNodeData().getBinaryCharacters().getPresentCharacters().iterator(); I: while ( it.hasNext() ) { - if ( match( it.next(), query, case_sensitive, partial, regex ) ) { + if ( match( it.next(), my_query, case_sensitive, partial, regex ) ) { match = true; break I; } } it = node.getNodeData().getBinaryCharacters().getGainedCharacters().iterator(); I: while ( it.hasNext() ) { - if ( match( it.next(), query, case_sensitive, partial, regex ) ) { + if ( match( it.next(), my_query, case_sensitive, partial, regex ) ) { match = true; break I; } } } + if ( !match + && ( ndf == NDF.MolecularSequence ) + && node.getNodeData().isHasSequence() + && match( node.getNodeData().getSequence().getMolecularSequence(), + my_query, + case_sensitive, + true, + regex ) ) { + match = true; + } if ( match ) { nodes.add( node ); } @@ -1079,7 +1148,8 @@ public class PhylogenyMethods { final Phylogeny phy, final boolean case_sensitive, final boolean partial, - final boolean search_domains ) { + final boolean search_domains, + final double domains_confidence_threshold ) { final List nodes = new ArrayList(); if ( phy.isEmpty() || ( queries == null ) || ( queries.length < 1 ) ) { return nodes; @@ -1087,15 +1157,28 @@ public class PhylogenyMethods { for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { final PhylogenyNode node = iter.next(); boolean all_matched = true; - for( final String query : queries ) { + for( String query : queries ) { + if ( query == null ) { + continue; + } + query = query.trim(); + NDF ndf = null; + if ( ( query.length() > 2 ) && ( query.indexOf( ":" ) == 2 ) ) { + ndf = NDF.fromString( query ); + if ( ndf != null ) { + query = query.substring( 3 ); + } + } boolean match = false; if ( ForesterUtil.isEmpty( query ) ) { continue; } - if ( match( node.getName(), query, case_sensitive, partial, false ) ) { + if ( ( ( ndf == null ) || ( ndf == NDF.NodeName ) ) + && match( node.getName(), query, case_sensitive, partial, false ) ) { match = true; } - else if ( node.getNodeData().isHasTaxonomy() + else if ( ( ( ndf == null ) || ( ndf == NDF.TaxonomyCode ) ) + && node.getNodeData().isHasTaxonomy() && match( node.getNodeData().getTaxonomy().getTaxonomyCode(), query, case_sensitive, @@ -1103,7 +1186,8 @@ public class PhylogenyMethods { false ) ) { match = true; } - else if ( node.getNodeData().isHasTaxonomy() + else if ( ( ( ndf == null ) || ( ndf == NDF.TaxonomyCommonName ) ) + && node.getNodeData().isHasTaxonomy() && match( node.getNodeData().getTaxonomy().getCommonName(), query, case_sensitive, @@ -1111,7 +1195,8 @@ public class PhylogenyMethods { false ) ) { match = true; } - else if ( node.getNodeData().isHasTaxonomy() + else if ( ( ( ndf == null ) || ( ndf == NDF.TaxonomyScientificName ) ) + && node.getNodeData().isHasTaxonomy() && match( node.getNodeData().getTaxonomy().getScientificName(), query, case_sensitive, @@ -1119,7 +1204,8 @@ public class PhylogenyMethods { false ) ) { match = true; } - else if ( node.getNodeData().isHasTaxonomy() + else if ( ( ( ndf == null ) || ( ndf == NDF.TaxonomyIdentifier ) ) + && node.getNodeData().isHasTaxonomy() && ( node.getNodeData().getTaxonomy().getIdentifier() != null ) && match( node.getNodeData().getTaxonomy().getIdentifier().getValue(), query, @@ -1128,7 +1214,7 @@ public class PhylogenyMethods { false ) ) { match = true; } - else if ( node.getNodeData().isHasTaxonomy() + else if ( ( ( ndf == null ) || ( ndf == NDF.TaxonomySynonym ) ) && node.getNodeData().isHasTaxonomy() && !node.getNodeData().getTaxonomy().getSynonyms().isEmpty() ) { final List syns = node.getNodeData().getTaxonomy().getSynonyms(); I: for( final String syn : syns ) { @@ -1138,20 +1224,23 @@ public class PhylogenyMethods { } } } - if ( !match && node.getNodeData().isHasSequence() + if ( !match && ( ( ndf == null ) || ( ndf == NDF.SequenceName ) ) && node.getNodeData().isHasSequence() && match( node.getNodeData().getSequence().getName(), query, case_sensitive, partial, false ) ) { match = true; } if ( !match + && ( ( ndf == null ) || ( ndf == NDF.GeneName ) ) && node.getNodeData().isHasSequence() && match( node.getNodeData().getSequence().getGeneName(), query, case_sensitive, partial, false ) ) { match = true; } - if ( !match && node.getNodeData().isHasSequence() + if ( !match && ( ( ndf == null ) || ( ndf == NDF.SequenceSymbol ) ) + && node.getNodeData().isHasSequence() && match( node.getNodeData().getSequence().getSymbol(), query, case_sensitive, partial, false ) ) { match = true; } if ( !match + && ( ( ndf == null ) || ( ndf == NDF.SequenceAccession ) ) && node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null ) && match( node.getNodeData().getSequence().getAccession().getValue(), @@ -1161,18 +1250,19 @@ public class PhylogenyMethods { false ) ) { match = true; } - if ( search_domains && !match && node.getNodeData().isHasSequence() + if ( !match && ( ( ( ndf == null ) && search_domains ) || ( ndf == NDF.Domain ) ) + && node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getDomainArchitecture() != null ) ) { final DomainArchitecture da = node.getNodeData().getSequence().getDomainArchitecture(); I: for( int i = 0; i < da.getNumberOfDomains(); ++i ) { - if ( match( da.getDomain( i ).getName(), query, case_sensitive, partial, false ) ) { + if ( ( da.getDomain( i ).getConfidence() <= domains_confidence_threshold ) + && match( da.getDomain( i ).getName(), query, case_sensitive, partial, false ) ) { match = true; break I; } } } - // - if ( !match && node.getNodeData().isHasSequence() + if ( !match && ( ( ndf == null ) || ( ndf == NDF.Annotation ) ) && node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAnnotations() != null ) ) { for( final Annotation ann : node.getNodeData().getSequence().getAnnotations() ) { if ( match( ann.getDesc(), query, case_sensitive, partial, false ) ) { @@ -1185,7 +1275,7 @@ public class PhylogenyMethods { } } } - if ( !match && node.getNodeData().isHasSequence() + if ( !match && ( ( ndf == null ) || ( ndf == NDF.CrossRef ) ) && node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getCrossReferences() != null ) ) { for( final Accession x : node.getNodeData().getSequence().getCrossReferences() ) { if ( match( x.getComment(), query, case_sensitive, partial, false ) ) { @@ -1202,8 +1292,8 @@ public class PhylogenyMethods { } } } - // - if ( !match && ( node.getNodeData().getBinaryCharacters() != null ) ) { + if ( !match && ( ( ndf == null ) || ( ndf == NDF.BinaryCharacter ) ) + && ( node.getNodeData().getBinaryCharacters() != null ) ) { Iterator it = node.getNodeData().getBinaryCharacters().getPresentCharacters().iterator(); I: while ( it.hasNext() ) { if ( match( it.next(), query, case_sensitive, partial, false ) ) { @@ -1219,6 +1309,16 @@ public class PhylogenyMethods { } } } + if ( !match + && ( ndf == NDF.MolecularSequence ) + && node.getNodeData().isHasSequence() + && match( node.getNodeData().getSequence().getMolecularSequence(), + query, + case_sensitive, + true, + false ) ) { + match = true; + } if ( !match ) { all_matched = false; break; @@ -1365,7 +1465,7 @@ public class PhylogenyMethods { else if ( !( ref_ext_taxo.contains( n.getNodeData().getTaxonomy().getScientificName() ) ) && !( ref_ext_taxo.contains( n.getNodeData().getTaxonomy().getTaxonomyCode() ) ) && !( ( n.getNodeData().getTaxonomy().getIdentifier() != null ) && ref_ext_taxo.contains( n - .getNodeData().getTaxonomy().getIdentifier().getValuePlusProvider() ) ) ) { + .getNodeData().getTaxonomy().getIdentifier().getValuePlusProvider() ) ) ) { nodes_to_delete.add( n ); } } @@ -1502,7 +1602,7 @@ public class PhylogenyMethods { n.setName( "" ); } n.getNodeData().getTaxonomy() - .setIdentifier( new Identifier( id, PhyloXmlUtil.UNIPROT_TAX_PROVIDER ) ); + .setIdentifier( new Identifier( id, PhyloXmlUtil.UNIPROT_TAX_PROVIDER ) ); break; } case TAXONOMY_ID_UNIPROT_2: { @@ -1518,7 +1618,7 @@ public class PhylogenyMethods { n.setName( "" ); } n.getNodeData().getTaxonomy() - .setIdentifier( new Identifier( id, PhyloXmlUtil.UNIPROT_TAX_PROVIDER ) ); + .setIdentifier( new Identifier( id, PhyloXmlUtil.UNIPROT_TAX_PROVIDER ) ); break; } case TAXONOMY_ID: {