X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fphylogeny%2FPhylogenyMethods.java;h=806f031f45854d197eccb15af3f7a78fafb33765;hb=a0b68380946f9cc6fe0a2a8f2994299a92e8d907;hp=cc140e4da4609d916541d455ff7eb7fc364eeabb;hpb=9309c84fbe86a9212029fee105c51765220f4e0c;p=jalview.git diff --git a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java index cc140e4..806f031 100644 --- a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java +++ b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java @@ -36,14 +36,18 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; +import java.util.Map; import java.util.Set; -import java.util.SortedMap; -import java.util.TreeMap; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.forester.io.parsers.FastaParser; import org.forester.io.parsers.PhylogenyParser; import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; import org.forester.io.parsers.phyloxml.PhyloXmlUtil; import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.Accession; +import org.forester.phylogeny.data.Annotation; import org.forester.phylogeny.data.BranchColor; import org.forester.phylogeny.data.BranchWidth; import org.forester.phylogeny.data.Confidence; @@ -71,6 +75,38 @@ public class PhylogenyMethods { throw new CloneNotSupportedException(); } + public static void extractFastaInformation( final Phylogeny phy ) { + for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + if ( !ForesterUtil.isEmpty( node.getName() ) ) { + final Matcher name_m = FastaParser.FASTA_DESC_LINE.matcher( node.getName() ); + if ( name_m.lookingAt() ) { + System.out.println(); + // System.out.println( name_m.group( 1 ) ); + // System.out.println( name_m.group( 2 ) ); + // System.out.println( name_m.group( 3 ) ); + // System.out.println( name_m.group( 4 ) ); + final String acc_source = name_m.group( 1 ); + final String acc = name_m.group( 2 ); + final String seq_name = name_m.group( 3 ); + final String tax_sn = name_m.group( 4 ); + if ( !ForesterUtil.isEmpty( acc_source ) && !ForesterUtil.isEmpty( acc ) ) { + ForesterUtil.ensurePresenceOfSequence( node ); + node.getNodeData().getSequence( 0 ).setAccession( new Accession( acc, acc_source ) ); + } + if ( !ForesterUtil.isEmpty( seq_name ) ) { + ForesterUtil.ensurePresenceOfSequence( node ); + node.getNodeData().getSequence( 0 ).setName( seq_name ); + } + if ( !ForesterUtil.isEmpty( tax_sn ) ) { + ForesterUtil.ensurePresenceOfTaxonomy( node ); + node.getNodeData().getTaxonomy( 0 ).setScientificName( tax_sn ); + } + } + } + } + } + public static DescriptiveStatistics calculatBranchLengthStatistics( final Phylogeny phy ) { final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { @@ -409,6 +445,26 @@ public class PhylogenyMethods { phy.externalNodesHaveChanged(); } + public final static List> divideIntoSubTrees( final Phylogeny phy, + final double min_distance_to_root ) { + if ( min_distance_to_root <= 0 ) { + throw new IllegalArgumentException( "attempt to use min distance to root of: " + min_distance_to_root ); + } + final List> l = new ArrayList>(); + setAllIndicatorsToZero( phy ); + for( final PhylogenyNodeIterator it = phy.iteratorExternalForward(); it.hasNext(); ) { + final PhylogenyNode n = it.next(); + if ( n.getIndicator() != 0 ) { + continue; + } + l.add( divideIntoSubTreesHelper( n, min_distance_to_root ) ); + if ( l.isEmpty() ) { + throw new RuntimeException( "this should not have happened" ); + } + } + return l; + } + public static List getAllDescendants( final PhylogenyNode node ) { final List descs = new ArrayList(); final Set encountered = new HashSet(); @@ -709,9 +765,9 @@ public class PhylogenyMethods { * null is returned. * */ - public static SortedMap obtainDistinctTaxonomyCounts( final PhylogenyNode node ) { + public static Map obtainDistinctTaxonomyCounts( final PhylogenyNode node ) { final List descs = node.getAllExternalDescendants(); - final SortedMap tax_map = new TreeMap(); + final Map tax_map = new HashMap(); for( final PhylogenyNode n : descs ) { if ( !n.getNodeData().isHasTaxonomy() || n.getNodeData().getTaxonomy().isEmpty() ) { return null; @@ -918,6 +974,10 @@ public class PhylogenyMethods { match = true; } if ( !match && node.getNodeData().isHasSequence() + && match( node.getNodeData().getSequence().getGeneName(), query, case_sensitive, partial ) ) { + match = true; + } + if ( !match && node.getNodeData().isHasSequence() && match( node.getNodeData().getSequence().getSymbol(), query, case_sensitive, partial ) ) { match = true; } @@ -940,6 +1000,38 @@ public class PhylogenyMethods { } } } + // + if ( !match && node.getNodeData().isHasSequence() + && ( node.getNodeData().getSequence().getAnnotations() != null ) ) { + for( final Annotation ann : node.getNodeData().getSequence().getAnnotations() ) { + if ( match( ann.getDesc(), query, case_sensitive, partial ) ) { + match = true; + break; + } + if ( match( ann.getRef(), query, case_sensitive, partial ) ) { + match = true; + break; + } + } + } + if ( !match && node.getNodeData().isHasSequence() + && ( node.getNodeData().getSequence().getCrossReferences() != null ) ) { + for( final Accession x : node.getNodeData().getSequence().getCrossReferences() ) { + if ( match( x.getComment(), query, case_sensitive, partial ) ) { + match = true; + break; + } + if ( match( x.getSource(), query, case_sensitive, partial ) ) { + match = true; + break; + } + if ( match( x.getValue(), query, case_sensitive, partial ) ) { + match = true; + break; + } + } + } + // if ( !match && ( node.getNodeData().getBinaryCharacters() != null ) ) { Iterator it = node.getNodeData().getBinaryCharacters().getPresentCharacters().iterator(); I: while ( it.hasNext() ) { @@ -1018,6 +1110,10 @@ public class PhylogenyMethods { match = true; } if ( !match && node.getNodeData().isHasSequence() + && match( node.getNodeData().getSequence().getGeneName(), query, case_sensitive, partial ) ) { + match = true; + } + if ( !match && node.getNodeData().isHasSequence() && match( node.getNodeData().getSequence().getSymbol(), query, case_sensitive, partial ) ) { match = true; } @@ -1040,6 +1136,38 @@ public class PhylogenyMethods { } } } + // + if ( !match && node.getNodeData().isHasSequence() + && ( node.getNodeData().getSequence().getAnnotations() != null ) ) { + for( final Annotation ann : node.getNodeData().getSequence().getAnnotations() ) { + if ( match( ann.getDesc(), query, case_sensitive, partial ) ) { + match = true; + break; + } + if ( match( ann.getRef(), query, case_sensitive, partial ) ) { + match = true; + break; + } + } + } + if ( !match && node.getNodeData().isHasSequence() + && ( node.getNodeData().getSequence().getCrossReferences() != null ) ) { + for( final Accession x : node.getNodeData().getSequence().getCrossReferences() ) { + if ( match( x.getComment(), query, case_sensitive, partial ) ) { + match = true; + break; + } + if ( match( x.getSource(), query, case_sensitive, partial ) ) { + match = true; + break; + } + if ( match( x.getValue(), query, case_sensitive, partial ) ) { + match = true; + break; + } + } + } + // if ( !match && ( node.getNodeData().getBinaryCharacters() != null ) ) { Iterator it = node.getNodeData().getBinaryCharacters().getPresentCharacters().iterator(); I: while ( it.hasNext() ) { @@ -1068,6 +1196,12 @@ public class PhylogenyMethods { return nodes; } + public static void setAllIndicatorsToZero( final Phylogeny phy ) { + for( final PhylogenyNodeIterator it = phy.iteratorPostorder(); it.hasNext(); ) { + it.next().setIndicator( ( byte ) 0 ); + } + } + /** * Convenience method. * Sets value for the first confidence value (created if not present, values overwritten otherwise). @@ -1171,6 +1305,11 @@ public class PhylogenyMethods { return n1.getNodeData().getSequence().getSymbol() .compareTo( n2.getNodeData().getSequence().getSymbol() ); } + if ( ( !ForesterUtil.isEmpty( n1.getNodeData().getSequence().getGeneName() ) ) + && ( !ForesterUtil.isEmpty( n2.getNodeData().getSequence().getGeneName() ) ) ) { + return n1.getNodeData().getSequence().getGeneName() + .compareTo( n2.getNodeData().getSequence().getGeneName() ); + } if ( ( n1.getNodeData().getSequence().getAccession() != null ) && ( n2.getNodeData().getSequence().getAccession() != null ) && !ForesterUtil.isEmpty( n1.getNodeData().getSequence().getAccession().getValue() ) @@ -1200,6 +1339,11 @@ public class PhylogenyMethods { return n1.getNodeData().getSequence().getSymbol() .compareTo( n2.getNodeData().getSequence().getSymbol() ); } + if ( ( !ForesterUtil.isEmpty( n1.getNodeData().getSequence().getGeneName() ) ) + && ( !ForesterUtil.isEmpty( n2.getNodeData().getSequence().getGeneName() ) ) ) { + return n1.getNodeData().getSequence().getGeneName() + .compareTo( n2.getNodeData().getSequence().getGeneName() ); + } if ( ( n1.getNodeData().getSequence().getAccession() != null ) && ( n2.getNodeData().getSequence().getAccession() != null ) && !ForesterUtil.isEmpty( n1.getNodeData().getSequence().getAccession().getValue() ) @@ -1266,6 +1410,11 @@ public class PhylogenyMethods { return n1.getNodeData().getSequence().getSymbol() .compareTo( n2.getNodeData().getSequence().getSymbol() ); } + if ( ( !ForesterUtil.isEmpty( n1.getNodeData().getSequence().getGeneName() ) ) + && ( !ForesterUtil.isEmpty( n2.getNodeData().getSequence().getGeneName() ) ) ) { + return n1.getNodeData().getSequence().getGeneName() + .compareTo( n2.getNodeData().getSequence().getGeneName() ); + } if ( ( n1.getNodeData().getSequence().getAccession() != null ) && ( n2.getNodeData().getSequence().getAccession() != null ) && !ForesterUtil.isEmpty( n1.getNodeData().getSequence().getAccession().getValue() ) @@ -1544,6 +1693,20 @@ public class PhylogenyMethods { } } + private final static List divideIntoSubTreesHelper( final PhylogenyNode node, + final double min_distance_to_root ) { + final List l = new ArrayList(); + final PhylogenyNode r = moveTowardsRoot( node, min_distance_to_root ); + for( final PhylogenyNode ext : r.getAllExternalDescendants() ) { + if ( ext.getIndicator() != 0 ) { + throw new RuntimeException( "this should not have happened" ); + } + ext.setIndicator( ( byte ) 1 ); + l.add( ext ); + } + return l; + } + /** * Calculates the distance between PhylogenyNodes n1 and n2. * PRECONDITION: n1 is a descendant of n2. @@ -1581,23 +1744,33 @@ public class PhylogenyMethods { return my_s.indexOf( my_query ) >= 0; } else { - return my_s.equals( my_query ); + return Pattern.compile( "(\\b|_)" + Pattern.quote( my_query ) + "(\\b|_)" ).matcher( my_s ).find(); + } + } + + private final static PhylogenyNode moveTowardsRoot( final PhylogenyNode node, final double min_distance_to_root ) { + PhylogenyNode n = node; + PhylogenyNode prev = node; + while ( min_distance_to_root < n.calculateDistanceToRoot() ) { + prev = n; + n = n.getParent(); } + return prev; } public static enum DESCENDANT_SORT_PRIORITY { - TAXONOMY, SEQUENCE, NODE_NAME; + NODE_NAME, SEQUENCE, TAXONOMY; } public static enum PhylogenyNodeField { CLADE_NAME, + SEQUENCE_NAME, + SEQUENCE_SYMBOL, TAXONOMY_CODE, - TAXONOMY_SCIENTIFIC_NAME, TAXONOMY_COMMON_NAME, - SEQUENCE_SYMBOL, - SEQUENCE_NAME, + TAXONOMY_ID, TAXONOMY_ID_UNIPROT_1, TAXONOMY_ID_UNIPROT_2, - TAXONOMY_ID; + TAXONOMY_SCIENTIFIC_NAME; } }