in progress...
author cmzmasek <chris.zma@outlook.com>
Fri, 18 Aug 2017 21:49:00 +0000 (14:49 -0700)
committer cmzmasek <chris.zma@outlook.com>
Fri, 18 Aug 2017 21:49:00 +0000 (14:49 -0700)
forester/java/src/org/forester/clade_analysis/Analysis2.java
forester/java/src/org/forester/clade_analysis/CladeAnalysisTest.java
forester/java/src/org/forester/clade_analysis/Result2.java
forester/java/src/org/forester/util/ForesterUtil.java
forester/test_data/pplacer_2.tre [new file with mode: 0644]

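diff --git a/forester/java/src/org/forester/clade_analysis/Analysis2.java b/forester/java/src/org/forester/clade_analysis/Analysis2.java
index 4c1d368..f3b8cae 100644 (file)
--- a/forester/java/src/org/forester/clade_analysis/Analysis2.java
+++ b/forester/java/src/org/forester/clade_analysis/Analysis2.java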
@@ -33,6 +33,8 @@ package org.forester.clade_analysis;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.forester.phylogeny.Phylogeny;
 import org.forester.phylogeny.PhylogenyNode;
@@ -41,29 +43,62 @@ import org.forester.util.ForesterUtil;
 
 public final class Analysis2 {
 
-    public static Result2 execute( final Phylogeny p, final String query, final String separator ) {
-        final PhylogenyNode qnode = p.getNode( query );
-        if ( qnode.isRoot() ) {
-            throw new IllegalStateException( "Unexpected error: Query " + query
-                    + " is root. This should have never happened" );
-        }
-        if ( qnode.getParent().isRoot() ) {
-            throw new IllegalStateException( "Unexpected error: Parent of query " + query
-                    + " is root. This should have never happened" );
-        }
-        PhylogenyNode qnode_p = qnode.getParent();
-        PhylogenyNode qnode_pp = qnode.getParent().getParent();
-        while ( qnode_p.getNumberOfDescendants() == 1 ) {
-            qnode_p = qnode_p.getParent();
-        }
-        while ( qnode_pp.getNumberOfDescendants() == 1 ) {
-            qnode_pp = qnode_pp.getParent();
+    public static Result2 execute( final Phylogeny p, final Pattern query, final String separator ) {
+        final List<PhylogenyNode> qnodes = p.getNodes( query );
+        final Result2 res = new Result2();
+        for( int i = 0; i < qnodes.size(); ++i ) {
+            final PhylogenyNode qnode = qnodes.get( i );
+            System.out.println( ">>" + qnode.getName() );
+            if ( qnode.isRoot() ) {
+                throw new IllegalArgumentException( "Query " + query + " is root." );
+            }
+            if ( qnode.getParent().isRoot() ) {
+                throw new IllegalArgumentException( "Parent of query " + query + " is root." );
+            }
+            PhylogenyNode qnode_p = qnode.getParent();
+            PhylogenyNode qnode_pp = qnode.getParent().getParent();
+            //This is to deal with internal nodes with 1 descendant.
+            while ( qnode_p.getNumberOfDescendants() == 1 ) {
+                qnode_p = qnode_p.getParent();
+            }
+            while ( qnode_pp.getNumberOfDescendants() == 1 ) {
+                qnode_pp = qnode_pp.getParent();
+            }
+            // final List<PhylogenyNode> qnode_ext_nodes = new ArrayList<PhylogenyNode>();
+            final List<String> qnode_ext_nodes_names = new ArrayList<>();
+            for( final PhylogenyNode qnode_ext_node : qnode_pp.getAllExternalDescendants() ) {
+                final String name = qnode_ext_node.getName();
+                if ( ForesterUtil.isEmptyTrimmed( name ) ) {
+                    throw new IllegalArgumentException( "external node(s) with empty names found" );
+                }
+                final Matcher m = query.matcher( name );
+                if ( !m.find() ) {
+                    qnode_ext_nodes_names.add( name );
+                }
+            }
+            final int lec_ext_nodes = qnode_ext_nodes_names.size();
+            final int p_ext_nodes = p.getNumberOfExternalNodes() - 1;
+            final String greatest_common_prefix = ForesterUtil.greatestCommonPrefix( qnode_ext_nodes_names, separator );
+            System.out.println( greatest_common_prefix );
+            Matcher matcher = query.matcher( qnode.getName() );
+            String conf_str = null;
+            if ( matcher.find() ) {
+                conf_str = matcher.group( 1 );
+            }
+            else {
+                throw new IllegalStateException( "pattern did not match -- this should have never happened!" );
+            }
+            res.setLeastEncompassingCladeSize( lec_ext_nodes );
+            res.setTreeSize( p_ext_nodes );
+            final double conf = Double.parseDouble( conf_str );
+            if ( !ForesterUtil.isEmpty( greatest_common_prefix ) ) {
+                res.addGreatestCommonPrefix( greatest_common_prefix, conf );
+            }
+            else {
+                res.addGreatestCommonPrefix( "?", conf );
+            }
         }
-        final List<PhylogenyNode> qnode_ext_nodes = qnode_pp.getAllExternalDescendants();
-        final int lec_ext_nodes = qnode_ext_nodes.size() - 1;
-        final int p_ext_nodes = p.getNumberOfExternalNodes() - 1;
-        final List<String> qnode_ext_nodes_names = new ArrayList<>();
-        for( final PhylogenyNode qnode_ext_node : qnode_ext_nodes ) {
+        /* for( final PhylogenyNode qnode_ext_node : qnode_ext_nodes ) {
             String name = qnode_ext_node.getName();
             if ( ForesterUtil.isEmptyTrimmed( name ) ) {
                 throw new IllegalArgumentException( "external node(s) with empty names found" );
@@ -72,29 +107,23 @@ public final class Analysis2 {
             if ( !name.equals( query ) ) {
                 qnode_ext_nodes_names.add( name );
             }
-        }
-        final String greatest_common_prefix = ForesterUtil.greatestCommonPrefix( qnode_ext_nodes_names, separator );
-        final Result2 res = new Result2();
-        if ( greatest_common_prefix.length() < 1 ) {
-            res.addWarning( "No greatest common prefix" );
-            //res.setGreatestCommonPrefix( "" );
-        }
-        else {
-          //  res.setGreatestCommonPrefix( greatest_common_prefix );
-           // res.addGreatestCommonPrefix( prefix, confidence, separator ); //TODO
-        }
-        if ( qnode_pp.isRoot() ) {
-            res.addWarning( "Least Encompassing Clade is entire tree" );
-        }
-        res.setLeastEncompassingCladeSize( lec_ext_nodes );
-        res.setTreeSize( p_ext_nodes );
-       
-        final String conf = obtainConfidence( qnode_pp );
+        }*/
+        //   if ( greatest_common_prefix.length() < 1 ) {
+        //       res.addWarning( "No greatest common prefix" );
+        //res.setGreatestCommonPrefix( "" );
+        //  }
+        // else {
+        //    //  res.setGreatestCommonPrefix( greatest_common_prefix );
+        // res.addGreatestCommonPrefix( prefix, confidence, separator ); //TODO
+        //   }
+        // if ( qnode_pp.isRoot() ) {
+        //     res.addWarning( "Least Encompassing Clade is entire tree" );
+        // }
+        /*    final String conf = obtainConfidence( qnode_pp );
         if ( conf != null ) {
             res.setGreatestCommonCladeSubtreeConfidence(conf);
-        }
-        
-        final String greatest_common_prefix_up[] = analyzeSiblings( qnode_p, qnode_pp, separator );
+        }*/
+        /*  final String greatest_common_prefix_up[] = analyzeSiblings( qnode_p, qnode_pp, separator );
         res.setGreatestCommonPrefixUp( greatest_common_prefix_up[ 0 ] );
         if ( greatest_common_prefix_up[ 1 ] != null ) {
             res.setGreatestCommonCladeUpSubtreeConfidence( greatest_common_prefix_up[ 1 ] );
@@ -103,12 +132,10 @@ public final class Analysis2 {
         res.setGreatestCommonPrefixDown( greatest_common_prefix_down[ 0 ] );
         if ( greatest_common_prefix_down[ 1 ] != null ) {
             res.setGreatestCommonCladeDownSubtreeConfidence( greatest_common_prefix_down[ 1 ] );
-        }
+        }*/
         return res;
     }
 
-   
-
     private final static String[] analyzeSiblings( final PhylogenyNode child,
                                                    final PhylogenyNode parent,
                                                    final String separator ) {
@@ -134,7 +161,7 @@ public final class Analysis2 {
         final String greatest_common_prefix = ForesterUtil.greatestCommonPrefix( ext_nodes_names, separator );
         return new String[] { greatest_common_prefix, conf };
     }
-    
+
     private final static String obtainConfidence( final PhylogenyNode n ) {
         if ( n.getBranchData().getConfidences() != null && n.getBranchData().getConfidences().size() > 0 ) {
             final List<Confidence> confidences = n.getBranchData().getConfidences();
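diff --git a/forester/java/src/org/forester/clade_analysis/CladeAnalysisTest.java b/forester/java/src/org/forester/clade_analysis/CladeAnalysisTest.java
index 3d1f4b9..207b57c 100644 (file)
--- a/forester/java/src/org/forester/clade_analysis/CladeAnalysisTest.java
+++ b/forester/java/src/org/forester/clade_analysis/CladeAnalysisTest.java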
@@ -3,6 +3,7 @@ package org.forester.clade_analysis;
 
 import java.io.File;
 import java.util.List;
+import java.util.regex.Pattern;
 
 import org.forester.io.parsers.PhylogenyParser;
 import org.forester.io.parsers.util.ParserUtils;
@@ -30,6 +31,10 @@ public class CladeAnalysisTest {
             System.out.println( "Clade analysis 3 failed" );
             failed = true;
         }
+        if ( !testCladeAnalysis4() ) {
+            System.out.println( "Clade analysis 4 failed" );
+            failed = true;
+        }
         if ( !failed ) {
             System.out.println( "OK" );
         }
@@ -45,6 +50,9 @@ public class CladeAnalysisTest {
         if ( !testCladeAnalysis3() ) {
             return false;
         }
+        if ( !testCladeAnalysis4() ) {
+            return false;
+        }
         return true;
     }
 
@@ -711,4 +719,46 @@ public class CladeAnalysisTest {
         }
         return true;
     }
+    
+    private static boolean testCladeAnalysis4() {
+        try {
+            final File intreefile1 = new File( PATH_TO_TEST_DATA + "pplacer_2.tre" );
+            final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
+            final PhylogenyParser pp = ParserUtils.createParserDependingOnFileType( intreefile1, true );
+            final Phylogeny p1 = factory.create( intreefile1, pp )[ 0 ];
+            Pattern query = Pattern.compile(".+#\\d+_M=(.+)"); // group 1 = text after "_M=", which Analysis2.execute parses as the confidence
+            Result2 res = Analysis2.execute( p1, query, "." );
+            
+            res.analyzeGreatestCommonPrefixes( 0.3 );
+            System.out.print( res.toString());
+            System.out.println( "------------------------- ");
+            System.out.println();
+            // NOTE: every check below is commented out, so this test currently fails only if an exception is thrown.
+           // Result res = Analysis.execute( p1, "A.1.1.1", "." );
+           /* if ( !res.getGreatestCommonPrefix().equals( "A.1" ) ) {
+                return false;
+            }
+            if ( !res.getGreatestCommonPrefixDown().equals( "A.1.1" ) ) {
+                return false;
+            }
+            if ( !res.getGreatestCommonPrefixUp().equals( "A.1.2.1" ) ) {
+                return false;
+            }
+            if ( res.getLeastEncompassingCladeSize() != 4 ) {
+                return false;
+            }
+            if ( res.getTreeSize() != 25 ) {
+                return false;
+            }
+            if ( res.getWarnings().size() != 0 ) {
+                return false;
+            }*/
+          
+        }
+        catch ( final Exception e ) {
+            e.printStackTrace( System.out );
+            return false;
+        }
+        return true;
+    }
 }
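diff --git a/forester/java/src/org/forester/clade_analysis/Result2.java b/forester/java/src/org/forester/clade_analysis/Result2.java
index 81353c3..2bd4911 100644 (file)
--- a/forester/java/src/org/forester/clade_analysis/Result2.java
+++ b/forester/java/src/org/forester/clade_analysis/Result2.java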
@@ -62,6 +62,23 @@ public final class Result2 {
         _separator = ".";//TODO make const somewhere
     }
 
+    public List<Prefix> getAllMultiHitPrefixes() {
+        return _all;
+    }
+    
+    public List<Prefix> getCollapsedMultiHitPrefixes() {
+        return _collapsed;
+    }
+    
+    public List<Prefix> getSpecificMultiHitPrefixes() {
+        return _cleaned_spec;
+    }
+    
+    public boolean isHasSpecificMultiHitsPrefixes() {
+        return _has_specifics;
+    }
+    
+    
     void addWarning( final String warning ) {
         _warnings.add( warning );
     }
@@ -201,7 +218,7 @@ public final class Result2 {
                 confidence_sum += prefix.getConfidence();
             }
         }
-        if ( !ForesterUtil.isEqual( confidence_sum, 1.0 ) ) {
+        if ( !ForesterUtil.isEqual( confidence_sum, 1.0, 1E-5 ) ) {
             throw new IllegalArgumentException( "Confidences add up to " + confidence_sum + " instead of 1.0" );
         }
         return collapsed;
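diff --git a/forester/java/src/org/forester/util/ForesterUtil.java b/forester/java/src/org/forester/util/ForesterUtil.java
index 3d9888b..e453dad 100644 (file)
--- a/forester/java/src/org/forester/util/ForesterUtil.java
+++ b/forester/java/src/org/forester/util/ForesterUtil.java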
@@ -621,6 +621,10 @@ public final class ForesterUtil {
     final public static boolean isEqual( final double a, final double b ) {
         return ( ( Math.abs( a - b ) ) < ZERO_DIFF );
     }
+    
+    final public static boolean isEqual( final double a, final double b, final double tolerance ) {
+        return ( ( Math.abs( a - b ) ) < tolerance );
+    }
 
     final public static boolean isEven( final int n ) {
         return ( n % 2 ) == 0;
diff --git a/forester/test_data/pplacer_2.tre b/forester/test_data/pplacer_2.tre
new file mode 100644 (file)
index 0000000..f28c394
--- /dev/null
+++ b/forester/test_data/pplacer_2.tre
@@ -0,0 +1,148 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<phyloxml xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.phyloxml.org http://www.phyloxml.org/1.20/phyloxml.xsd" xmlns="http://www.phyloxml.org">
+<phylogeny rooted="true" rerootable="true">
+  <clade>
+    <branch_length>0.0</branch_length>
+    <clade>
+      <branch_length>0.0679195</branch_length>
+      <clade>
+        <branch_length>0.21174</branch_length>
+        <clade>
+          <branch_length>0.477305</branch_length>
+          <clade>
+            <branch_length>0.309716</branch_length>
+            <clade>
+              <branch_length>0.0152436</branch_length>
+              <clade>
+                <branch_length>0.0857918</branch_length>
+                <clade>
+                  <branch_length>0.162176</branch_length>
+                  <clade>
+                    <branch_length>9.756E-6</branch_length>
+                    <clade>
+                      <branch_length>0.0802987</branch_length>
+                      <clade>
+                        <branch_length>0.0684959</branch_length>
+                        <clade>
+                          <branch_length>0.0761231</branch_length>
+                          <clade>
+                            <branch_length>0.107021</branch_length>
+                            <clade>
+                              <branch_length>1.14092</branch_length>
+                              <clade>
+                                <name>A.1.1.1</name>
+                                <branch_length>1.0E-6</branch_length>
+                              </clade>
+                              <clade>
+                                <name>A.1.1.2</name>
+                                <branch_length>0.043972</branch_length>
+                              </clade>
+                            </clade>
+                            <clade>
+                              <name>CED9_CAEBR_#5_M=0.0277996</name>
+                              <branch_length>1.49689</branch_length>
+                            </clade>
+                          </clade>
+                          <clade>
+                            <name>A.1.1.3</name>
+                            <branch_length>1.11622</branch_length>
+                          </clade>
+                        </clade>
+                        <clade>
+                          <name>CED9_CAEBR_#6_M=0.0273544</name>
+                          <branch_length>1.58319</branch_length>
+                        </clade>
+                      </clade>
+                      <clade>
+                        <branch_length>0.760242</branch_length>
+                        <clade>
+                          <name>A.1.2.1</name>
+                          <branch_length>0.130667</branch_length>
+                        </clade>
+                        <clade>
+                          <name>A.1.2.2</name>
+                          <branch_length>0.127953</branch_length>
+                        </clade>
+                      </clade>
+                    </clade>
+                    <clade>
+                      <name>CED9_CAEBR_#4_M=0.0552666</name>
+                      <branch_length>1.60222</branch_length>
+                    </clade>
+                  </clade>
+                  <clade>
+                    <branch_length>7.591E-6</branch_length>
+                    <clade>
+                      <name>A.2.1.1</name>
+                      <branch_length>1.00994</branch_length>
+                    </clade>
+                    <clade>
+                      <name>CED9_CAEBR_#3_M=0.0552703</name>
+                      <branch_length>1.60221</branch_length>
+                    </clade>
+                  </clade>
+                </clade>
+                <clade>
+                  <name>CED9_CAEBR_#0_M=0.380211</name>
+                  <branch_length>1.54796</branch_length>
+                </clade>
+              </clade>
+              <clade>
+                <branch_length>1.257E-5</branch_length>
+                <clade>
+                  <branch_length>1.11517</branch_length>
+                  <clade>
+                    <branch_length>0.0852309</branch_length>
+                    <clade>
+                      <name>A.3.1.1</name>
+                      <branch_length>0.022644</branch_length>
+                    </clade>
+                    <clade>
+                      <name>A.3.1.1</name>
+                      <branch_length>0.017626</branch_length>
+                    </clade>
+                  </clade>
+                  <clade>
+                    <name>A.3.2.1</name>
+                    <branch_length>0.156409</branch_length>
+                  </clade>
+                </clade>
+                <clade>
+                  <name>CED9_CAEBR_#2_M=0.224819</name>
+                  <branch_length>1.56994</branch_length>
+                </clade>
+              </clade>
+            </clade>
+            <clade>
+              <name>CED9_CAEBR_#1_M=0.229279</name>
+              <branch_length>1.56987</branch_length>
+            </clade>
+          </clade>
+          <clade>
+            <name>C.5</name>
+            <branch_length>0.367867</branch_length>
+            <taxonomy>
+            </taxonomy>
+          </clade>
+        </clade>
+        <clade>
+          <name>A.6</name>
+          <branch_length>0.030507</branch_length>
+        </clade>
+      </clade>
+      <clade>
+        <name>A.7</name>
+        <branch_length>0.026535</branch_length>
+      </clade>
+    </clade>
+    <clade>
+      <name>A.8</name>
+      <branch_length>0.035019</branch_length>
+    </clade>
+    <clade>
+      <name>B.9</name>
+      <branch_length>1.0E-6</branch_length>
+    </clade>
+  </clade>
+</phylogeny>
+</phyloxml>
\ No newline at end of file