in progress...
authorcmzmasek <chris.zma@outlook.com>
Wed, 30 Aug 2017 17:48:53 +0000 (10:48 -0700)
committercmzmasek <chris.zma@outlook.com>
Wed, 30 Aug 2017 17:48:53 +0000 (10:48 -0700)
forester/java/src/org/forester/application/cladinator.java
forester/java/src/org/forester/application/mcc.java
forester/java/src/org/forester/application/msa_compactor.java
forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java
forester/java/src/org/forester/clade_analysis/CladeAnalysisDemo.java
forester/java/src/org/forester/evoinference/TestPhylogenyReconstruction.java
forester/java/src/org/forester/io/parsers/GeneralMsaParser.java
forester/java/src/org/forester/sequence/BasicSequence.java
forester/java/src/org/forester/test/Test.java
forester/java/src/org/forester/util/ForesterConstants.java
forester/java/src/org/forester/util/ForesterUtil.java

index daf0224..95322e1 100644 (file)
@@ -31,7 +31,10 @@ import java.text.DecimalFormat;
 import java.util.ArrayList;
 import java.util.List;
 
+import org.forester.clade_analysis.AnalysisMulti;
 import org.forester.clade_analysis.AnalysisSingle;
+import org.forester.clade_analysis.Prefix;
+import org.forester.clade_analysis.ResultMulti;
 import org.forester.clade_analysis.ResultSingle;
 import org.forester.io.parsers.PhylogenyParser;
 import org.forester.io.parsers.util.ParserUtils;
@@ -44,15 +47,15 @@ import org.forester.util.ForesterUtil;
 public final class cladinator {
 
     final static private String        PRG_NAME      = "cladinator";
-    final static private String        PRG_VERSION   = "0.101";
-    final static private String        PRG_DATE      = "170810";
+    final static private String        PRG_VERSION   = "0.100";
+    final static private String        PRG_DATE      = "170823";
     final static private String        PRG_DESC      = "clades within clades -- analysis of pplacer type outputs";
     final static private String        E_MAIL        = "phyloxml@gmail.com";
     final static private String        WWW           = "https://sites.google.com/site/cmzmasek/home/software/forester";
     final static private String        HELP_OPTION_1 = "help";
     final static private String        HELP_OPTION_2 = "h";
     final static private String        SEP_OPTION    = "s";
-    private final static DecimalFormat df2           = new DecimalFormat( ".##" );
+    private final static DecimalFormat df2           = new DecimalFormat( "0.0#" );
 
     public static void main( final String args[] ) {
         try {
@@ -115,10 +118,69 @@ public final class cladinator {
                 System.out.println( "\nCould not read \"" + intreefile + "\" [" + e.getMessage() + "]\n" );
                 System.exit( -1 );
             }
-            final ResultSingle res = AnalysisSingle.execute( p, query, separator );
+          
+            final ResultMulti res = AnalysisMulti.execute( p, query, separator, 0.5 );
+            
             System.out.println();
             System.out.println( "Result:" );
             System.out.println( "Query                        : " + query );
+            
+            ///////////////////
+            
+         
+         
+            System.out.println( "Collapsed:" );
+          
+              for( final Prefix prefix : res.getCollapsedMultiHitPrefixes() ) {
+                  System.out.println( prefix );
+              }
+              if ( _has_specifics ) {
+                 
+                  System.out.println( "Specifics:" );
+                 
+                  for( final Prefix prefix : _cleaned_spec ) {
+                      System.out.println( prefix );
+                     
+                  }
+                  
+                  System.out.println( "Collapsed With Specifics:" );
+                 
+                  for( final Prefix prefix : _collapsed ) {
+                      System.out.println( prefix );
+                      
+                      for( final Prefix spec : _cleaned_spec ) {
+                          if ( spec.getPrefix().startsWith( prefix.getPrefix() ) ) {
+                              System.out.println( "    " + spec );
+                             
+                          }
+                      }
+                  }
+              }
+              if ( !ForesterUtil.isEmpty( _all_down ) ) {
+                  
+                  System.out.println( "Collapsed Down:" );
+                  
+                  for( final Prefix prefix : _collapsed_down ) {
+                      System.out.println( prefix );
+                      
+                  }
+              
+              }
+              if ( !ForesterUtil.isEmpty( _all_up ) ) {
+                  
+           
+                  System.out.println( "Collapsed Up:" );
+                 
+                  for( final Prefix prefix : _collapsed_up ) {
+                      System.out.println( prefix );
+                     
+                  }
+             
+              }
+            
+            ///////////////////
+            
+            
             System.out.print( "Greatest Common Prefix       : " + res.getGreatestCommonPrefix() );
             if ( !ForesterUtil.isEmpty( res.getGreatestCommonPrefix() )
                     && !ForesterUtil.isEmpty( res.getGreatestCommonCladeSubtreeConfidence() ) ) {
index 23320fc..9f28424 100644 (file)
@@ -83,7 +83,7 @@ public class mcc {
                 msa = FastaParser.parseMsa( is );
             }
             else {
-                msa = GeneralMsaParser.parse( is );
+                msa = GeneralMsaParser.parseMsa( is );
             }
             if ( cla.isOptionSet( FROM_OPTION ) ) {
                 singleCalc( in, from, to, msa );
index e555593..bcfe2a6 100644 (file)
@@ -142,7 +142,7 @@ public class msa_compactor {
                 msa = DeleteableMsa.createInstance( FastaParser.parseMsa( is ) );
             }
             else {
-                msa = DeleteableMsa.createInstance( GeneralMsaParser.parse( is ) );
+                msa = DeleteableMsa.createInstance( GeneralMsaParser.parseMsa( is ) );
             }
             final DescriptiveStatistics initial_msa_stats = MsaMethods.calculateEffectiveLengthStatistics( msa );
             if (cla.isOptionSet( INFO_ONLY_OPTION ) ) {
index 9488a1c..cb5b8de 100644 (file)
@@ -515,7 +515,7 @@ public final class MainFrameApplication extends MainFrame {
                     msa = FastaParser.parseMsa( is );
                 }
                 else {
-                    msa = GeneralMsaParser.parse( is );
+                    msa = GeneralMsaParser.parseMsa( is );
                 }
             }
             catch ( final MsaFormatException e ) {
index 60e2dfb..f791f72 100644 (file)
@@ -106,7 +106,7 @@ public class CladeAnalysisDemo {
             final Phylogeny p1 = factory.create( in, pp )[ 0 ];
             ResultMulti res = AnalysisMulti.execute( p1, 0.5 );
             
-            System.out.println( "DEMO 1:" );
+            System.out.println( "DEMO 2:" );
             System.out.println( "+++++++" );
             System.out.print( res.toString() );
             System.out.println( "------------------------- " );
index f1074f4..5b7b99c 100644 (file)
@@ -437,7 +437,7 @@ public class TestPhylogenyReconstruction {
 
     private static boolean testDistanceCalculationMethods( final File test_dir ) {
         try {
-            final Msa msa0 = GeneralMsaParser.parse( new FileInputStream( test_dir + ForesterUtil.FILE_SEPARATOR
+            final Msa msa0 = GeneralMsaParser.parseMsa( new FileInputStream( test_dir + ForesterUtil.FILE_SEPARATOR
                     + "bcl.aln" ) );
             final BasicSymmetricalDistanceMatrix pwd0 = PairwiseDistanceCalculator.calcKimuraDistances( msa0 );
             if ( pwd0.getSize() != 120 ) {
index f4db6cb..4fa6e81 100644 (file)
@@ -68,7 +68,12 @@ public final class GeneralMsaParser {
                 .matcher( line ).lookingAt() );
     }
 
-    static public Msa parse( final InputStream is ) throws IOException {
+    static final public Msa parseMsa( final InputStream is ) throws IOException { // parses "is" into an Msa; takes the place of the removed parse( InputStream )
+        final Msa msa = BasicMsa.createInstance( parseSeqs( is )); // parseSeqs does the actual parsing; this method only wraps its sequence list
+        return msa;
+    }
+    
+    static final public List<MolecularSequence> parseSeqs( final InputStream is ) throws IOException {
         int block = -1;
         int current_seq_index_per_block = -1;
         String current_name = null;
@@ -145,7 +150,7 @@ public final class GeneralMsaParser {
                                 name = names_in_order.get( current_seq_index_per_block );
                             }
                             catch ( final IndexOutOfBoundsException e ) {
-                                throw new MsaFormatException( "illegalmsa format (line: " + line_counter + "):\n\""
+                                throw new MsaFormatException( "illegal msa format (line: " + line_counter + "):\n\""
                                         + trim( line ) + "\"" );
                             }
                             if ( temp_msa.containsKey( name ) ) {
@@ -173,8 +178,8 @@ public final class GeneralMsaParser {
             seqs.add( BasicSequence.createAaSequence( names_in_order.get( i ), temp_msa.get( names_in_order.get( i ) )
                                                       .toString() ) );
         }
-        final Msa msa = BasicMsa.createInstance( seqs );
-        return msa;
+      
+        return seqs;
     }
 
     private static String trim( final String line ) {
index 6d339ca..2540cbe 100644 (file)
@@ -147,6 +147,30 @@ public class BasicSequence implements MolecularSequence {
         return new BasicSequence( new String( seq.getIdentifier() ), s, seq.getType() );
     }
 
+    public static MolecularSequence createSequence( final String identifier, final String mol_sequence ) { // like createAaSequence, but the type is guessed from mol_sequence
+        check( identifier, mol_sequence ); // helper not shown in this hunk; presumably rejects invalid arguments -- confirm
+        final TYPE type = ForesterUtil.guessMolecularSequenceType( mol_sequence );
+        final String re; // pattern whose matches get replaced below; chosen by the guessed type
+        final char repl; // character substituted for every match of re
+        if ( type == TYPE.AA ) {
+            re = AA_REGEXP;
+            repl = UNSPECIFIED_AA;
+        }
+        else if ( type == TYPE.DNA ) {
+            re = DNA_REGEXP;
+            repl = UNSPECIFIED_NUC;
+        }
+        else if ( type == TYPE.RNA ) {
+            re = RNA_REGEXP;
+            repl = UNSPECIFIED_NUC; // DNA and RNA use the same replacement character
+        }
+        else {
+            throw new IllegalArgumentException( "could not determine sequence type for: " + mol_sequence); // any other guess result (including null) is rejected
+        }
+        return new BasicSequence( identifier, mol_sequence.toUpperCase().replaceAll( "\\.", GAP_STR ) // upper-case first, then turn every literal '.' into GAP_STR
+                                  .replaceAll( re, Character.toString( repl ) ), type ); // finally substitute repl for every match of re
+    }
+    
     public static MolecularSequence createAaSequence( final String identifier, final String mol_sequence ) {
         check( identifier, mol_sequence );
         return new BasicSequence( identifier, mol_sequence.toUpperCase().replaceAll( "\\.", GAP_STR )
index 1a8a216..9cda7b7 100644 (file)
@@ -5627,13 +5627,13 @@ public final class Test {
     private static boolean testGeneralMsaParser() {
         try {
             final String msa_str_0 = "seq1 abcd\n\nseq2 efgh\n";
-            final Msa msa_0 = GeneralMsaParser.parse( new ByteArrayInputStream( msa_str_0.getBytes() ) );
+            final Msa msa_0 = GeneralMsaParser.parseMsa( new ByteArrayInputStream( msa_str_0.getBytes() ) );
             final String msa_str_1 = "seq1 abc\nseq2 ghi\nseq1 def\nseq2 jkm\n";
-            final Msa msa_1 = GeneralMsaParser.parse( new ByteArrayInputStream( msa_str_1.getBytes() ) );
+            final Msa msa_1 = GeneralMsaParser.parseMsa( new ByteArrayInputStream( msa_str_1.getBytes() ) );
             final String msa_str_2 = "seq1 abc\nseq2 ghi\n\ndef\njkm\n";
-            final Msa msa_2 = GeneralMsaParser.parse( new ByteArrayInputStream( msa_str_2.getBytes() ) );
+            final Msa msa_2 = GeneralMsaParser.parseMsa( new ByteArrayInputStream( msa_str_2.getBytes() ) );
             final String msa_str_3 = "seq1 abc\n def\nseq2 ghi\n jkm\n";
-            final Msa msa_3 = GeneralMsaParser.parse( new ByteArrayInputStream( msa_str_3.getBytes() ) );
+            final Msa msa_3 = GeneralMsaParser.parseMsa( new ByteArrayInputStream( msa_str_3.getBytes() ) );
             if ( !msa_1.getSequenceAsString( 0 ).toString().equalsIgnoreCase( "abcdef" ) ) {
                 return false;
             }
@@ -5670,7 +5670,7 @@ public final class Test {
             if ( !msa_3.getIdentifier( 1 ).toString().equals( "seq2" ) ) {
                 return false;
             }
-            final Msa msa_4 = GeneralMsaParser.parse( new FileInputStream( PATH_TO_TEST_DATA + "msa_1.txt" ) );
+            final Msa msa_4 = GeneralMsaParser.parseMsa( new FileInputStream( PATH_TO_TEST_DATA + "msa_1.txt" ) );
             if ( !msa_4.getSequenceAsString( 0 ).toString().equalsIgnoreCase( "abcdefeeeeeeeexx" ) ) {
                 return false;
             }
@@ -5680,7 +5680,7 @@ public final class Test {
             if ( !msa_4.getSequenceAsString( 2 ).toString().equalsIgnoreCase( "klmnxphhhhhhhhzz" ) ) {
                 return false;
             }
-            final Msa msa_5 = GeneralMsaParser.parse( new FileInputStream( PATH_TO_TEST_DATA + "msa_2.txt" ) );
+            final Msa msa_5 = GeneralMsaParser.parseMsa( new FileInputStream( PATH_TO_TEST_DATA + "msa_2.txt" ) );
             if ( !msa_5.getSequenceAsString( 0 ).toString().equalsIgnoreCase( "abcdefxx" ) ) {
                 return false;
             }
@@ -5690,7 +5690,7 @@ public final class Test {
             if ( !msa_5.getSequenceAsString( 2 ).toString().equalsIgnoreCase( "klmnxpzz" ) ) {
                 return false;
             }
-            final Msa msa_6 = GeneralMsaParser.parse( new FileInputStream( PATH_TO_TEST_DATA + "msa_3.txt" ) );
+            final Msa msa_6 = GeneralMsaParser.parseMsa( new FileInputStream( PATH_TO_TEST_DATA + "msa_3.txt" ) );
             if ( !msa_6.getSequenceAsString( 0 ).toString().equalsIgnoreCase( "abcdefeeeeeeeexx" ) ) {
                 return false;
             }
index 9f4a241..727f30c 100644 (file)
@@ -27,22 +27,25 @@ package org.forester.util;
 
 public final class ForesterConstants {
 
-    public final static String  FORESTER_VERSION            = "1.045";
-    public final static String  FORESTER_DATE               = "161214";
-    public final static String  PHYLO_XML_VERSION           = "1.20";
-    public final static String  PHYLO_XML_LOCATION          = "http://www.phyloxml.org";
-    public final static String  PHYLO_XML_XSD               = "phyloxml.xsd";
-    public final static String  XML_SCHEMA_INSTANCE         = "http://www.w3.org/2001/XMLSchema-instance";
-    public final static String  LOCAL_PHYLOXML_XSD_RESOURCE = "resources/phyloxml.xsd";
-    public final static String  PHYLO_XML_SUFFIX            = ".xml";
-    public final static String  UTF_8 = "UTF-8";
-    public final static String  ISO_8859_1 = "ISO-8859-1";
-    public final static String  PHYLO_XML_REFERENCE         = "Han MV and Zmasek CM (2009): \"phyloXML: XML for evolutionary biology and comparative genomics\", BMC Bioinformatics 10:356";
-    public final static boolean RELEASE                     = false;
+    public final static String  FORESTER_VERSION                = "1.045";
+    public final static String  FORESTER_DATE                   = "161214";
+    public final static String  PHYLO_XML_VERSION               = "1.20";
+    public final static String  PHYLO_XML_LOCATION              = "http://www.phyloxml.org";
+    public final static String  PHYLO_XML_XSD                   = "phyloxml.xsd";
+    public final static String  XML_SCHEMA_INSTANCE             = "http://www.w3.org/2001/XMLSchema-instance";
+    public final static String  LOCAL_PHYLOXML_XSD_RESOURCE     = "resources/phyloxml.xsd";
+    public final static String  PHYLO_XML_SUFFIX                = ".xml";
+    public final static String  ID_NORMALIZED_FASTA_FILE_SUFFIX = "_ni.fasta";
+    public final static String  ID_MAP_FILE_SUFFIX              = ".nim";
+    public final static String  UTF_8                           = "UTF-8";
+    public final static String  ISO_8859_1                      = "ISO-8859-1";
+    public final static String  PHYLO_XML_REFERENCE             = "Han MV and Zmasek CM (2009): \"phyloXML: XML for evolutionary biology and comparative genomics\", BMC Bioinformatics 10:356";
+    public final static boolean RELEASE                         = false;
 
     public enum PhylogeneticTreeFormats {
-        NH, NHX, NEXUS, PHYLOXML
+                                         NH,
+                                         NHX,
+                                         NEXUS,
+                                         PHYLOXML
     }
-
-  
 }
index c254c8b..dddbf28 100644 (file)
@@ -621,7 +621,7 @@ public final class ForesterUtil {
     final public static boolean isEqual( final double a, final double b ) { // true when a and b differ by less than ZERO_DIFF
         return ( ( Math.abs( a - b ) ) < ZERO_DIFF ); // strict "<": a difference of exactly ZERO_DIFF does not count as equal
     }
-    
+
     final public static boolean isEqual( final double a, final double b, final double tolerance ) { // same comparison, with a caller-supplied tolerance
         return ( ( Math.abs( a - b ) ) < tolerance ); // strict "<": with tolerance 0 even identical values yield false
     }
@@ -1664,9 +1664,9 @@ public final class ForesterUtil {
     private ForesterUtil() {
     }
 
-    public static List<String> spliIntoPrefixes(final String prefix, final String separator ) {
+    public static List<String> spliIntoPrefixes( final String prefix, final String separator ) {
         final String[] a = prefix.split( Pattern.quote( separator ) );
-        final List<String> l= new ArrayList<String>();
+        final List<String> l = new ArrayList<String>();
         for( int i = 0; i < a.length; ++i ) {
             final StringBuilder sb = new StringBuilder();
             for( int j = 0; j <= i; ++j ) {
@@ -1675,9 +1675,15 @@ public final class ForesterUtil {
                     sb.append( separator );
                 }
             }
-          //  System.out.println( sb.toString() );
-            l.add( sb.toString());
+            //  System.out.println( sb.toString() );
+            l.add( sb.toString() );
         }
         return l;
     }
+
+    // True only if the first line of "file", trimmed, starts with ">" and isEmptyTrimmed does not flag it as empty.
+    public static boolean isLooksLikeFasta( final File file ) throws IOException {
+        final String first_line = ForesterUtil.getFirstLine( file ).trim().toLowerCase(); // NOTE(review): throws NullPointerException if getFirstLine returns null -- confirm it cannot
+        return ( ( !isEmptyTrimmed( first_line ) && first_line.trim().startsWith( ">" ) ) ); // first_line is already trimmed above, so this trim() is redundant
+    }
 }