From 5bae4861f59350b3158d6ebd5034ea7698d81b98 Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Wed, 30 Aug 2017 10:48:53 -0700 Subject: [PATCH] in progress... --- .../src/org/forester/application/cladinator.java | 70 ++++++++++++++++++-- .../java/src/org/forester/application/mcc.java | 2 +- .../org/forester/application/msa_compactor.java | 2 +- .../archaeopteryx/MainFrameApplication.java | 2 +- .../forester/clade_analysis/CladeAnalysisDemo.java | 2 +- .../evoinference/TestPhylogenyReconstruction.java | 2 +- .../org/forester/io/parsers/GeneralMsaParser.java | 13 ++-- .../src/org/forester/sequence/BasicSequence.java | 24 +++++++ forester/java/src/org/forester/test/Test.java | 14 ++-- .../src/org/forester/util/ForesterConstants.java | 33 ++++----- .../java/src/org/forester/util/ForesterUtil.java | 16 +++-- 11 files changed, 140 insertions(+), 40 deletions(-) diff --git a/forester/java/src/org/forester/application/cladinator.java b/forester/java/src/org/forester/application/cladinator.java index daf0224..95322e1 100644 --- a/forester/java/src/org/forester/application/cladinator.java +++ b/forester/java/src/org/forester/application/cladinator.java @@ -31,7 +31,10 @@ import java.text.DecimalFormat; import java.util.ArrayList; import java.util.List; +import org.forester.clade_analysis.AnalysisMulti; import org.forester.clade_analysis.AnalysisSingle; +import org.forester.clade_analysis.Prefix; +import org.forester.clade_analysis.ResultMulti; import org.forester.clade_analysis.ResultSingle; import org.forester.io.parsers.PhylogenyParser; import org.forester.io.parsers.util.ParserUtils; @@ -44,15 +47,15 @@ import org.forester.util.ForesterUtil; public final class cladinator { final static private String PRG_NAME = "cladinator"; - final static private String PRG_VERSION = "0.101"; - final static private String PRG_DATE = "170810"; + final static private String PRG_VERSION = "0.100"; + final static private String PRG_DATE = "170823"; final static private String PRG_DESC = "clades within clades -- analysis of pplacer type outputs"; final static private String E_MAIL = "phyloxml@gmail.com"; final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester"; final static private String HELP_OPTION_1 = "help"; final static private String HELP_OPTION_2 = "h"; final static private String SEP_OPTION = "s"; - private final static DecimalFormat df2 = new DecimalFormat( ".##" ); + private final static DecimalFormat df2 = new DecimalFormat( "0.0#" ); public static void main( final String args[] ) { try { @@ -115,10 +118,69 @@ public final class cladinator { System.out.println( "\nCould not read \"" + intreefile + "\" [" + e.getMessage() + "]\n" ); System.exit( -1 ); } - final ResultSingle res = AnalysisSingle.execute( p, query, separator ); + + final ResultMulti res = AnalysisMulti.execute( p, query, separator, 0.5 ); + System.out.println(); System.out.println( "Result:" ); System.out.println( "Query : " + query ); + + /////////////////// + + + + System.out.println( "Collapsed:" ); + + for( final Prefix prefix : res.getCollapsedMultiHitPrefixes() ) { + System.out.println( prefix ); + } + if ( _has_specifics ) { + + System.out.println( "Specifics:" ); + + for( final Prefix prefix : _cleaned_spec ) { + System.out.println( prefix ); + + } + + System.out.println( "Collapsed With Specifics:" ); + + for( final Prefix prefix : _collapsed ) { + System.out.println( prefix ); + + for( final Prefix spec : _cleaned_spec ) { + if ( spec.getPrefix().startsWith( prefix.getPrefix() ) ) { + System.out.println( " " + spec ); + + } + } + } + } + if ( !ForesterUtil.isEmpty( _all_down ) ) { + + System.out.println( "Collapsed Down:" ); + + for( final Prefix prefix : _collapsed_down ) { + System.out.println( prefix ); + + } + + } + if ( !ForesterUtil.isEmpty( _all_up ) ) { + + + System.out.println( "Collapsed Up:" ); + + for( final Prefix prefix : _collapsed_up ) { + System.out.println( prefix ); + + } + + } + + /////////////////// + + System.out.print( "Greatest Common Prefix : " + res.getGreatestCommonPrefix() ); if ( !ForesterUtil.isEmpty( res.getGreatestCommonPrefix() ) && !ForesterUtil.isEmpty( res.getGreatestCommonCladeSubtreeConfidence() ) ) { diff --git a/forester/java/src/org/forester/application/mcc.java b/forester/java/src/org/forester/application/mcc.java index 23320fc..9f28424 100644 --- a/forester/java/src/org/forester/application/mcc.java +++ b/forester/java/src/org/forester/application/mcc.java @@ -83,7 +83,7 @@ public class mcc { msa = FastaParser.parseMsa( is ); } else { - msa = GeneralMsaParser.parse( is ); + msa = GeneralMsaParser.parseMsa( is ); } if ( cla.isOptionSet( FROM_OPTION ) ) { singleCalc( in, from, to, msa ); diff --git a/forester/java/src/org/forester/application/msa_compactor.java b/forester/java/src/org/forester/application/msa_compactor.java index e555593..bcfe2a6 100644 --- a/forester/java/src/org/forester/application/msa_compactor.java +++ b/forester/java/src/org/forester/application/msa_compactor.java @@ -142,7 +142,7 @@ public class msa_compactor { msa = DeleteableMsa.createInstance( FastaParser.parseMsa( is ) ); } else { - msa = DeleteableMsa.createInstance( GeneralMsaParser.parse( is ) ); + msa = DeleteableMsa.createInstance( GeneralMsaParser.parseMsa( is ) ); } final DescriptiveStatistics initial_msa_stats = MsaMethods.calculateEffectiveLengthStatistics( msa ); if (cla.isOptionSet( INFO_ONLY_OPTION ) ) { diff --git a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java index 9488a1c..cb5b8de 100644 --- a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java +++ b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java @@ -515,7 +515,7 @@ public final class MainFrameApplication extends MainFrame { msa = FastaParser.parseMsa( is ); } else { - msa = GeneralMsaParser.parse( is ); + msa = GeneralMsaParser.parseMsa( is ); } } catch ( final MsaFormatException e ) { diff --git a/forester/java/src/org/forester/clade_analysis/CladeAnalysisDemo.java b/forester/java/src/org/forester/clade_analysis/CladeAnalysisDemo.java index 60e2dfb..f791f72 100644 --- a/forester/java/src/org/forester/clade_analysis/CladeAnalysisDemo.java +++ b/forester/java/src/org/forester/clade_analysis/CladeAnalysisDemo.java @@ -106,7 +106,7 @@ public class CladeAnalysisDemo { final Phylogeny p1 = factory.create( in, pp )[ 0 ]; ResultMulti res = AnalysisMulti.execute( p1, 0.5 ); - System.out.println( "DEMO 1:" ); + System.out.println( "DEMO 2:" ); System.out.println( "+++++++" ); System.out.print( res.toString() ); System.out.println( "------------------------- " ); diff --git a/forester/java/src/org/forester/evoinference/TestPhylogenyReconstruction.java b/forester/java/src/org/forester/evoinference/TestPhylogenyReconstruction.java index f1074f4..5b7b99c 100644 --- a/forester/java/src/org/forester/evoinference/TestPhylogenyReconstruction.java +++ b/forester/java/src/org/forester/evoinference/TestPhylogenyReconstruction.java @@ -437,7 +437,7 @@ public class TestPhylogenyReconstruction { private static boolean testDistanceCalculationMethods( final File test_dir ) { try { - final Msa msa0 = GeneralMsaParser.parse( new FileInputStream( test_dir + ForesterUtil.FILE_SEPARATOR + final Msa msa0 = GeneralMsaParser.parseMsa( new FileInputStream( test_dir + ForesterUtil.FILE_SEPARATOR + "bcl.aln" ) ); final BasicSymmetricalDistanceMatrix pwd0 = PairwiseDistanceCalculator.calcKimuraDistances( msa0 ); if ( pwd0.getSize() != 120 ) { diff --git a/forester/java/src/org/forester/io/parsers/GeneralMsaParser.java b/forester/java/src/org/forester/io/parsers/GeneralMsaParser.java index f4db6cb..4fa6e81 100644 --- a/forester/java/src/org/forester/io/parsers/GeneralMsaParser.java +++ b/forester/java/src/org/forester/io/parsers/GeneralMsaParser.java @@ -68,7 +68,12 @@ public final class GeneralMsaParser { .matcher( line ).lookingAt() ); } - static public Msa parse( final InputStream is ) throws IOException { + static final public Msa parseMsa( final InputStream is ) throws IOException { + final Msa msa = BasicMsa.createInstance( parseSeqs( is )); + return msa; + } + + static final public List parseSeqs( final InputStream is ) throws IOException { int block = -1; int current_seq_index_per_block = -1; String current_name = null; @@ -145,7 +150,7 @@ public final class GeneralMsaParser { name = names_in_order.get( current_seq_index_per_block ); } catch ( final IndexOutOfBoundsException e ) { - throw new MsaFormatException( "illegalmsa format (line: " + line_counter + "):\n\"" + throw new MsaFormatException( "illegal msa format (line: " + line_counter + "):\n\"" + trim( line ) + "\"" ); } if ( temp_msa.containsKey( name ) ) { @@ -173,8 +178,8 @@ public final class GeneralMsaParser { seqs.add( BasicSequence.createAaSequence( names_in_order.get( i ), temp_msa.get( names_in_order.get( i ) ) .toString() ) ); } - final Msa msa = BasicMsa.createInstance( seqs ); - return msa; + + return seqs; } private static String trim( final String line ) { diff --git a/forester/java/src/org/forester/sequence/BasicSequence.java b/forester/java/src/org/forester/sequence/BasicSequence.java index 6d339ca..2540cbe 100644 --- a/forester/java/src/org/forester/sequence/BasicSequence.java +++ b/forester/java/src/org/forester/sequence/BasicSequence.java @@ -147,6 +147,30 @@ public class BasicSequence implements MolecularSequence { return new BasicSequence( new String( seq.getIdentifier() ), s, seq.getType() ); } + public static MolecularSequence createSequence( final String identifier, final String mol_sequence ) { + check( identifier, mol_sequence ); + final TYPE type = ForesterUtil.guessMolecularSequenceType( mol_sequence ); + final String re; + final char repl; + if ( type == TYPE.AA ) { + re = AA_REGEXP; + repl = UNSPECIFIED_AA; + } + else if ( type == TYPE.DNA ) { + re = DNA_REGEXP; + repl = UNSPECIFIED_NUC; + } + else if ( type == TYPE.RNA ) { + re = RNA_REGEXP; + repl = UNSPECIFIED_NUC; + } + else { + throw new IllegalArgumentException( "could not determine sequence type for: " + mol_sequence); + } + return new BasicSequence( identifier, mol_sequence.toUpperCase().replaceAll( "\\.", GAP_STR ) + .replaceAll( re, Character.toString( repl ) ), type ); + } + public static MolecularSequence createAaSequence( final String identifier, final String mol_sequence ) { check( identifier, mol_sequence ); return new BasicSequence( identifier, mol_sequence.toUpperCase().replaceAll( "\\.", GAP_STR ) diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index 1a8a216..9cda7b7 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -5627,13 +5627,13 @@ public final class Test { private static boolean testGeneralMsaParser() { try { final String msa_str_0 = "seq1 abcd\n\nseq2 efgh\n"; - final Msa msa_0 = GeneralMsaParser.parse( new ByteArrayInputStream( msa_str_0.getBytes() ) ); + final Msa msa_0 = GeneralMsaParser.parseMsa( new ByteArrayInputStream( msa_str_0.getBytes() ) ); final String msa_str_1 = "seq1 abc\nseq2 ghi\nseq1 def\nseq2 jkm\n"; - final Msa msa_1 = GeneralMsaParser.parse( new ByteArrayInputStream( msa_str_1.getBytes() ) ); + final Msa msa_1 = GeneralMsaParser.parseMsa( new ByteArrayInputStream( msa_str_1.getBytes() ) ); final String msa_str_2 = "seq1 abc\nseq2 ghi\n\ndef\njkm\n"; - final Msa msa_2 = GeneralMsaParser.parse( new ByteArrayInputStream( msa_str_2.getBytes() ) ); + final Msa msa_2 = GeneralMsaParser.parseMsa( new ByteArrayInputStream( msa_str_2.getBytes() ) ); final String msa_str_3 = "seq1 abc\n def\nseq2 ghi\n jkm\n"; - final Msa msa_3 = GeneralMsaParser.parse( new ByteArrayInputStream( msa_str_3.getBytes() ) ); + final Msa msa_3 = GeneralMsaParser.parseMsa( new ByteArrayInputStream( msa_str_3.getBytes() ) ); if ( !msa_1.getSequenceAsString( 0 ).toString().equalsIgnoreCase( "abcdef" ) ) { return false; } @@ -5670,7 +5670,7 @@ public final class Test { if ( !msa_3.getIdentifier( 1 ).toString().equals( "seq2" ) ) { return false; } - final Msa msa_4 = GeneralMsaParser.parse( new FileInputStream( PATH_TO_TEST_DATA + "msa_1.txt" ) ); + final Msa msa_4 = GeneralMsaParser.parseMsa( new FileInputStream( PATH_TO_TEST_DATA + "msa_1.txt" ) ); if ( !msa_4.getSequenceAsString( 0 ).toString().equalsIgnoreCase( "abcdefeeeeeeeexx" ) ) { return false; } @@ -5680,7 +5680,7 @@ public final class Test { if ( !msa_4.getSequenceAsString( 2 ).toString().equalsIgnoreCase( "klmnxphhhhhhhhzz" ) ) { return false; } - final Msa msa_5 = GeneralMsaParser.parse( new FileInputStream( PATH_TO_TEST_DATA + "msa_2.txt" ) ); + final Msa msa_5 = GeneralMsaParser.parseMsa( new FileInputStream( PATH_TO_TEST_DATA + "msa_2.txt" ) ); if ( !msa_5.getSequenceAsString( 0 ).toString().equalsIgnoreCase( "abcdefxx" ) ) { return false; } @@ -5690,7 +5690,7 @@ public final class Test { if ( !msa_5.getSequenceAsString( 2 ).toString().equalsIgnoreCase( "klmnxpzz" ) ) { return false; } - final Msa msa_6 = GeneralMsaParser.parse( new FileInputStream( PATH_TO_TEST_DATA + "msa_3.txt" ) ); + final Msa msa_6 = GeneralMsaParser.parseMsa( new FileInputStream( PATH_TO_TEST_DATA + "msa_3.txt" ) ); if ( !msa_6.getSequenceAsString( 0 ).toString().equalsIgnoreCase( "abcdefeeeeeeeexx" ) ) { return false; } diff --git a/forester/java/src/org/forester/util/ForesterConstants.java b/forester/java/src/org/forester/util/ForesterConstants.java index 9f4a241..727f30c 100644 --- a/forester/java/src/org/forester/util/ForesterConstants.java +++ b/forester/java/src/org/forester/util/ForesterConstants.java @@ -27,22 +27,25 @@ package org.forester.util; public final class ForesterConstants { - public final static String FORESTER_VERSION = "1.045"; - public final static String FORESTER_DATE = "161214"; - public final static String PHYLO_XML_VERSION = "1.20"; - public final static String PHYLO_XML_LOCATION = "http://www.phyloxml.org"; - public final static String PHYLO_XML_XSD = "phyloxml.xsd"; - public final static String XML_SCHEMA_INSTANCE = "http://www.w3.org/2001/XMLSchema-instance"; - public final static String LOCAL_PHYLOXML_XSD_RESOURCE = "resources/phyloxml.xsd"; - public final static String PHYLO_XML_SUFFIX = ".xml"; - public final static String UTF_8 = "UTF-8"; - public final static String ISO_8859_1 = "ISO-8859-1"; - public final static String PHYLO_XML_REFERENCE = "Han MV and Zmasek CM (2009): \"phyloXML: XML for evolutionary biology and comparative genomics\", BMC Bioinformatics 10:356"; - public final static boolean RELEASE = false; + public final static String FORESTER_VERSION = "1.045"; + public final static String FORESTER_DATE = "161214"; + public final static String PHYLO_XML_VERSION = "1.20"; + public final static String PHYLO_XML_LOCATION = "http://www.phyloxml.org"; + public final static String PHYLO_XML_XSD = "phyloxml.xsd"; + public final static String XML_SCHEMA_INSTANCE = "http://www.w3.org/2001/XMLSchema-instance"; + public final static String LOCAL_PHYLOXML_XSD_RESOURCE = "resources/phyloxml.xsd"; + public final static String PHYLO_XML_SUFFIX = ".xml"; + public final static String ID_NORMALIZED_FASTA_FILE_SUFFIX = "_ni.fasta"; + public final static String ID_MAP_FILE_SUFFIX = ".nim"; + public final static String UTF_8 = "UTF-8"; + public final static String ISO_8859_1 = "ISO-8859-1"; + public final static String PHYLO_XML_REFERENCE = "Han MV and Zmasek CM (2009): \"phyloXML: XML for evolutionary biology and comparative genomics\", BMC Bioinformatics 10:356"; + public final static boolean RELEASE = false; public enum PhylogeneticTreeFormats { - NH, NHX, NEXUS, PHYLOXML + NH, + NHX, + NEXUS, + PHYLOXML } - - } diff --git a/forester/java/src/org/forester/util/ForesterUtil.java b/forester/java/src/org/forester/util/ForesterUtil.java index c254c8b..dddbf28 100644 --- a/forester/java/src/org/forester/util/ForesterUtil.java +++ b/forester/java/src/org/forester/util/ForesterUtil.java @@ -621,7 +621,7 @@ public final class ForesterUtil { final public static boolean isEqual( final double a, final double b ) { return ( ( Math.abs( a - b ) ) < ZERO_DIFF ); } - + final public static boolean isEqual( final double a, final double b, final double tolerance ) { return ( ( Math.abs( a - b ) ) < tolerance ); } @@ -1664,9 +1664,9 @@ public final class ForesterUtil { private ForesterUtil() { } - public static List spliIntoPrefixes(final String prefix, final String separator ) { + public static List spliIntoPrefixes( final String prefix, final String separator ) { final String[] a = prefix.split( Pattern.quote( separator ) ); - final List l= new ArrayList(); + final List l = new ArrayList(); for( int i = 0; i < a.length; ++i ) { final StringBuilder sb = new StringBuilder(); for( int j = 0; j <= i; ++j ) { @@ -1675,9 +1675,15 @@ public final class ForesterUtil { sb.append( separator ); } } - // System.out.println( sb.toString() ); - l.add( sb.toString()); + // System.out.println( sb.toString() ); + l.add( sb.toString() ); } return l; } + + // + public static boolean isLooksLikeFasta( final File file ) throws IOException { + final String first_line = ForesterUtil.getFirstLine( file ).trim().toLowerCase(); + return ( ( !isEmptyTrimmed( first_line ) && first_line.trim().startsWith( ">" ) ) ); + } } -- 1.7.10.2