From 48f7a89be9d34f1930a1f863e608235cc27184c5 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Wed, 9 Feb 2011 01:09:48 +0000 Subject: [PATCH] initial commit --- .../src/org/forester/go/BasicGoRelationship.java | 131 + .../java/src/org/forester/go/BasicGoSubset.java | 125 + forester/java/src/org/forester/go/BasicGoTerm.java | 246 + forester/java/src/org/forester/go/BasicGoXRef.java | 182 + forester/java/src/org/forester/go/GoId.java | 83 + forester/java/src/org/forester/go/GoNameSpace.java | 141 + .../java/src/org/forester/go/GoRelationship.java | 42 + forester/java/src/org/forester/go/GoSubset.java | 44 + forester/java/src/org/forester/go/GoTerm.java | 55 + forester/java/src/org/forester/go/GoUtils.java | 221 + forester/java/src/org/forester/go/GoXRef.java | 67 + forester/java/src/org/forester/go/Mapping.java | 33 + forester/java/src/org/forester/go/OBOparser.java | 271 + .../java/src/org/forester/go/PfamToGoMapping.java | 89 + .../java/src/org/forester/go/PfamToGoParser.java | 100 + forester/java/src/org/forester/go/TestGo.java | 698 ++ .../src/org/forester/go/etc/MetaOntologizer.java | 639 ++ .../src/org/forester/go/etc/OntologizerResult.java | 205 + .../src/org/forester/io/parsers/FastaParser.java | 210 + .../org/forester/io/parsers/GeneralMsaParser.java | 186 + .../forester/io/parsers/HmmPfamOutputParser.java | 689 ++ .../io/parsers/HmmscanPerDomainTableParser.java | 595 ++ .../org/forester/io/parsers/PhylogenyParser.java | 44 + .../parsers/SymmetricalDistanceMatrixParser.java | 196 + .../nexus/NexusBinaryStatesMatrixParser.java | 167 + .../io/parsers/nexus/NexusCharactersParser.java | 117 + .../forester/io/parsers/nexus/NexusConstants.java | 48 + .../io/parsers/nexus/NexusFormatException.java | 41 + .../io/parsers/nexus/NexusPhylogeniesParser.java | 338 + .../forester/io/parsers/nexus/PaupLogParser.java | 128 + .../io/parsers/nhx/NHXFormatException.java | 41 + .../src/org/forester/io/parsers/nhx/NHXParser.java | 797 ++ .../src/org/forester/io/parsers/nhx/NHXtags.java | 55 + .../phyloxml/PhyloXmlDataFormatException.java | 40 + .../io/parsers/phyloxml/PhyloXmlException.java | 39 + .../io/parsers/phyloxml/PhyloXmlHandler.java | 454 ++ .../io/parsers/phyloxml/PhyloXmlMapping.java | 134 + .../io/parsers/phyloxml/PhyloXmlParser.java | 313 + .../forester/io/parsers/phyloxml/PhyloXmlUtil.java | 98 + .../forester/io/parsers/phyloxml/XmlElement.java | 213 + .../io/parsers/phyloxml/data/AccessionParser.java | 63 + .../io/parsers/phyloxml/data/AnnotationParser.java | 97 + .../phyloxml/data/BinaryCharactersParser.java | 115 + .../parsers/phyloxml/data/BranchWidthParser.java | 56 + .../io/parsers/phyloxml/data/ColorParser.java | 76 + .../io/parsers/phyloxml/data/ConfidenceParser.java | 62 + .../io/parsers/phyloxml/data/DateParser.java | 95 + .../parsers/phyloxml/data/DistributionParser.java | 83 + .../phyloxml/data/DomainArchitectureParser.java | 76 + .../io/parsers/phyloxml/data/EventParser.java | 97 + .../io/parsers/phyloxml/data/IdentifierParser.java | 67 + .../phyloxml/data/PhylogenyDataPhyloXmlParser.java | 36 + .../io/parsers/phyloxml/data/PointParser.java | 95 + .../io/parsers/phyloxml/data/PolygonParser.java | 68 + .../io/parsers/phyloxml/data/PropertyParser.java | 99 + .../parsers/phyloxml/data/ProteinDomainParser.java | 78 + .../io/parsers/phyloxml/data/ReferenceParser.java | 75 + .../io/parsers/phyloxml/data/SequenceParser.java | 99 + .../phyloxml/data/SequenceRelationParser.java | 91 + .../io/parsers/phyloxml/data/TaxonomyParser.java | 87 + 
.../io/parsers/phyloxml/data/UriParser.java | 75 + .../src/org/forester/io/parsers/tol/TolParser.java | 286 + .../org/forester/io/parsers/tol/TolXmlHandler.java | 318 + .../org/forester/io/parsers/tol/TolXmlMapping.java | 47 + .../org/forester/io/parsers/util/ParserUtils.java | 73 + .../io/parsers/util/PhylogenyParserException.java | 53 + .../forester/io/writers/PhyloXmlNodeWriter.java | 59 + .../org/forester/io/writers/PhylogenyWriter.java | 761 ++ .../org/forester/io/writers/SequenceWriter.java | 96 + forester/java/src/org/forester/msa/BasicMsa.java | 156 + forester/java/src/org/forester/msa/Mafft.java | 124 + forester/java/src/org/forester/msa/MafftOLD.java | 78 + forester/java/src/org/forester/msa/Msa.java | 52 + .../src/org/forester/msa/MsaFormatException.java | 37 + .../java/src/org/forester/msa/MsaInferrer.java | 39 + forester/java/src/org/forester/msa/MsaTools.java | 126 + .../java/src/org/forester/msa/ResampleableMsa.java | 57 + .../BasicExternalNodeBasedCoverageExtender.java | 178 + .../pccx/BranchCountingBasedScoringMethod.java | 74 + .../pccx/BranchLengthBasedScoringMethod.java | 73 + forester/java/src/org/forester/pccx/Coverage.java | 36 + .../forester/pccx/CoverageCalculationMethod.java | 41 + .../forester/pccx/CoverageCalculationOptions.java | 34 + .../src/org/forester/pccx/CoverageCalculator.java | 63 + .../src/org/forester/pccx/CoverageExtender.java | 43 + .../forester/pccx/ExternalNodeBasedCoverage.java | 100 + .../pccx/ExternalNodeBasedCoverageMethod.java | 130 + .../ExternalNodeBasedCoverageMethodOptions.java | 62 + .../pccx/LogBranchLengthBasedScoringMethod.java | 85 + .../java/src/org/forester/pccx/ModelingUtils.java | 81 + .../pccx/ScoringMethodForExternalNode.java | 80 + forester/java/src/org/forester/pccx/TestPccx.java | 246 + forester/java/src/org/forester/phylogeny/Edge.java | 45 + .../java/src/org/forester/phylogeny/Phylogeny.java | 1335 ++++ .../org/forester/phylogeny/PhylogenyBranch.java | 168 + .../org/forester/phylogeny/PhylogenyMethods.java | 1186 +++ .../src/org/forester/phylogeny/PhylogenyNode.java | 1032 +++ .../src/org/forester/phylogeny/PhylogenyNodeI.java | 47 + .../src/org/forester/phylogeny/data/Accession.java | 142 + .../org/forester/phylogeny/data/Annotation.java | 282 + .../forester/phylogeny/data/BinaryCharacters.java | 319 + .../org/forester/phylogeny/data/BranchColor.java | 111 + .../org/forester/phylogeny/data/BranchData.java | 156 + .../org/forester/phylogeny/data/BranchWidth.java | 91 + .../org/forester/phylogeny/data/Confidence.java | 142 + .../java/src/org/forester/phylogeny/data/Date.java | 188 + .../org/forester/phylogeny/data/Distribution.java | 182 + .../phylogeny/data/DomainArchitecture.java | 221 + .../src/org/forester/phylogeny/data/Event.java | 376 + .../org/forester/phylogeny/data/Identifier.java | 147 + .../org/forester/phylogeny/data/MultipleUris.java | 39 + .../src/org/forester/phylogeny/data/NodeData.java | 524 ++ .../org/forester/phylogeny/data/PhylogenyData.java | 72 + .../forester/phylogeny/data/PhylogenyDataUtil.java | 372 + .../src/org/forester/phylogeny/data/Point.java | 152 + .../src/org/forester/phylogeny/data/Polygon.java | 109 + .../org/forester/phylogeny/data/PropertiesMap.java | 205 + .../src/org/forester/phylogeny/data/Property.java | 332 + .../org/forester/phylogeny/data/ProteinDomain.java | 171 + .../src/org/forester/phylogeny/data/Reference.java | 117 + .../src/org/forester/phylogeny/data/Sequence.java | 388 + .../forester/phylogeny/data/SequenceRelation.java | 149 + 
.../src/org/forester/phylogeny/data/Taxonomy.java | 394 + .../java/src/org/forester/phylogeny/data/Uri.java | 127 + .../phylogeny/factories/BasicPhylogenyFactory.java | 46 + .../factories/ParserBasedPhylogenyFactory.java | 89 + .../phylogeny/factories/PhylogenyFactory.java | 77 + .../iterators/ChildNodeIteratorForward.java | 141 + .../iterators/ExternalForwardIterator.java | 119 + .../iterators/LevelOrderTreeIterator.java | 147 + .../phylogeny/iterators/PhylogenyNodeIterator.java | 46 + .../phylogeny/iterators/PostOrderStackObject.java | 70 + .../phylogeny/iterators/PostorderTreeIterator.java | 128 + .../phylogeny/iterators/PreorderTreeIterator.java | 115 + .../src/org/forester/sdi/DistanceCalculator.java | 500 ++ forester/java/src/org/forester/sdi/GSDI.java | 389 + forester/java/src/org/forester/sdi/ORcount.java | 382 + forester/java/src/org/forester/sdi/RIO.java | 1126 +++ forester/java/src/org/forester/sdi/RIOn.java | 132 + forester/java/src/org/forester/sdi/SDI.java | 318 + forester/java/src/org/forester/sdi/SDIR.java | 579 ++ forester/java/src/org/forester/sdi/SDIse.java | 203 + forester/java/src/org/forester/sdi/Shin.java | 134 + .../src/org/forester/sdi/TaxonomyAssigner.java | 71 + forester/java/src/org/forester/sdi/TestGSDI.java | 1215 +++ forester/java/src/org/forester/sdi/Tuplet.java | 168 + .../src/org/forester/sequence/BasicSequence.java | 91 + .../java/src/org/forester/sequence/Sequence.java | 53 + .../AdjactantDirectedBinaryDomainCombination.java | 54 + .../AdjactantDirectedCombinableDomains.java | 49 + .../surfacing/BasicBinaryDomainCombination.java | 170 + .../forester/surfacing/BasicCombinableDomains.java | 185 + .../src/org/forester/surfacing/BasicDomain.java | 224 + .../surfacing/BasicDomainSimilarityCalculator.java | 242 + .../BasicGenomeWideCombinableDomains.java | 365 + .../src/org/forester/surfacing/BasicProtein.java | 175 + .../src/org/forester/surfacing/BasicSpecies.java | 83 + .../surfacing/BinaryDomainCombination.java | 56 + .../org/forester/surfacing/CombinableDomains.java | 138 + .../CombinationsBasedPairwiseDomainSimilarity.java | 70 + ...onsBasedPairwiseDomainSimilarityCalculator.java | 59 + .../CountsBasedPairwiseDomainSimilarity.java | 65 + .../surfacing/DirectedBinaryDomainCombination.java | 54 + .../surfacing/DirectedCombinableDomains.java | 48 + .../java/src/org/forester/surfacing/Domain.java | 56 + ...rchitectureBasedGenomeSimilarityCalculator.java | 333 + ...ainCountsBasedPairwiseSimilarityCalculator.java | 41 + .../surfacing/DomainCountsDifferenceUtil.java | 825 ++ .../java/src/org/forester/surfacing/DomainId.java | 131 + .../src/org/forester/surfacing/DomainLengths.java | 143 + .../org/forester/surfacing/DomainLengthsTable.java | 165 + .../surfacing/DomainParsimonyCalculator.java | 744 ++ .../org/forester/surfacing/DomainSimilarity.java | 101 + .../surfacing/DomainSimilarityCalculator.java | 47 + .../surfacing/GenomeWideCombinableDomains.java | 79 + .../src/org/forester/surfacing/MappingResults.java | 58 + .../surfacing/PairwiseDomainSimilarity.java | 41 + .../PairwiseDomainSimilarityCalculator.java | 34 + .../surfacing/PairwiseGenomeComparator.java | 353 + .../surfacing/PrintableDomainSimilarity.java | 717 ++ ...rintableSpeciesSpecificDomainSimilariyData.java | 141 + .../java/src/org/forester/surfacing/Protein.java | 68 + ...ntsBasedPairwiseDomainSimilarityCalculator.java | 41 + .../java/src/org/forester/surfacing/ProteinId.java | 80 + .../src/org/forester/surfacing/SimpleDomain.java | 122 + .../java/src/org/forester/surfacing/Species.java | 32 + 
.../SpeciesSpecificDomainSimilariyData.java | 50 + .../org/forester/surfacing/SurfacingConstants.java | 48 + .../src/org/forester/surfacing/SurfacingUtil.java | 2414 ++++++ .../src/org/forester/surfacing/TestSurfacing.java | 6277 ++++++++++++++++ forester/java/src/org/forester/test/Test.java | 7939 ++++++++++++++++++++ .../src/org/forester/tools/ConfidenceAssessor.java | 178 + .../src/org/forester/tools/PhylogenyDecorator.java | 525 ++ .../java/src/org/forester/tools/SupportCount.java | 250 + .../src/org/forester/tools/TreeSplitMatrix.java | 257 + .../java/src/org/forester/util/AsciiHistogram.java | 127 + .../forester/util/BasicDescriptiveStatistics.java | 340 + .../java/src/org/forester/util/BasicTable.java | 188 + .../src/org/forester/util/BasicTableParser.java | 108 + .../org/forester/util/CommandLineArguments.java | 281 + .../org/forester/util/CommandProcessBuilder.java | 81 + .../org/forester/util/DescriptiveStatistics.java | 83 + .../src/org/forester/util/ExternalProgram.java | 124 + .../util/FailedConditionCheckException.java | 43 + .../src/org/forester/util/ForesterConstants.java | 39 + .../java/src/org/forester/util/ForesterUtil.java | 1245 +++ .../java/src/org/forester/util/GeneralTable.java | 139 + .../forester/util/IllegalFormatUseException.java | 42 + .../org/forester/util/SystemCommandExecutor.java | 154 + .../org/forester/util/ThreadedStreamHandler.java | 135 + .../java/src/org/forester/util/WindowsUtils.java | 87 + 211 files changed, 56896 insertions(+) create mode 100644 forester/java/src/org/forester/go/BasicGoRelationship.java create mode 100644 forester/java/src/org/forester/go/BasicGoSubset.java create mode 100644 forester/java/src/org/forester/go/BasicGoTerm.java create mode 100644 forester/java/src/org/forester/go/BasicGoXRef.java create mode 100644 forester/java/src/org/forester/go/GoId.java create mode 100644 forester/java/src/org/forester/go/GoNameSpace.java create mode 100644 forester/java/src/org/forester/go/GoRelationship.java create mode 100644 forester/java/src/org/forester/go/GoSubset.java create mode 100644 forester/java/src/org/forester/go/GoTerm.java create mode 100644 forester/java/src/org/forester/go/GoUtils.java create mode 100644 forester/java/src/org/forester/go/GoXRef.java create mode 100644 forester/java/src/org/forester/go/Mapping.java create mode 100644 forester/java/src/org/forester/go/OBOparser.java create mode 100644 forester/java/src/org/forester/go/PfamToGoMapping.java create mode 100644 forester/java/src/org/forester/go/PfamToGoParser.java create mode 100644 forester/java/src/org/forester/go/TestGo.java create mode 100644 forester/java/src/org/forester/go/etc/MetaOntologizer.java create mode 100644 forester/java/src/org/forester/go/etc/OntologizerResult.java create mode 100644 forester/java/src/org/forester/io/parsers/FastaParser.java create mode 100644 forester/java/src/org/forester/io/parsers/GeneralMsaParser.java create mode 100644 forester/java/src/org/forester/io/parsers/HmmPfamOutputParser.java create mode 100644 forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java create mode 100644 forester/java/src/org/forester/io/parsers/PhylogenyParser.java create mode 100644 forester/java/src/org/forester/io/parsers/SymmetricalDistanceMatrixParser.java create mode 100644 forester/java/src/org/forester/io/parsers/nexus/NexusBinaryStatesMatrixParser.java create mode 100644 forester/java/src/org/forester/io/parsers/nexus/NexusCharactersParser.java create mode 100644 
forester/java/src/org/forester/io/parsers/nexus/NexusConstants.java create mode 100644 forester/java/src/org/forester/io/parsers/nexus/NexusFormatException.java create mode 100644 forester/java/src/org/forester/io/parsers/nexus/NexusPhylogeniesParser.java create mode 100644 forester/java/src/org/forester/io/parsers/nexus/PaupLogParser.java create mode 100644 forester/java/src/org/forester/io/parsers/nhx/NHXFormatException.java create mode 100644 forester/java/src/org/forester/io/parsers/nhx/NHXParser.java create mode 100644 forester/java/src/org/forester/io/parsers/nhx/NHXtags.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlDataFormatException.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlException.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlHandler.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlMapping.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlUtil.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/XmlElement.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/AccessionParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/AnnotationParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/BinaryCharactersParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/BranchWidthParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/ColorParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/ConfidenceParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/DateParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/DistributionParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/DomainArchitectureParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/EventParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/IdentifierParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/PhylogenyDataPhyloXmlParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/PointParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/PolygonParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/PropertyParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/ProteinDomainParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/ReferenceParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/SequenceParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/SequenceRelationParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/TaxonomyParser.java create mode 100644 forester/java/src/org/forester/io/parsers/phyloxml/data/UriParser.java create mode 100644 forester/java/src/org/forester/io/parsers/tol/TolParser.java create mode 100644 forester/java/src/org/forester/io/parsers/tol/TolXmlHandler.java create mode 100644 forester/java/src/org/forester/io/parsers/tol/TolXmlMapping.java create mode 100644 
forester/java/src/org/forester/io/parsers/util/ParserUtils.java create mode 100644 forester/java/src/org/forester/io/parsers/util/PhylogenyParserException.java create mode 100644 forester/java/src/org/forester/io/writers/PhyloXmlNodeWriter.java create mode 100644 forester/java/src/org/forester/io/writers/PhylogenyWriter.java create mode 100644 forester/java/src/org/forester/io/writers/SequenceWriter.java create mode 100644 forester/java/src/org/forester/msa/BasicMsa.java create mode 100644 forester/java/src/org/forester/msa/Mafft.java create mode 100644 forester/java/src/org/forester/msa/MafftOLD.java create mode 100644 forester/java/src/org/forester/msa/Msa.java create mode 100644 forester/java/src/org/forester/msa/MsaFormatException.java create mode 100644 forester/java/src/org/forester/msa/MsaInferrer.java create mode 100644 forester/java/src/org/forester/msa/MsaTools.java create mode 100644 forester/java/src/org/forester/msa/ResampleableMsa.java create mode 100644 forester/java/src/org/forester/pccx/BasicExternalNodeBasedCoverageExtender.java create mode 100644 forester/java/src/org/forester/pccx/BranchCountingBasedScoringMethod.java create mode 100644 forester/java/src/org/forester/pccx/BranchLengthBasedScoringMethod.java create mode 100644 forester/java/src/org/forester/pccx/Coverage.java create mode 100644 forester/java/src/org/forester/pccx/CoverageCalculationMethod.java create mode 100644 forester/java/src/org/forester/pccx/CoverageCalculationOptions.java create mode 100644 forester/java/src/org/forester/pccx/CoverageCalculator.java create mode 100644 forester/java/src/org/forester/pccx/CoverageExtender.java create mode 100644 forester/java/src/org/forester/pccx/ExternalNodeBasedCoverage.java create mode 100644 forester/java/src/org/forester/pccx/ExternalNodeBasedCoverageMethod.java create mode 100644 forester/java/src/org/forester/pccx/ExternalNodeBasedCoverageMethodOptions.java create mode 100644 forester/java/src/org/forester/pccx/LogBranchLengthBasedScoringMethod.java create mode 100644 forester/java/src/org/forester/pccx/ModelingUtils.java create mode 100644 forester/java/src/org/forester/pccx/ScoringMethodForExternalNode.java create mode 100644 forester/java/src/org/forester/pccx/TestPccx.java create mode 100644 forester/java/src/org/forester/phylogeny/Edge.java create mode 100644 forester/java/src/org/forester/phylogeny/Phylogeny.java create mode 100644 forester/java/src/org/forester/phylogeny/PhylogenyBranch.java create mode 100644 forester/java/src/org/forester/phylogeny/PhylogenyMethods.java create mode 100644 forester/java/src/org/forester/phylogeny/PhylogenyNode.java create mode 100644 forester/java/src/org/forester/phylogeny/PhylogenyNodeI.java create mode 100644 forester/java/src/org/forester/phylogeny/data/Accession.java create mode 100644 forester/java/src/org/forester/phylogeny/data/Annotation.java create mode 100644 forester/java/src/org/forester/phylogeny/data/BinaryCharacters.java create mode 100644 forester/java/src/org/forester/phylogeny/data/BranchColor.java create mode 100644 forester/java/src/org/forester/phylogeny/data/BranchData.java create mode 100644 forester/java/src/org/forester/phylogeny/data/BranchWidth.java create mode 100644 forester/java/src/org/forester/phylogeny/data/Confidence.java create mode 100644 forester/java/src/org/forester/phylogeny/data/Date.java create mode 100644 forester/java/src/org/forester/phylogeny/data/Distribution.java create mode 100644 forester/java/src/org/forester/phylogeny/data/DomainArchitecture.java create mode 100644 
forester/java/src/org/forester/phylogeny/data/Event.java create mode 100644 forester/java/src/org/forester/phylogeny/data/Identifier.java create mode 100644 forester/java/src/org/forester/phylogeny/data/MultipleUris.java create mode 100644 forester/java/src/org/forester/phylogeny/data/NodeData.java create mode 100644 forester/java/src/org/forester/phylogeny/data/PhylogenyData.java create mode 100644 forester/java/src/org/forester/phylogeny/data/PhylogenyDataUtil.java create mode 100644 forester/java/src/org/forester/phylogeny/data/Point.java create mode 100644 forester/java/src/org/forester/phylogeny/data/Polygon.java create mode 100644 forester/java/src/org/forester/phylogeny/data/PropertiesMap.java create mode 100644 forester/java/src/org/forester/phylogeny/data/Property.java create mode 100644 forester/java/src/org/forester/phylogeny/data/ProteinDomain.java create mode 100644 forester/java/src/org/forester/phylogeny/data/Reference.java create mode 100644 forester/java/src/org/forester/phylogeny/data/Sequence.java create mode 100644 forester/java/src/org/forester/phylogeny/data/SequenceRelation.java create mode 100644 forester/java/src/org/forester/phylogeny/data/Taxonomy.java create mode 100644 forester/java/src/org/forester/phylogeny/data/Uri.java create mode 100644 forester/java/src/org/forester/phylogeny/factories/BasicPhylogenyFactory.java create mode 100644 forester/java/src/org/forester/phylogeny/factories/ParserBasedPhylogenyFactory.java create mode 100644 forester/java/src/org/forester/phylogeny/factories/PhylogenyFactory.java create mode 100644 forester/java/src/org/forester/phylogeny/iterators/ChildNodeIteratorForward.java create mode 100644 forester/java/src/org/forester/phylogeny/iterators/ExternalForwardIterator.java create mode 100644 forester/java/src/org/forester/phylogeny/iterators/LevelOrderTreeIterator.java create mode 100644 forester/java/src/org/forester/phylogeny/iterators/PhylogenyNodeIterator.java create mode 100644 forester/java/src/org/forester/phylogeny/iterators/PostOrderStackObject.java create mode 100644 forester/java/src/org/forester/phylogeny/iterators/PostorderTreeIterator.java create mode 100644 forester/java/src/org/forester/phylogeny/iterators/PreorderTreeIterator.java create mode 100644 forester/java/src/org/forester/sdi/DistanceCalculator.java create mode 100644 forester/java/src/org/forester/sdi/GSDI.java create mode 100644 forester/java/src/org/forester/sdi/ORcount.java create mode 100644 forester/java/src/org/forester/sdi/RIO.java create mode 100644 forester/java/src/org/forester/sdi/RIOn.java create mode 100644 forester/java/src/org/forester/sdi/SDI.java create mode 100644 forester/java/src/org/forester/sdi/SDIR.java create mode 100644 forester/java/src/org/forester/sdi/SDIse.java create mode 100644 forester/java/src/org/forester/sdi/Shin.java create mode 100644 forester/java/src/org/forester/sdi/TaxonomyAssigner.java create mode 100644 forester/java/src/org/forester/sdi/TestGSDI.java create mode 100644 forester/java/src/org/forester/sdi/Tuplet.java create mode 100644 forester/java/src/org/forester/sequence/BasicSequence.java create mode 100644 forester/java/src/org/forester/sequence/Sequence.java create mode 100644 forester/java/src/org/forester/surfacing/AdjactantDirectedBinaryDomainCombination.java create mode 100644 forester/java/src/org/forester/surfacing/AdjactantDirectedCombinableDomains.java create mode 100644 forester/java/src/org/forester/surfacing/BasicBinaryDomainCombination.java create mode 100644 
forester/java/src/org/forester/surfacing/BasicCombinableDomains.java create mode 100644 forester/java/src/org/forester/surfacing/BasicDomain.java create mode 100644 forester/java/src/org/forester/surfacing/BasicDomainSimilarityCalculator.java create mode 100644 forester/java/src/org/forester/surfacing/BasicGenomeWideCombinableDomains.java create mode 100644 forester/java/src/org/forester/surfacing/BasicProtein.java create mode 100644 forester/java/src/org/forester/surfacing/BasicSpecies.java create mode 100644 forester/java/src/org/forester/surfacing/BinaryDomainCombination.java create mode 100644 forester/java/src/org/forester/surfacing/CombinableDomains.java create mode 100644 forester/java/src/org/forester/surfacing/CombinationsBasedPairwiseDomainSimilarity.java create mode 100644 forester/java/src/org/forester/surfacing/CombinationsBasedPairwiseDomainSimilarityCalculator.java create mode 100644 forester/java/src/org/forester/surfacing/CountsBasedPairwiseDomainSimilarity.java create mode 100644 forester/java/src/org/forester/surfacing/DirectedBinaryDomainCombination.java create mode 100644 forester/java/src/org/forester/surfacing/DirectedCombinableDomains.java create mode 100644 forester/java/src/org/forester/surfacing/Domain.java create mode 100644 forester/java/src/org/forester/surfacing/DomainArchitectureBasedGenomeSimilarityCalculator.java create mode 100644 forester/java/src/org/forester/surfacing/DomainCountsBasedPairwiseSimilarityCalculator.java create mode 100644 forester/java/src/org/forester/surfacing/DomainCountsDifferenceUtil.java create mode 100644 forester/java/src/org/forester/surfacing/DomainId.java create mode 100644 forester/java/src/org/forester/surfacing/DomainLengths.java create mode 100644 forester/java/src/org/forester/surfacing/DomainLengthsTable.java create mode 100644 forester/java/src/org/forester/surfacing/DomainParsimonyCalculator.java create mode 100644 forester/java/src/org/forester/surfacing/DomainSimilarity.java create mode 100644 forester/java/src/org/forester/surfacing/DomainSimilarityCalculator.java create mode 100644 forester/java/src/org/forester/surfacing/GenomeWideCombinableDomains.java create mode 100644 forester/java/src/org/forester/surfacing/MappingResults.java create mode 100644 forester/java/src/org/forester/surfacing/PairwiseDomainSimilarity.java create mode 100644 forester/java/src/org/forester/surfacing/PairwiseDomainSimilarityCalculator.java create mode 100644 forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java create mode 100644 forester/java/src/org/forester/surfacing/PrintableDomainSimilarity.java create mode 100644 forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDomainSimilariyData.java create mode 100644 forester/java/src/org/forester/surfacing/Protein.java create mode 100644 forester/java/src/org/forester/surfacing/ProteinCountsBasedPairwiseDomainSimilarityCalculator.java create mode 100644 forester/java/src/org/forester/surfacing/ProteinId.java create mode 100644 forester/java/src/org/forester/surfacing/SimpleDomain.java create mode 100644 forester/java/src/org/forester/surfacing/Species.java create mode 100644 forester/java/src/org/forester/surfacing/SpeciesSpecificDomainSimilariyData.java create mode 100644 forester/java/src/org/forester/surfacing/SurfacingConstants.java create mode 100644 forester/java/src/org/forester/surfacing/SurfacingUtil.java create mode 100644 forester/java/src/org/forester/surfacing/TestSurfacing.java create mode 100644 forester/java/src/org/forester/test/Test.java 
create mode 100644 forester/java/src/org/forester/tools/ConfidenceAssessor.java create mode 100644 forester/java/src/org/forester/tools/PhylogenyDecorator.java create mode 100644 forester/java/src/org/forester/tools/SupportCount.java create mode 100644 forester/java/src/org/forester/tools/TreeSplitMatrix.java create mode 100644 forester/java/src/org/forester/util/AsciiHistogram.java create mode 100644 forester/java/src/org/forester/util/BasicDescriptiveStatistics.java create mode 100644 forester/java/src/org/forester/util/BasicTable.java create mode 100644 forester/java/src/org/forester/util/BasicTableParser.java create mode 100644 forester/java/src/org/forester/util/CommandLineArguments.java create mode 100644 forester/java/src/org/forester/util/CommandProcessBuilder.java create mode 100644 forester/java/src/org/forester/util/DescriptiveStatistics.java create mode 100644 forester/java/src/org/forester/util/ExternalProgram.java create mode 100644 forester/java/src/org/forester/util/FailedConditionCheckException.java create mode 100644 forester/java/src/org/forester/util/ForesterConstants.java create mode 100644 forester/java/src/org/forester/util/ForesterUtil.java create mode 100644 forester/java/src/org/forester/util/GeneralTable.java create mode 100644 forester/java/src/org/forester/util/IllegalFormatUseException.java create mode 100644 forester/java/src/org/forester/util/SystemCommandExecutor.java create mode 100644 forester/java/src/org/forester/util/ThreadedStreamHandler.java create mode 100644 forester/java/src/org/forester/util/WindowsUtils.java diff --git a/forester/java/src/org/forester/go/BasicGoRelationship.java b/forester/java/src/org/forester/go/BasicGoRelationship.java new file mode 100644 index 0000000..6f8dd32 --- /dev/null +++ b/forester/java/src/org/forester/go/BasicGoRelationship.java @@ -0,0 +1,131 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.go; + +public class BasicGoRelationship implements GoRelationship { + + final Type _type; + final GoId _go_id; + + public BasicGoRelationship( final String s ) { + final String[] sa = s.split( " " ); + if ( sa.length != 2 ) { + throw new IllegalArgumentException( "unexpected format for GO relationship: " + s ); + } + final String type = sa[ 0 ].trim(); + final String go_id = sa[ 1 ].trim(); + if ( type.toLowerCase().equals( PART_OF_STR ) ) { + _type = Type.PART_OF; + } + else if ( type.toLowerCase().equals( REGULATES_STR ) ) { + _type = Type.REGULATES; + } + else if ( type.toLowerCase().equals( NEGATIVELY_REGULATES_STR ) ) { + _type = Type.NEGATIVELY_REGULATES; + } + else if ( type.toLowerCase().equals( POSITIVELY_REGULATES_STR ) ) { + _type = Type.POSITIVELY_REGULATES; + } + else { + throw new IllegalArgumentException( "unknown GO relationship type: " + type ); + } + _go_id = new GoId( go_id ); + } + + public BasicGoRelationship( final String type, final String go_id ) { + if ( type.toLowerCase().equals( PART_OF_STR ) ) { + _type = Type.PART_OF; + } + else { + throw new IllegalArgumentException( "unknown GO relationship type: " + type ); + } + _go_id = new GoId( go_id ); + } + + public BasicGoRelationship( final Type type, final GoId go_id ) { + _type = type; + _go_id = go_id; + } + + public int compareTo( final GoRelationship rel ) { + return getGoId().compareTo( rel.getGoId() ); + } + + /** + * Based on value and type. + * + * + */ + @Override + public boolean equals( final Object o ) { + if ( this == o ) { + return true; + } + else if ( o == null ) { + throw new IllegalArgumentException( "attempt to check go relationship equality to null" ); + } + else if ( o.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to check go relationship equality to " + o + " [" + + o.getClass() + "]" ); + } + else { + return getType().equals( ( ( GoRelationship ) o ).getType() ) + && getGoId().equals( ( ( GoRelationship ) o ).getGoId() ); + } + } + + public GoId getGoId() { + return _go_id; + } + + public Type getType() { + return _type; + } + + @Override + public String toString() { + final StringBuffer sb = new StringBuffer(); + switch ( getType() ) { + case PART_OF: + sb.append( PART_OF_STR ); + break; + case NEGATIVELY_REGULATES: + sb.append( NEGATIVELY_REGULATES_STR ); + break; + case POSITIVELY_REGULATES: + sb.append( POSITIVELY_REGULATES_STR ); + break; + case REGULATES: + sb.append( REGULATES_STR ); + break; + default: + new AssertionError( "unknown type: " + getType() ); + } + sb.append( ": " ); + sb.append( getGoId().toString() ); + return sb.toString(); + } +} diff --git a/forester/java/src/org/forester/go/BasicGoSubset.java b/forester/java/src/org/forester/go/BasicGoSubset.java new file mode 100644 index 0000000..19d20ed --- /dev/null +++ b/forester/java/src/org/forester/go/BasicGoSubset.java @@ -0,0 +1,125 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. 
+// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.go; + +public class BasicGoSubset implements GoSubset { + + final Type _type; + + public BasicGoSubset( final String s ) { + final String my_s = s.trim().toLowerCase(); + if ( my_s.equals( GOSLIM_GENERIC_STR ) ) { + _type = Type.GOSLIM_GENERIC; + } + else if ( my_s.equals( GOSLIM_GOA_STR ) ) { + _type = Type.GOSLIM_GOA; + } + else if ( my_s.equals( GOSLIM_PIR_STR ) ) { + _type = Type.GOSLIM_PIR; + } + else if ( my_s.equals( GOSUBSET_PROK_STR ) ) { + _type = Type.GOSUBSET_PROK; + } + else if ( my_s.equals( GOSLIM_CANDIDA_STR ) ) { + _type = Type.GOSLIM_CANDIDA; + } + else if ( my_s.equals( GOSLIM_PLANT_STR ) ) { + _type = Type.GOSLIM_PLANT; + } + else if ( my_s.equals( GOSLIM_YEAST_STR ) ) { + _type = Type.GOSLIM_YEAST; + } + else if ( my_s.equals( GOSLIM_POMBE_STR ) ) { + _type = Type.GOSLIM_POMBE; + } + else { + throw new IllegalArgumentException( "unknown GO subset type: " + my_s ); + } + } + + public BasicGoSubset( final Type type ) { + _type = type; + } + + public int compareTo( final GoSubset sub ) { + return getType().compareTo( sub.getType() ); + } + + @Override + public boolean equals( final Object o ) { + if ( this == o ) { + return true; + } + else if ( o == null ) { + throw new IllegalArgumentException( "attempt to check go subset equality to null" ); + } + else if ( o.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to check go subset equality to " + o + " [" + o.getClass() + + "]" ); + } + else { + return ( getType() == ( ( GoSubset ) o ).getType() ); + } + } + + public Type getType() { + return _type; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(); + switch ( getType() ) { + case GOSLIM_CANDIDA: + sb.append( GOSLIM_CANDIDA_STR ); + break; + case GOSLIM_GENERIC: + sb.append( GOSLIM_GENERIC_STR ); + break; + case GOSLIM_GOA: + sb.append( GOSLIM_GOA_STR ); + break; + case GOSLIM_PIR: + sb.append( GOSLIM_PIR_STR ); + break; + case GOSLIM_PLANT: + sb.append( GOSLIM_PLANT_STR ); + break; + case GOSLIM_YEAST: + sb.append( GOSLIM_YEAST_STR ); + break; + case GOSUBSET_PROK: + sb.append( GOSUBSET_PROK_STR ); + break; + case GOSLIM_POMBE: + sb.append( GOSLIM_POMBE_STR ); + break; + default: + new AssertionError( "unknown type: " + getType() ); + } + return sb.toString(); + } +} diff --git a/forester/java/src/org/forester/go/BasicGoTerm.java b/forester/java/src/org/forester/go/BasicGoTerm.java new file mode 100644 index 0000000..768de1c --- /dev/null +++ b/forester/java/src/org/forester/go/BasicGoTerm.java @@ -0,0 +1,246 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. 
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.go; + +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.List; + +import org.forester.phylogeny.data.PhylogenyData; +import org.forester.util.ForesterUtil; + +public class BasicGoTerm implements GoTerm { + + private final GoId _id; + private final String _name; + private final boolean _is_obsolete; + private final GoNameSpace _namespace; + private String _definition; + private List _alt_ids; + private List _super_go_ids; + private List _go_xrefs; + private List _go_subsets; + private String _comment; + private List _go_relationships; + + public BasicGoTerm( final GoId id, final String name, final GoNameSpace namespace, final boolean is_obsolete ) { + if ( ( id == null ) || ForesterUtil.isEmpty( name ) || ( namespace == null ) ) { + throw new IllegalArgumentException( "attempt to create GO term with empty id, name, or namespace" ); + } + _id = id; + _name = name; + _namespace = namespace; + _is_obsolete = is_obsolete; + init(); + } + + public BasicGoTerm( final String id, final String name, final String namespace, final boolean is_obsolete ) { + if ( ForesterUtil.isEmpty( id ) || ForesterUtil.isEmpty( name ) || ForesterUtil.isEmpty( namespace ) ) { + throw new IllegalArgumentException( "attempt to create GO term with empty id, name, or namespace" ); + } + _id = new GoId( id ); + _name = name; + _namespace = new GoNameSpace( namespace ); + _is_obsolete = is_obsolete; + init(); + } + + public StringBuffer asSimpleText() { + return new StringBuffer( getGoId().toString() ); + } + + public StringBuffer asText() { + return new StringBuffer( toString() ); + } + + /** + * Compares based on GO id. + * + */ + public int compareTo( final GoTerm go_term ) { + return getGoId().compareTo( go_term.getGoId() ); + } + + /** + * Makes a shallow copy. + * + * + */ + public PhylogenyData copy() { + final BasicGoTerm gt = new BasicGoTerm( getGoId(), getName(), getGoNameSpace(), isObsolete() ); + gt.setGoXrefs( getGoXRefs() ); + gt.setGoSubsets( getGoSubsets() ); + gt.setSuperTerms( getSuperGoIds() ); + gt.setAltIds( getAltIds() ); + gt.setDefinition( getDefinition() ); + return gt; + } + + /** + * Return true if both GO id and namespace are equal. 
+ * + */ + @Override + public boolean equals( final Object o ) { + if ( this == o ) { + return true; + } + else if ( o == null ) { + throw new IllegalArgumentException( "attempt to check go term equality to null" ); + } + else if ( o.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to check go term equality to " + o + " [" + o.getClass() + "]" ); + } + else { + final GoTerm gt = ( GoTerm ) o; + return getGoNameSpace().equals( gt.getGoNameSpace() ) && getGoId().equals( gt.getGoId() ); + } + } + + public List getAltIds() { + return _alt_ids; + } + + @Override + public String getComment() { + return _comment; + } + + @Override + public String getDefinition() { + return _definition; + } + + public GoId getGoId() { + return _id; + } + + public GoNameSpace getGoNameSpace() { + return _namespace; + } + + @Override + public List getGoRelationships() { + return _go_relationships; + } + + @Override + public List getGoSubsets() { + return _go_subsets; + } + + public List getGoXRefs() { + return _go_xrefs; + } + + public String getName() { + return _name; + } + + public List getSuperGoIds() { + return _super_go_ids; + } + + /** + * Hashcode is based on hashcode of GO id. + * + * + */ + @Override + public int hashCode() { + return getGoId().hashCode(); + } + + private void init() { + setGoXrefs( new ArrayList() ); + setSuperTerms( new ArrayList() ); + setAltIds( new ArrayList() ); + setGoRelationships( new ArrayList() ); + setGoSubsets( new ArrayList() ); + setDefinition( "" ); + setComment( "" ); + } + + public boolean isEqual( final PhylogenyData go_term ) { + return equals( go_term ); + } + + public boolean isObsolete() { + return _is_obsolete; + } + + private void setAltIds( final List alt_ids ) { + _alt_ids = alt_ids; + } + + public void setComment( final String comment ) { + _comment = comment; + } + + public void setDefinition( final String definition ) { + _definition = definition; + } + + private void setGoRelationships( final List go_relationships ) { + _go_relationships = go_relationships; + } + + public void setGoSubsets( final List go_subsets ) { + _go_subsets = go_subsets; + } + + private void setGoXrefs( final List xrefs ) { + _go_xrefs = xrefs; + } + + private void setSuperTerms( final List super_terms ) { + _super_go_ids = super_terms; + } + + public StringBuffer toNHX() { + throw new UnsupportedOperationException(); + } + + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public String toString() { + final StringBuffer sb = new StringBuffer(); + sb.append( getGoId() ); + sb.append( ": " ); + sb.append( getName() ); + sb.append( " [" ); + sb.append( getGoNameSpace() ); + sb.append( "]" ); + if ( isObsolete() ) { + sb.append( " [is obsolete]" ); + } + return sb.toString(); + } +} diff --git a/forester/java/src/org/forester/go/BasicGoXRef.java b/forester/java/src/org/forester/go/BasicGoXRef.java new file mode 100644 index 0000000..8d5b4ae --- /dev/null +++ b/forester/java/src/org/forester/go/BasicGoXRef.java @@ -0,0 +1,182 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. 
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.go; + +public class BasicGoXRef implements GoXRef { + + private final String _xref; + private final Type _type; + + public BasicGoXRef( final String s ) { + final String[] sa = s.split( ":" ); + if ( sa.length < 2 ) { + throw new IllegalArgumentException( "unexpected format for GO xref: " + s ); + } + final String type = sa[ 0 ].trim(); + if ( type.equals( EC_STR ) ) { + _type = Type.EC; + } + else if ( type.equals( META_CYC_STR ) ) { + _type = Type.META_CYC; + } + else if ( type.equals( REACTOME_STR ) ) { + _type = Type.REACTOME; + } + else if ( type.equals( RESID_STR ) ) { + _type = Type.RESID; + } + else if ( type.equals( UM_BBD_ENZYME_ID_STR ) ) { + _type = Type.UM_BBD_ENZYME_ID; + } + else if ( type.equals( UM_BBD_PATHWAY_ID_STR ) ) { + _type = Type.UM_BBD_PATHWAY_ID; + } + else if ( type.equals( UM_BBD_REACTIONID_STR ) ) { + _type = Type.UM_BBD_REACTIONID; + } + else if ( type.equals( TC_STR ) ) { + _type = Type.TC; + } + else if ( type.equals( ARACYC_STR ) ) { + _type = Type.ARACYC; + } + else if ( type.equals( XX_STR ) ) { + _type = Type.XX; + } + else if ( type.equals( PMID_STR ) ) { + _type = Type.PMID; + } + else if ( type.equals( IMG_STR ) ) { + _type = Type.IMG; + } + else if ( type.equals( GOC_STR ) ) { + _type = Type.GOC; + } + else if ( type.equals( KEGG_STR ) ) { + _type = Type.KEGG; + } + else if ( type.equals( WIKIPEDIA_STR ) ) { + _type = Type.WIKIPEDIA; + } + else { + throw new IllegalArgumentException( "unknown GO xref type: " + type ); + } + _xref = sa[ 1 ].trim(); + } + + public BasicGoXRef( final Type type, final String xref ) { + _type = type; + _xref = xref; + } + + public int compareTo( final GoXRef xref ) { + return getXRef().compareTo( xref.getXRef() ); + } + + /** + * Based on value and type. 
+ * + * + */ + @Override + public boolean equals( final Object o ) { + if ( this == o ) { + return true; + } + else if ( o == null ) { + throw new IllegalArgumentException( "attempt to check go xref equality to null" ); + } + else if ( o.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to check go xref equality to " + o + " [" + o.getClass() + "]" ); + } + else { + return getXRef().equals( ( ( GoXRef ) o ).getXRef() ) && getType().equals( ( ( GoXRef ) o ).getType() ); + } + } + + public Type getType() { + return _type; + } + + @Override + public String getXRef() { + return _xref; + } + + @Override + public String toString() { + final StringBuffer sb = new StringBuffer(); + switch ( getType() ) { + case EC: + sb.append( EC_STR ); + break; + case META_CYC: + sb.append( META_CYC_STR ); + break; + case REACTOME: + sb.append( REACTOME_STR ); + break; + case RESID: + sb.append( RESID_STR ); + break; + case UM_BBD_ENZYME_ID: + sb.append( UM_BBD_ENZYME_ID_STR ); + break; + case UM_BBD_PATHWAY_ID: + sb.append( UM_BBD_PATHWAY_ID_STR ); + break; + case UM_BBD_REACTIONID: + sb.append( UM_BBD_REACTIONID_STR ); + break; + case TC: + sb.append( TC_STR ); + break; + case ARACYC: + sb.append( ARACYC_STR ); + break; + case XX: + sb.append( XX_STR ); + break; + case GOC: + sb.append( GOC_STR ); + break; + case IMG: + sb.append( IMG_STR ); + break; + case PMID: + sb.append( PMID_STR ); + break; + case WIKIPEDIA: + sb.append( WIKIPEDIA_STR ); + break; + default: + new AssertionError( "unknown type: " + getType() ); + } + sb.append( ":" ); + sb.append( getXRef() ); + return sb.toString(); + } +} diff --git a/forester/java/src/org/forester/go/GoId.java b/forester/java/src/org/forester/go/GoId.java new file mode 100644 index 0000000..3ba3fe5 --- /dev/null +++ b/forester/java/src/org/forester/go/GoId.java @@ -0,0 +1,83 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.go; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class GoId implements Comparable { + + private static final int SIZE = 10; + private static final String GO_PREFIX = "GO:"; + private static final String GO_FORMAT = GO_PREFIX + "\\d{7}"; + private static final Pattern GO_PATTERN = Pattern.compile( GO_FORMAT ); + private final String _id; + + public GoId( final String id ) { + if ( id.length() != SIZE ) { + throw new IllegalArgumentException( "unexpected format for GO id: " + id ); + } + final Matcher m = GO_PATTERN.matcher( id ); + if ( !m.matches() ) { + throw new IllegalArgumentException( "unexpected format for GO id: " + id ); + } + _id = id.substring( 3 ); + } + + public int compareTo( final GoId id ) { + return getId().compareTo( id.getId() ); + } + + @Override + public boolean equals( final Object o ) { + if ( this == o ) { + return true; + } + else if ( o == null ) { + throw new IllegalArgumentException( "attempt to check go id equality to null" ); + } + else if ( o.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to check go id equality to " + o + " [" + o.getClass() + "]" ); + } + else { + return getId().equals( ( ( GoId ) o ).getId() ); + } + } + + public String getId() { + return GO_PREFIX + _id; + } + + @Override + public int hashCode() { + return getId().hashCode(); + } + + @Override + public String toString() { + return getId(); + } +} diff --git a/forester/java/src/org/forester/go/GoNameSpace.java b/forester/java/src/org/forester/go/GoNameSpace.java new file mode 100644 index 0000000..fa76552 --- /dev/null +++ b/forester/java/src/org/forester/go/GoNameSpace.java @@ -0,0 +1,141 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.go; + +public class GoNameSpace { + + public final String MOLECULAR_FUNCTION_STR = "molecular_function"; + public final String BIOLOGICAL_PROCESS_STR = "biological_process"; + public final String CELLULAR_COMPONENT_STR = "cellular_component"; + public final String UNASSIGNED_STR = "unassigned"; + private final GoNamespaceType _type; + + public GoNameSpace( final GoNamespaceType type ) { + _type = type; + }; + + public GoNameSpace( final String type ) { + if ( type.toLowerCase().equals( MOLECULAR_FUNCTION_STR ) ) { + _type = GoNamespaceType.MOLECULAR_FUNCTION; + } + else if ( type.toLowerCase().equals( BIOLOGICAL_PROCESS_STR ) ) { + _type = GoNamespaceType.BIOLOGICAL_PROCESS; + } + else if ( type.toLowerCase().equals( CELLULAR_COMPONENT_STR ) ) { + _type = GoNamespaceType.CELLULAR_COMPONENT; + } + else if ( type.toLowerCase().equals( UNASSIGNED_STR ) ) { + _type = GoNamespaceType.UNASSIGNED; + } + else { + throw new IllegalArgumentException( "unknown GO namespace: " + type ); + } + } + + @Override + public boolean equals( final Object o ) { + if ( this == o ) { + return true; + } + else if ( ( o == null ) || ( o.getClass() != this.getClass() ) ) { + return false; + } + else { + return getType() == ( ( GoNameSpace ) o ).getType(); + } + } + + public GoNamespaceType getType() { + return _type; + } + + public boolean isBiologicalProcess() { + return getType() == GoNamespaceType.BIOLOGICAL_PROCESS; + } + + public boolean isCellularComponent() { + return getType() == GoNamespaceType.CELLULAR_COMPONENT; + } + + public boolean isMolecularFunction() { + return getType() == GoNamespaceType.MOLECULAR_FUNCTION; + } + + public boolean isUnassigned() { + return getType() == GoNamespaceType.UNASSIGNED; + } + + public String toShortString() { + switch ( _type ) { + case BIOLOGICAL_PROCESS: + return ( "B" ); + case CELLULAR_COMPONENT: + return ( "C" ); + case MOLECULAR_FUNCTION: + return ( "M" ); + case UNASSIGNED: + return ( "?" ); + default: + throw new RuntimeException(); + } + } + + @Override + public String toString() { + switch ( _type ) { + case BIOLOGICAL_PROCESS: + return ( BIOLOGICAL_PROCESS_STR ); + case CELLULAR_COMPONENT: + return ( CELLULAR_COMPONENT_STR ); + case MOLECULAR_FUNCTION: + return ( MOLECULAR_FUNCTION_STR ); + case UNASSIGNED: + return ( UNASSIGNED_STR ); + default: + throw new RuntimeException(); + } + } + + public static GoNameSpace createBiologicalProcess() { + return new GoNameSpace( GoNamespaceType.BIOLOGICAL_PROCESS ); + } + + public static GoNameSpace createCellularComponent() { + return new GoNameSpace( GoNamespaceType.CELLULAR_COMPONENT ); + } + + public static GoNameSpace createMolecularFunction() { + return new GoNameSpace( GoNamespaceType.MOLECULAR_FUNCTION ); + } + + public static GoNameSpace createUnassigned() { + return new GoNameSpace( GoNamespaceType.UNASSIGNED ); + } + + public static enum GoNamespaceType { + MOLECULAR_FUNCTION, BIOLOGICAL_PROCESS, CELLULAR_COMPONENT, UNASSIGNED; + } +} diff --git a/forester/java/src/org/forester/go/GoRelationship.java b/forester/java/src/org/forester/go/GoRelationship.java new file mode 100644 index 0000000..d7f5e79 --- /dev/null +++ b/forester/java/src/org/forester/go/GoRelationship.java @@ -0,0 +1,42 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. 
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.go; + +public interface GoRelationship extends Comparable { + + public static final String PART_OF_STR = "part_of"; + public static final String REGULATES_STR = "regulates"; + public static final String NEGATIVELY_REGULATES_STR = "negatively_regulates"; + public static final String POSITIVELY_REGULATES_STR = "positively_regulates"; + + public GoId getGoId(); + + public Type getType(); + + public static enum Type { + PART_OF, REGULATES, NEGATIVELY_REGULATES, POSITIVELY_REGULATES; + } +} diff --git a/forester/java/src/org/forester/go/GoSubset.java b/forester/java/src/org/forester/go/GoSubset.java new file mode 100644 index 0000000..c963217 --- /dev/null +++ b/forester/java/src/org/forester/go/GoSubset.java @@ -0,0 +1,44 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.go; + +public interface GoSubset extends Comparable { + + public static final String GOSLIM_GENERIC_STR = "goslim_generic"; + public static final String GOSLIM_GOA_STR = "goslim_goa"; + public static final String GOSLIM_PIR_STR = "goslim_pir"; + public static final String GOSUBSET_PROK_STR = "gosubset_prok"; + public static final String GOSLIM_CANDIDA_STR = "goslim_candida"; + public static final String GOSLIM_PLANT_STR = "goslim_plant"; + public static final String GOSLIM_YEAST_STR = "goslim_yeast"; + public static final String GOSLIM_POMBE_STR = "goslim_pombe"; + + public Type getType(); + + public static enum Type { + GOSLIM_GENERIC, GOSLIM_GOA, GOSLIM_PIR, GOSUBSET_PROK, GOSLIM_CANDIDA, GOSLIM_PLANT, GOSLIM_YEAST, GOSLIM_POMBE; + } +} diff --git a/forester/java/src/org/forester/go/GoTerm.java b/forester/java/src/org/forester/go/GoTerm.java new file mode 100644 index 0000000..7068ab6 --- /dev/null +++ b/forester/java/src/org/forester/go/GoTerm.java @@ -0,0 +1,55 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.go; + +import java.util.List; + +import org.forester.phylogeny.data.PhylogenyData; + +public interface GoTerm extends PhylogenyData, Comparable { + + public List getAltIds(); + + public String getComment(); + + public String getDefinition(); + + public GoId getGoId(); + + public GoNameSpace getGoNameSpace(); + + public List getGoRelationships(); + + public List getGoSubsets(); + + public List getGoXRefs(); + + public String getName(); + + public List getSuperGoIds(); + + public boolean isObsolete(); +} diff --git a/forester/java/src/org/forester/go/GoUtils.java b/forester/java/src/org/forester/go/GoUtils.java new file mode 100644 index 0000000..680489a --- /dev/null +++ b/forester/java/src/org/forester/go/GoUtils.java @@ -0,0 +1,221 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. 
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+//
+// Contact: phylosoft @ gmail . com
+// WWW: www.phylosoft.org/forester
+
+package org.forester.go;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.forester.util.ForesterUtil;
+
+public final class GoUtils {
+
+    private GoUtils() {
+    }
+
+    /**
+     * This is for counting how many times each GO term in 'categories'
+     * is a (direct or indirect) super term of the GO terms in 'experiment_set'.
+     *
+     * @param categories the set of super terms to be counted
+     * @param experiment_set the list of GO terms to be analyzed
+     * @param all_go_terms all terms in the ontology
+     * @return a map from each category GO id to the number of terms in 'experiment_set' of which it is a super term (or to which it is identical)
+     */
+    public static LinkedHashMap countCategories( final List categories,
+                                                 final List experiment_set,
+                                                 final Map all_go_terms ) {
+        final LinkedHashMap counts = new LinkedHashMap();
+        for( final GoTerm experiment_term : experiment_set ) {
+            final Set super_terms = getAllSuperGoTerms( experiment_term.getGoId(), all_go_terms );
+            super_terms.add( experiment_term );
+            for( final GoTerm cat : categories ) {
+                if ( !counts.containsKey( cat.getGoId() ) ) {
+                    counts.put( cat.getGoId(), 0 );
+                }
+                if ( super_terms.contains( cat ) ) {
+                    counts.put( cat.getGoId(), 1 + counts.get( cat.getGoId() ) );
+                }
+            }
+        }
+        return counts;
+    }
+
+    public static LinkedHashMap countCategoriesId( final List categories,
+                                                   final List experiment_set,
+                                                   final Map all_go_terms ) {
+        final LinkedHashMap counts = new LinkedHashMap();
+        for( final GoId experiment_id : experiment_set ) {
+            final Set super_ids = new HashSet();
+            for( final GoTerm term : getAllSuperGoTerms( experiment_id, all_go_terms ) ) {
+                super_ids.add( term.getGoId() );
+            }
+            super_ids.add( experiment_id );
+            for( final GoId cat : categories ) {
+                if ( !counts.containsKey( cat ) ) {
+                    counts.put( cat, 0 );
+                }
+                if ( super_ids.contains( cat ) ) {
+                    counts.put( cat, 1 + counts.get( cat ) );
+                }
+            }
+        }
+        return counts;
+    }
+
+    public static Map createGoIdToGoTermMap( final List go_terms ) {
+        final Map go_id_to_term_map = new HashMap();
+        for( final GoTerm go_term : go_terms ) {
+            go_id_to_term_map.put( go_term.getGoId(), go_term );
+            for( final GoId alt_id : go_term.getAltIds() ) {
+                go_id_to_term_map.put( alt_id, go_term );
+            }
+        }
+        return go_id_to_term_map;
+    }
+
+    public static SortedSet getAllSuperGoIds( final GoId go_id, final Map goid_to_term_map ) {
+        final SortedSet ids = new TreeSet();
+        final SortedSet terms = GoUtils.getAllSuperGoTerms( go_id, goid_to_term_map );
+        for( final GoTerm term : terms ) {
+            ids.add( term.getGoId() );
+        }
+        return ids;
+    }
+
+    public static SortedSet getAllSuperGoTerms( final GoId go_id, final List go_terms ) {
+        final Map goid_to_term_map =
GoUtils.createGoIdToGoTermMap( go_terms ); + return getAllSuperGoTerms( go_id, goid_to_term_map ); + } + + public static SortedSet getAllSuperGoTerms( final GoId go_id, final Map goid_to_term_map ) { + if ( !goid_to_term_map.containsKey( go_id ) ) { + throw new IllegalArgumentException( "GO id [" + go_id + "] not found in GO id to term map" ); + } + final GoTerm go_term = goid_to_term_map.get( go_id ); + return getAllSuperGoTerms( go_term, goid_to_term_map ); + } + + public static SortedSet getAllSuperGoTerms( final GoTerm go_term, final Map goid_to_term_map ) { + final SortedSet supers = new TreeSet(); + getAllSuperGoTerms( go_term, goid_to_term_map, supers ); + return supers; + } + + private static void getAllSuperGoTerms( final GoTerm go_term, + final Map goid_to_term_map, + final Set supers ) { + if ( ( go_term.getSuperGoIds() != null ) && ( go_term.getSuperGoIds().size() > 0 ) ) { + for( final GoId super_go_id : go_term.getSuperGoIds() ) { + if ( !goid_to_term_map.containsKey( super_go_id ) ) { + throw new IllegalArgumentException( "GO id [" + super_go_id + "] not found in GO id to term map" ); + } + final GoTerm super_go_term = goid_to_term_map.get( super_go_id ); + supers.add( super_go_term ); + getAllSuperGoTerms( super_go_term, goid_to_term_map, supers ); + } + } + } + + public static GoTerm getPenultimateGoTerm( final GoTerm go_term, final Map map ) { + GoTerm my_go_term = go_term; + GoTerm penultimate = my_go_term; + while ( ( my_go_term.getSuperGoIds() != null ) && ( my_go_term.getSuperGoIds().size() > 0 ) ) { + penultimate = my_go_term; + if ( !map.containsKey( my_go_term.getSuperGoIds().get( 0 ) ) ) { + throw new IllegalArgumentException( "GO-id [" + my_go_term.getSuperGoIds().get( 0 ) + + "] not found in map" ); + } + my_go_term = map.get( my_go_term.getSuperGoIds().get( 0 ) ); + } + return penultimate; + } + + public static GoTerm getUltimateGoTerm( final GoTerm go_term, final Map map ) { + GoTerm my_go_term = go_term; + while ( ( my_go_term.getSuperGoIds() != null ) && ( my_go_term.getSuperGoIds().size() > 0 ) ) { + if ( !map.containsKey( my_go_term.getSuperGoIds().get( 0 ) ) ) { + throw new IllegalArgumentException( "GO-id [" + my_go_term.getSuperGoIds().get( 0 ) + + "] not found in map" ); + } + my_go_term = map.get( my_go_term.getSuperGoIds().get( 0 ) ); + } + return my_go_term; + } + + public static SortedMap> parseGoIds( final Object source, + final String start_of_comment_line, + final String start_of_label_line ) throws IOException { + final Pattern label_matcher = Pattern.compile( start_of_label_line + "\\s*(.+?)" ); + final BufferedReader reader = ForesterUtil.obtainReader( source ); + final SortedMap> results = new TreeMap>(); + String line = ""; + String label = ""; + final boolean use_label = !ForesterUtil.isEmpty( start_of_label_line ); + final boolean use_comment = !ForesterUtil.isEmpty( start_of_comment_line ); + List current_list = new ArrayList(); + while ( ( line = reader.readLine() ) != null ) { + line = line.trim(); + if ( ForesterUtil.isEmpty( line ) || ( use_comment && line.startsWith( start_of_comment_line ) ) ) { + continue; + } + else if ( use_label && line.startsWith( start_of_label_line ) ) { + final Matcher matcher = label_matcher.matcher( line ); + if ( matcher.matches() ) { + if ( !ForesterUtil.isEmpty( label ) ) { + results.put( label, current_list ); + current_list = new ArrayList(); + } + label = matcher.group( 1 ); + } + } + else { + final String[] s = line.split( "\\s+" ); + final GoId id = new GoId( s[ 0 ] ); + current_list.add( id ); + 
} + } + if ( ForesterUtil.isEmpty( label ) ) { + label = ""; + } + results.put( label, current_list ); + reader.close(); + return results; + } +} diff --git a/forester/java/src/org/forester/go/GoXRef.java b/forester/java/src/org/forester/go/GoXRef.java new file mode 100644 index 0000000..fe96c4b --- /dev/null +++ b/forester/java/src/org/forester/go/GoXRef.java @@ -0,0 +1,67 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.go; + +public interface GoXRef extends Comparable { + + public static final String EC_STR = "EC"; + public static final String META_CYC_STR = "MetaCyc"; + public static final String REACTOME_STR = "Reactome"; + public static final String RESID_STR = "RESID"; + public static final String UM_BBD_ENZYME_ID_STR = "UM-BBD_enzymeID"; + public static final String UM_BBD_PATHWAY_ID_STR = "UM-BBD_pathwayID"; + public static final String UM_BBD_REACTIONID_STR = "UM-BBD_reactionID"; + public static final String TC_STR = "TC"; + public static final String ARACYC_STR = "AraCyc"; + public static final String XX_STR = "XX"; + public static final String PMID_STR = "PMID"; + public static final String IMG_STR = "IMG"; + public static final String GOC_STR = "GOC"; + public static final String WIKIPEDIA_STR = "Wikipedia"; + public static final String KEGG_STR = "KEGG"; + + public Type getType(); + + public String getXRef(); + + public static enum Type { + EC, + META_CYC, + REACTOME, + RESID, + UM_BBD_ENZYME_ID, + UM_BBD_PATHWAY_ID, + UM_BBD_REACTIONID, + TC, + ARACYC, + XX, + PMID, + IMG, + GOC, + WIKIPEDIA, + KEGG; + } +} diff --git a/forester/java/src/org/forester/go/Mapping.java b/forester/java/src/org/forester/go/Mapping.java new file mode 100644 index 0000000..9a2fbb5 --- /dev/null +++ b/forester/java/src/org/forester/go/Mapping.java @@ -0,0 +1,33 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. 
+// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.go; + +public interface Mapping extends Comparable { + + public Object getKey(); + + public GoId getValue(); +} diff --git a/forester/java/src/org/forester/go/OBOparser.java b/forester/java/src/org/forester/go/OBOparser.java new file mode 100644 index 0000000..b79e172 --- /dev/null +++ b/forester/java/src/org/forester/go/OBOparser.java @@ -0,0 +1,271 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.go; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.forester.util.ForesterUtil; + +public class OBOparser { + + private final File _input_file; ; + private final ReturnType _return_type; + private int _go_term_count; + + public OBOparser( final File input_file, final ReturnType return_type ) { + switch ( return_type ) { + case BASIC_GO_TERM: + break; + default: + throw new IllegalArgumentException( "unknown return type: " + return_type ); + } + _input_file = input_file; + _return_type = return_type; + init(); + } + + private GoTerm createNewBasicGoTerm( final String id, + final String name, + final String namespace, + final String is_obsolete, + final String comment, + final String definition, + final Set alt_ids, + final List go_xrefs, + final List super_go_ids, + final List go_relationships, + final List go_subsets ) { + final GoTerm gt = new BasicGoTerm( id, name, namespace, is_obsolete.trim().toLowerCase().equals( "true" ) ); + ( ( BasicGoTerm ) gt ).setComment( comment ); + ( ( BasicGoTerm ) gt ).setDefinition( definition ); + for( final GoXRef x : go_xrefs ) { + gt.getGoXRefs().add( x ); + } + for( final GoId s : super_go_ids ) { + gt.getSuperGoIds().add( s ); + } + for( final GoRelationship r : go_relationships ) { + gt.getGoRelationships().add( r ); + } + for( final GoSubset sub : go_subsets ) { + gt.getGoSubsets().add( sub ); + } + for( final String alt_id : alt_ids ) { + gt.getAltIds().add( new GoId( alt_id ) ); + } + ++_go_term_count; + return gt; + } + + private void createNewGoTerm( final List go_terms, + final String id, + final String name, + final String namespace, + final String is_obsolete, + final String comment, + final String definition, + final Set alt_ids, + final List go_xrefs, + final List super_go_ids, + final List go_relationships, + final List go_subsets ) { + GoTerm gt; + switch ( getReturnType() ) { + case BASIC_GO_TERM: + gt = createNewBasicGoTerm( id, + name, + namespace, + is_obsolete, + comment, + definition, + alt_ids, + go_xrefs, + super_go_ids, + go_relationships, + go_subsets ); + break; + default: + throw new AssertionError( "unknown return type: " + getReturnType() ); + } + go_terms.add( gt ); + } + + public int getGoTermCount() { + return _go_term_count; + } + + private File getInputFile() { + return _input_file; + } + + private ReturnType getReturnType() { + return _return_type; + } + + private void init() { + setGoTermCount( 0 ); + } + + public List parse() throws IOException { + final String error = ForesterUtil.isReadableFile( getInputFile() ); + if ( !ForesterUtil.isEmpty( error ) ) { + throw new IOException( error ); + } + final BufferedReader br = new BufferedReader( new FileReader( getInputFile() ) ); + String line; + final List go_terms = new ArrayList(); + int line_number = 0; + boolean in_term = false; + String id = ""; + String name = ""; + String namespace = ""; + String def = ""; + String comment = ""; + String is_obsolete = ""; + HashSet alt_ids = new HashSet(); + List super_go_ids = new ArrayList(); + List go_xrefs = new ArrayList(); + List go_relationships = new ArrayList(); + List go_subsets = new ArrayList(); + try { + while ( ( line = br.readLine() ) != null ) { + line_number++; + line = line.trim(); + if ( line.length() < 1 ) { + if ( in_term ) { + in_term = false; + } + } + else if ( 
line.startsWith( "[Term]" ) ) { + in_term = true; + if ( id.length() > 0 ) { + createNewGoTerm( go_terms, + id, + name, + namespace, + is_obsolete, + comment, + def, + alt_ids, + go_xrefs, + super_go_ids, + go_relationships, + go_subsets ); + } + id = ""; + name = ""; + namespace = ""; + alt_ids = new HashSet(); + def = ""; + comment = ""; + is_obsolete = ""; + super_go_ids = new ArrayList(); + go_xrefs = new ArrayList(); + go_relationships = new ArrayList(); + go_subsets = new ArrayList(); + } + else if ( in_term && line.startsWith( "id:" ) ) { + id = line.substring( 3 ).trim(); + } + else if ( in_term && line.startsWith( "name:" ) ) { + name = line.substring( 5 ).trim(); + } + else if ( in_term && line.startsWith( "namespace:" ) ) { + namespace = line.substring( 10 ).trim(); + } + else if ( in_term && line.startsWith( "alt_id:" ) ) { + alt_ids.add( line.substring( 7 ).trim() ); + } + else if ( in_term && line.startsWith( "def:" ) ) { + def = line.substring( 4 ).trim(); + } + else if ( in_term && line.startsWith( "is_obsolete:" ) ) { + is_obsolete = line.substring( 12 ).trim(); + } + else if ( in_term && line.startsWith( "comment:" ) ) { + comment = line.substring( 8 ).trim(); + } + else if ( in_term && line.startsWith( "xref:" ) ) { + final String s = trimOffComment( line.substring( 5 ).trim() ); + go_xrefs.add( new BasicGoXRef( s ) ); + } + else if ( in_term && line.startsWith( "is_a:" ) ) { + final String s = trimOffComment( line.substring( 5 ).trim() ); + super_go_ids.add( new GoId( s ) ); + } + else if ( in_term && line.startsWith( "relationship:" ) ) { + final String s = trimOffComment( line.substring( 13 ).trim() ); + go_relationships.add( new BasicGoRelationship( s ) ); + } + else if ( in_term && line.startsWith( "subset:" ) ) { + final String s = line.substring( 8 ).trim(); + go_subsets.add( new BasicGoSubset( s ) ); + } + } // while ( ( line = br.readLine() ) != null ) + } + catch ( final Exception e ) { + throw new IOException( "parsing problem: " + e.getMessage() + " [at line " + line_number + "]" ); + } + if ( id.length() > 0 ) { + createNewGoTerm( go_terms, + id, + name, + namespace, + is_obsolete, + comment, + def, + alt_ids, + go_xrefs, + super_go_ids, + go_relationships, + go_subsets ); + } + return go_terms; + } + + private void setGoTermCount( final int go_term_count ) { + _go_term_count = go_term_count; + } + + private String trimOffComment( String xref ) { + final int i = xref.indexOf( '!' ); + if ( i > 0 ) { + xref = xref.substring( 0, xref.indexOf( '!' ) ).trim(); + } + return xref; + } + + public static enum ReturnType { + BASIC_GO_TERM + } +} diff --git a/forester/java/src/org/forester/go/PfamToGoMapping.java b/forester/java/src/org/forester/go/PfamToGoMapping.java new file mode 100644 index 0000000..93ee62a --- /dev/null +++ b/forester/java/src/org/forester/go/PfamToGoMapping.java @@ -0,0 +1,89 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. 
+// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.go; + +import org.forester.surfacing.DomainId; + +public class PfamToGoMapping implements Mapping { + + private final DomainId _pfam_domain_id; + private final GoId _go_id; + + public PfamToGoMapping( final DomainId pfam_domain_id, final GoId go_id ) { + _pfam_domain_id = pfam_domain_id; + _go_id = go_id; + } + + @Override + public int compareTo( final Mapping m ) { + if ( this == m ) { + return 0; + } + return getKey().compareTo( ( DomainId ) m.getKey() ); + } + + /** + * Based on key and value. + * + * + */ + @Override + public boolean equals( final Object o ) { + if ( this == o ) { + return true; + } + else if ( o == null ) { + throw new IllegalArgumentException( "attempt to check pfam to go mapping equality to null" ); + } + else if ( o.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to check pfam to go mapping equality to " + o + " [" + + o.getClass() + "]" ); + } + else { + return getKey().equals( ( ( PfamToGoMapping ) o ).getKey() ) + && getValue().equals( ( ( PfamToGoMapping ) o ).getValue() ); + } + } + + @Override + public DomainId getKey() { + return _pfam_domain_id; + } + + @Override + public GoId getValue() { + return _go_id; + } + + @Override + public String toString() { + final StringBuffer sb = new StringBuffer(); + sb.append( getKey().toString() ); + sb.append( " > " ); + sb.append( getValue().toString() ); + return sb.toString(); + } +} diff --git a/forester/java/src/org/forester/go/PfamToGoParser.java b/forester/java/src/org/forester/go/PfamToGoParser.java new file mode 100644 index 0000000..11a6fc4 --- /dev/null +++ b/forester/java/src/org/forester/go/PfamToGoParser.java @@ -0,0 +1,100 @@ + +package org.forester.go; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.forester.surfacing.DomainId; +import org.forester.util.ForesterUtil; + +public class PfamToGoParser { + + // Pfam:PF00001 7tm_1 > GO:rhodopsin-like receptor activity ; GO:0001584 + private static final String PFAM_TO_GO_FORMAT = "Pfam:\\S+\\s+(\\S+)\\s*>\\s*GO:.+;\\s*(\\S+)"; + private static final Pattern PFAM_TO_GO_PATTERN = Pattern.compile( PFAM_TO_GO_FORMAT ); + private static final String PFAMACC_TO_GO_FORMAT = "Pfam:(\\S+)\\s+\\S+\\s*>\\s*GO:.+;\\s*(\\S+)"; + private static final Pattern PFAMACC_TO_GO_PATTERN = Pattern.compile( PFAMACC_TO_GO_FORMAT ); + private final File _input_file; + private int _mapping_count; + private boolean _use_acc; + + public PfamToGoParser( final File input_file ) { + _input_file = input_file; + init(); + } + + private File getInputFile() { + return _input_file; + } + + public int getMappingCount() { + return _mapping_count; + } + + private void init() { + setMappingCount( 0 ); + setUseAccessors( false ); + } + + public boolean isUseAccessors() { + return _use_acc; + } + + 
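    // --- Illustrative sketch (editor's addition, not part of the original patch) ---
    // Shows how PFAM_TO_GO_PATTERN above is expected to decompose the sample mapping
    // line quoted in the format comment at the top of this class; the method name is
    // hypothetical and only meant as a usage illustration.
    private static void demonstratePfamToGoPattern() {
        final Matcher m = PFAM_TO_GO_PATTERN
                .matcher( "Pfam:PF00001 7tm_1 > GO:rhodopsin-like receptor activity ; GO:0001584" );
        if ( m.matches() ) {
            System.out.println( "pfam name: " + m.group( 1 ) ); // prints "7tm_1"
            System.out.println( "GO id:     " + m.group( 2 ) ); // prints "GO:0001584"
        }
    }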
public List parse() throws IOException { + final String error = ForesterUtil.isReadableFile( getInputFile() ); + if ( !ForesterUtil.isEmpty( error ) ) { + throw new IOException( error ); + } + final BufferedReader br = new BufferedReader( new FileReader( getInputFile() ) ); + String line; + final List mappings = new ArrayList(); + int line_number = 0; + try { + while ( ( line = br.readLine() ) != null ) { + line_number++; + line = line.trim(); + if ( ( line.length() > 0 ) && !line.startsWith( "!" ) ) { + Matcher m = null; + if ( isUseAccessors() ) { + m = PFAMACC_TO_GO_PATTERN.matcher( line ); + } + else { + m = PFAM_TO_GO_PATTERN.matcher( line ); + } + if ( !m.matches() ) { + throw new IOException( "unexpected format [\"" + line + "\"]" ); + } + if ( m.groupCount() != 2 ) { + throw new IOException( "unexpected format [\"" + line + "\"]" ); + } + final String pfam = m.group( 1 ); + final String go = m.group( 2 ); + if ( ForesterUtil.isEmpty( pfam ) || ForesterUtil.isEmpty( go ) ) { + throw new IOException( "unexpected format [\"" + line + "\"]" ); + } + final PfamToGoMapping map = new PfamToGoMapping( new DomainId( pfam ), new GoId( go ) ); + ++_mapping_count; + mappings.add( map ); + } + } // while ( ( line = br.readLine() ) != null ) + } + catch ( final Exception e ) { + throw new IOException( "parsing problem: " + e.getMessage() + " [at line " + line_number + "]" ); + } + return mappings; + } + + private void setMappingCount( final int mapping_count ) { + _mapping_count = mapping_count; + } + + public void setUseAccessors( final boolean use_ids ) { + _use_acc = use_ids; + } +} diff --git a/forester/java/src/org/forester/go/TestGo.java b/forester/java/src/org/forester/go/TestGo.java new file mode 100644 index 0000000..4fdcd29 --- /dev/null +++ b/forester/java/src/org/forester/go/TestGo.java @@ -0,0 +1,698 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.go; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.SortedSet; + +import org.forester.surfacing.DomainId; +import org.forester.util.ForesterUtil; + +public class TestGo { + + private final static double ZERO_DIFF = 1.0E-9; + + public static boolean isEqual( final double a, final double b ) { + return ( ( Math.abs( a - b ) ) < ZERO_DIFF ); + } + + public static boolean test( final File test_dir ) { + System.out.print( " GO ID: " ); + if ( !testGoId() ) { + System.out.println( "failed." 
); + return false; + } + System.out.println( "OK." ); + System.out.print( " Namespace: " ); + if ( !testNamespace() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Basic GO term: " ); + if ( !testBasicGoTerm() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " OBO parser: " ); + if ( !testOBOparser( test_dir ) ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Pfam to GO mapping: " ); + if ( !testPfamToGoMapping() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Pfam to GO parser: " ); + if ( !testPfamToGoParser( test_dir ) ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Super terms: " ); + if ( !testSuperTermGetting( test_dir ) ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Super term counting: " ); + if ( !testSuperTermCounting( test_dir ) ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + return true; + } + + private static boolean testBasicGoTerm() { + try { + final GoTerm gt1 = new BasicGoTerm( "GO:0047579", + "4-hydroxymandelate oxidase activity", + "molecular_function", + false ); + final GoTerm gt2 = new BasicGoTerm( "GO:0047579", + "4-hydroxymandelate oxidase activity", + "molecular_function", + false ); + final GoTerm gt3 = new BasicGoTerm( "GO:0047579", "?", "molecular_function", true ); + final GoTerm gt4 = new BasicGoTerm( "GO:0047579", + "4-hydroxymandelate oxidase activity", + "biological_process", + false ); + final GoTerm gt5 = new BasicGoTerm( "GO:0047578", + "4-hydroxymandelate oxidase activity", + "molecular_function", + false ); + if ( !gt1.equals( gt2 ) ) { + return false; + } + if ( !gt1.equals( gt3 ) ) { + return false; + } + if ( gt1.equals( gt4 ) ) { + return false; + } + if ( gt1.hashCode() != gt4.hashCode() ) { + return false; + } + if ( gt1.equals( gt5 ) ) { + return false; + } + final GoTerm gt6 = ( GoTerm ) gt5.copy(); + if ( !gt6.equals( gt5 ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testGoId() { + try { + final GoId id1 = new GoId( "GO:0042617" ); + final GoId id2 = new GoId( "GO:0042630" ); + final GoId id3 = new GoId( "GO:0042630" ); + if ( id1.equals( id2 ) ) { + return false; + } + if ( !id2.equals( id3 ) ) { + return false; + } + if ( !id1.toString().equals( "GO:0042617" ) ) { + return false; + } + if ( id2.hashCode() != id3.hashCode() ) { + return false; + } + if ( id1.hashCode() == id2.hashCode() ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testNamespace() { + try { + final GoNameSpace b = new GoNameSpace( "Biological_process" ); + final GoNameSpace c = new GoNameSpace( "Cellular_Component" ); + final GoNameSpace m = new GoNameSpace( "molecular_function" ); + final GoNameSpace m2 = new GoNameSpace( GoNameSpace.GoNamespaceType.MOLECULAR_FUNCTION ); + if ( b.equals( c ) ) { + return false; + } + if ( !m.equals( m2 ) ) { + return false; + } + if ( !b.toString().equals( "biological_process" ) ) { + return false; + } + if ( !c.toString().equals( "cellular_component" ) ) { + return false; + } + if ( 
!m.toString().equals( "molecular_function" ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testOBOparser( final File test_dir ) { + try { + final OBOparser parser = new OBOparser( new File( test_dir + ForesterUtil.getFileSeparator() + "obo_test" ), + OBOparser.ReturnType.BASIC_GO_TERM ); + final List go_terms = parser.parse(); + if ( parser.getGoTermCount() != 26 ) { + return false; + } + final GoTerm g0 = go_terms.get( 0 ); + final GoTerm g1 = go_terms.get( 1 ); + final GoTerm g3 = go_terms.get( 2 ); + final GoTerm g2 = go_terms.get( 25 ); + if ( !g0.getComment().equals( "" ) ) { + return false; + } + if ( !g0 + .getDefinition() + .equals( "\"The distribution of mitochondria, including the mitochondrial genome, into daughter cells after mitosis or meiosis, mediated by interactions between mitochondria and the cytoskeleton.\" [GOC:mcc, PMID:10873824, PMID:11389764]" ) ) { + return false; + } + if ( !g0.getGoId().getId().equals( "GO:0000001" ) ) { + return false; + } + if ( g0.getGoNameSpace().equals( GoNameSpace.GoNamespaceType.BIOLOGICAL_PROCESS ) ) { + return false; + } + if ( g0.getGoNameSpace().getType() != GoNameSpace.GoNamespaceType.BIOLOGICAL_PROCESS ) { + return false; + } + if ( g0.getGoRelationships().size() != 0 ) { + return false; + } + if ( g0.getGoXRefs().size() != 0 ) { + return false; + } + if ( !g0.getName().equals( "mitochondrion inheritance" ) ) { + return false; + } + if ( g0.getSuperGoIds().size() != 2 ) { + return false; + } + if ( !g0.isObsolete() ) { + return false; + } + if ( !g1.getComment().equals( "comment" ) ) { + return false; + } + if ( !g1 + .getDefinition() + .equals( "\"The maintenance of the structure and integrity of the mitochondrial genome.\" [GOC:ai]" ) ) { + return false; + } + if ( !g1.getGoId().getId().equals( "GO:0000002" ) ) { + return false; + } + if ( g1.getGoNameSpace().equals( GoNameSpace.GoNamespaceType.BIOLOGICAL_PROCESS ) ) { + return false; + } + if ( g1.getGoNameSpace().getType() != GoNameSpace.GoNamespaceType.BIOLOGICAL_PROCESS ) { + return false; + } + if ( g1.getGoRelationships().size() != 1 ) { + return false; + } + if ( g1.getGoXRefs().size() != 5 ) { + return false; + } + if ( !g1.getName().equals( "mitochondrial genome maintenance" ) ) { + return false; + } + if ( g1.getSuperGoIds().size() != 1 ) { + return false; + } + if ( g1.isObsolete() ) { + return false; + } + if ( !g1.getGoXRefs().get( 0 ).equals( new BasicGoXRef( "EC:2.4.1.-" ) ) ) { + return false; + } + if ( !g1.getGoXRefs().get( 0 ).getXRef().equals( "2.4.1.-" ) ) { + return false; + } + if ( g1.getGoXRefs().get( 0 ).getType() != GoXRef.Type.EC ) { + return false; + } + if ( g1.getGoXRefs().get( 0 ).equals( new BasicGoXRef( "EC:2.4.1.1" ) ) ) { + return false; + } + if ( g1.getGoXRefs().get( 0 ).equals( new BasicGoXRef( "Reactome:2.4.1.-" ) ) ) { + return false; + } + if ( !g1.getGoXRefs().get( 1 ).equals( new BasicGoXRef( "Reactome:7672" ) ) ) { + return false; + } + if ( !g1.getGoXRefs().get( 2 ).equals( new BasicGoXRef( "MetaCyc:SIROHEME-FERROCHELAT-RXN" ) ) ) { + return false; + } + if ( !g1.getGoXRefs().get( 3 ).equals( new BasicGoXRef( "RESID:AA02376" ) ) ) { + return false; + } + if ( !g1.getGoXRefs().get( 4 ).equals( new BasicGoXRef( "UM-BBD_enzymeID:e0271" ) ) ) { + return false; + } + if ( !g1.getGoRelationships().get( 0 ).equals( new BasicGoRelationship( "part_of GO:0007052" ) ) ) { + return false; + } + if ( !g1.getGoRelationships().get( 0 
).getGoId().equals( new GoId( "GO:0007052" ) ) ) { + return false; + } + if ( !g1.getGoRelationships().get( 0 ).getGoId().getId().equals( "GO:0007052" ) ) { + return false; + } + if ( g1.getGoRelationships().get( 0 ).getType() != GoRelationship.Type.PART_OF ) { + return false; + } + if ( g1.getGoRelationships().get( 0 ).equals( new BasicGoRelationship( "part_of GO:1007052" ) ) ) { + return false; + } + if ( !g1.getSuperGoIds().get( 0 ).equals( new GoId( "GO:0007005" ) ) ) { + return false; + } + if ( g1.getSuperGoIds().get( 0 ).equals( new GoId( "GO:1007005" ) ) ) { + return false; + } + if ( !g2.getGoId().getId().equals( "GO:0000030" ) ) { + return false; + } + if ( !g2.getGoId().equals( new GoId( "GO:0000030" ) ) ) { + return false; + } + if ( g2.getGoId().getId().equals( "GO:0000031" ) ) { + return false; + } + if ( g2.getGoId().equals( new GoId( "GO:0000031" ) ) ) { + return false; + } + if ( g3.getGoSubsets().size() != 3 ) { + return false; + } + if ( !g3.getGoSubsets().contains( new BasicGoSubset( "goslim_generic" ) ) ) { + return false; + } + if ( !g3.getGoSubsets().contains( new BasicGoSubset( "goslim_plant" ) ) ) { + return false; + } + if ( !g3.getGoSubsets().contains( new BasicGoSubset( "gosubset_prok" ) ) ) { + return false; + } + if ( g3.getGoSubsets().contains( new BasicGoSubset( "goslim_candida" ) ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testPfamToGoMapping() { + try { + final PfamToGoMapping pg0 = new PfamToGoMapping( new DomainId( "A" ), new GoId( "GO:0000001" ) ); + final PfamToGoMapping pg1 = new PfamToGoMapping( new DomainId( "A" ), new GoId( "GO:0000001" ) ); + final PfamToGoMapping pg2 = new PfamToGoMapping( new DomainId( "B" ), new GoId( "GO:0000001" ) ); + final PfamToGoMapping pg3 = new PfamToGoMapping( new DomainId( "A" ), new GoId( "GO:0000002" ) ); + final PfamToGoMapping pg4 = new PfamToGoMapping( new DomainId( "B" ), new GoId( "GO:0000002" ) ); + if ( !pg0.equals( pg0 ) ) { + return false; + } + if ( !pg0.equals( pg1 ) ) { + return false; + } + if ( pg0.equals( pg2 ) ) { + return false; + } + if ( pg0.equals( pg3 ) ) { + return false; + } + if ( pg0.equals( pg4 ) ) { + return false; + } + if ( pg0.compareTo( pg3 ) != 0 ) { + return false; + } + if ( pg0.compareTo( pg2 ) >= 0 ) { + return false; + } + if ( pg2.compareTo( pg0 ) <= 0 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testPfamToGoParser( final File test_dir ) { + try { + final PfamToGoParser parser = new PfamToGoParser( new File( test_dir + ForesterUtil.getFileSeparator() + + "pfam_to_go_test" ) ); + final List mappings = parser.parse(); + if ( parser.getMappingCount() != 426 ) { + return false; + } + if ( mappings.size() != 426 ) { + return false; + } + final PfamToGoMapping m0 = mappings.get( 0 ); + final PfamToGoMapping m1 = mappings.get( 1 ); + final PfamToGoMapping m2 = mappings.get( 2 ); + final PfamToGoMapping m3 = mappings.get( 3 ); + final PfamToGoMapping m4 = mappings.get( 4 ); + final PfamToGoMapping m5 = mappings.get( 5 ); + final PfamToGoMapping m424 = mappings.get( 424 ); + final PfamToGoMapping m425 = mappings.get( 425 ); + if ( !m0.getKey().equals( new DomainId( "7tm_1" ) ) ) { + return false; + } + if ( !m0.getValue().equals( new GoId( "GO:0001584" ) ) ) { + return false; + } + if ( m0.getKey().equals( new DomainId( "7tm_x" ) ) ) { + return false; + } + 
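// Editor's note: m0 is the first mapping read from the test file and corresponds to the
// sample line quoted in PfamToGoParser's format comment, i.e. 7tm_1 -> GO:0001584.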
if ( m0.getValue().equals( new GoId( "GO:0001585" ) ) ) { + return false; + } + if ( !m1.getKey().equals( new DomainId( "7tm_1" ) ) ) { + return false; + } + if ( !m1.getValue().equals( new GoId( "GO:0007186" ) ) ) { + return false; + } + if ( !m2.getKey().equals( new DomainId( "7tm_1" ) ) ) { + return false; + } + if ( !m2.getValue().equals( new GoId( "GO:0016021" ) ) ) { + return false; + } + if ( !m3.getKey().equals( new DomainId( "7tm_2" ) ) ) { + return false; + } + if ( !m3.getValue().equals( new GoId( "GO:0004930" ) ) ) { + return false; + } + if ( !m4.getKey().equals( new DomainId( "7tm_2" ) ) ) { + return false; + } + if ( !m4.getValue().equals( new GoId( "GO:0016020" ) ) ) { + return false; + } + if ( !m5.getKey().equals( new DomainId( "7tm_3" ) ) ) { + return false; + } + if ( !m5.getValue().equals( new GoId( "GO:0008067" ) ) ) { + return false; + } + if ( !m424.getKey().equals( new DomainId( "OMPdecase" ) ) ) { + return false; + } + if ( !m424.getValue().equals( new GoId( "GO:0006207" ) ) ) { + return false; + } + if ( !m425.getKey().equals( new DomainId( "Bac_DNA_binding" ) ) ) { + return false; + } + if ( !m425.getValue().equals( new GoId( "GO:0003677" ) ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testSuperTermCounting( final File test_dir ) { + try { + final OBOparser parser = new OBOparser( new File( test_dir + ForesterUtil.getFileSeparator() + + "gene_ontology_edit.obo" ), OBOparser.ReturnType.BASIC_GO_TERM ); + final List all_go_terms = parser.parse(); + if ( parser.getGoTermCount() != 27748 ) { + return false; + } + final Map goid_to_term_map = GoUtils.createGoIdToGoTermMap( all_go_terms ); + final List categories = new ArrayList(); + final List experiment_set = new ArrayList(); + experiment_set.add( new BasicGoTerm( new GoId( "GO:0005690" ), "snRNP U4atac", GoNameSpace + .createUnassigned(), false ) ); + experiment_set.add( new BasicGoTerm( new GoId( "GO:0009698" ), + "phenylpropanoid metabolic process", + GoNameSpace.createUnassigned(), + false ) ); + experiment_set.add( new BasicGoTerm( new GoId( "GO:0008150" ), "biological_process", GoNameSpace + .createUnassigned(), false ) ); + experiment_set.add( new BasicGoTerm( new GoId( "GO:0006915" ), + "apoptosis", + GoNameSpace.createUnassigned(), + false ) ); + experiment_set.add( new BasicGoTerm( new GoId( "GO:0001783" ), "B cell apoptosis", GoNameSpace + .createUnassigned(), false ) ); + experiment_set.add( new BasicGoTerm( new GoId( "GO:0010657" ), "muscle cell apoptosis", GoNameSpace + .createUnassigned(), false ) ); + experiment_set.add( new BasicGoTerm( new GoId( "GO:0010657" ), "muscle cell apoptosis", GoNameSpace + .createUnassigned(), false ) ); + experiment_set.add( new BasicGoTerm( new GoId( "GO:0010658" ), + "striated muscle cell apoptosis", + GoNameSpace.createUnassigned(), + false ) ); + experiment_set.add( new BasicGoTerm( new GoId( "GO:0043065" ), + "positive regulation of apoptosis", + GoNameSpace.createUnassigned(), + false ) ); + categories + .add( new BasicGoTerm( new GoId( "GO:0016265" ), "death", GoNameSpace.createUnassigned(), false ) ); + categories.add( new BasicGoTerm( new GoId( "GO:0006915" ), + "apoptosis", + GoNameSpace.createUnassigned(), + false ) ); + categories.add( new BasicGoTerm( new GoId( "GO:0008150" ), "biological_process", GoNameSpace + .createUnassigned(), false ) ); + categories.add( new BasicGoTerm( new GoId( "GO:0010657" ), "muscle cell apoptosis", GoNameSpace + 
.createUnassigned(), false ) ); + categories.add( new BasicGoTerm( new GoId( "GO:0010658" ), "striated muscle cell apoptosis", GoNameSpace + .createUnassigned(), false ) ); + categories.add( new BasicGoTerm( new GoId( "GO:0046242" ), "o-xylene biosynthetic process", GoNameSpace + .createUnassigned(), false ) ); + categories.add( new BasicGoTerm( new GoId( "GO:0016326" ), "kinesin motor activity", GoNameSpace + .createUnassigned(), false ) ); + categories.add( new BasicGoTerm( new GoId( "GO:0005575" ), "cellular_component", GoNameSpace + .createUnassigned(), false ) ); + categories.add( new BasicGoTerm( new GoId( "GO:0032502" ), "developmental process", GoNameSpace + .createUnassigned(), false ) ); + categories.add( new BasicGoTerm( new GoId( "GO:0051094" ), + "positive regulation of developmental process", + GoNameSpace.createUnassigned(), + false ) ); + categories.add( new BasicGoTerm( new GoId( "GO:0048522" ), + "positive regulation of cellular process", + GoNameSpace.createUnassigned(), + false ) ); + final Map counts = GoUtils.countCategories( categories, experiment_set, goid_to_term_map ); + // death + if ( counts.get( new GoId( "GO:0016265" ) ) != 5 ) { + return false; + } + // apoptosis + if ( counts.get( new GoId( "GO:0006915" ) ) != 5 ) { + return false; + } + // biological_process + if ( counts.get( new GoId( "GO:0008150" ) ) != 8 ) { + return false; + } + // muscle cell apoptosis + if ( counts.get( new GoId( "GO:0010657" ) ) != 3 ) { + return false; + } + // striated muscle cell apoptosis + if ( counts.get( new GoId( "GO:0010658" ) ) != 1 ) { + return false; + } + // o-xylene biosynthetic process + if ( counts.get( new GoId( "GO:0046242" ) ) != 0 ) { + return false; + } + // kinesin motor activity + if ( counts.get( new GoId( "GO:0016326" ) ) != 0 ) { + return false; + } + // cellular_component + if ( counts.get( new GoId( "GO:0005575" ) ) != 1 ) { + return false; + } + // developmental process + if ( counts.get( new GoId( "GO:0032502" ) ) != 5 ) { + return false; + } + // positive regulation of developmental process + if ( counts.get( new GoId( "GO:0051094" ) ) != 1 ) { + return false; + } + // positive regulation of cellular process + if ( counts.get( new GoId( "GO:0048522" ) ) != 1 ) { + return false; + } + final List categories_id = new ArrayList(); + final List experiment_set_id = new ArrayList(); + experiment_set_id.add( new GoId( "GO:0005690" ) ); + experiment_set_id.add( new GoId( "GO:0009698" ) ); + experiment_set_id.add( new GoId( "GO:0008150" ) ); + experiment_set_id.add( new GoId( "GO:0006915" ) ); + experiment_set_id.add( new GoId( "GO:0001783" ) ); + experiment_set_id.add( new GoId( "GO:0010657" ) ); + experiment_set_id.add( new GoId( "GO:0010657" ) ); + experiment_set_id.add( new GoId( "GO:0010658" ) ); + categories_id.add( new GoId( "GO:0016265" ) ); + categories_id.add( new GoId( "GO:0006915" ) ); + categories_id.add( new GoId( "GO:0008150" ) ); + categories_id.add( new GoId( "GO:0010657" ) ); + categories_id.add( new GoId( "GO:0010658" ) ); + categories_id.add( new GoId( "GO:0046242" ) ); + categories_id.add( new GoId( "GO:0016326" ) ); + categories_id.add( new GoId( "GO:0005575" ) ); + final Map counts_id = GoUtils.countCategoriesId( categories_id, + experiment_set_id, + goid_to_term_map ); + // death + if ( counts_id.get( new GoId( "GO:0016265" ) ) != 5 ) { + return false; + } + // apoptosis + if ( counts_id.get( new GoId( "GO:0006915" ) ) != 5 ) { + return false; + } + // biological_process + if ( counts_id.get( new GoId( "GO:0008150" ) ) != 7 ) { + return 
false; + } + // muscle cell apoptosis + if ( counts_id.get( new GoId( "GO:0010657" ) ) != 3 ) { + return false; + } + // striated muscle cell apoptosis + if ( counts_id.get( new GoId( "GO:0010658" ) ) != 1 ) { + return false; + } + // o-xylene biosynthetic process + if ( counts_id.get( new GoId( "GO:0046242" ) ) != 0 ) { + return false; + } + // kinesin motor activity + if ( counts_id.get( new GoId( "GO:0016326" ) ) != 0 ) { + return false; + } + // cellular_componen + if ( counts_id.get( new GoId( "GO:0005575" ) ) != 1 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testSuperTermGetting( final File test_dir ) { + try { + final OBOparser parser = new OBOparser( new File( test_dir + ForesterUtil.getFileSeparator() + + "gene_ontology_edit.obo" ), OBOparser.ReturnType.BASIC_GO_TERM ); + final List go_terms = parser.parse(); + if ( parser.getGoTermCount() != 27748 ) { + return false; + } + final Map goid_to_term_map = GoUtils.createGoIdToGoTermMap( go_terms ); + final SortedSet b_cell_selection = GoUtils.getAllSuperGoTerms( new GoId( "GO:0002339" ), + goid_to_term_map ); + if ( b_cell_selection.size() != 2 ) { + return false; + } + if ( !b_cell_selection.contains( new BasicGoTerm( new GoId( "GO:0002376" ), + "immune system process", + GoNameSpace.createBiologicalProcess(), + false ) ) ) { + return false; + } + if ( !b_cell_selection.contains( new BasicGoTerm( new GoId( "GO:0008150" ), + "biological process", + GoNameSpace.createBiologicalProcess(), + false ) ) ) { + return false; + } + final SortedSet b_cell_differentation = GoUtils.getAllSuperGoTerms( new GoId( "GO:0030183" ), + goid_to_term_map ); + if ( b_cell_differentation.size() != 12 ) { + return false; + } + final SortedSet biological_process = GoUtils.getAllSuperGoTerms( new GoId( "GO:0008150" ), + goid_to_term_map ); + if ( biological_process.size() != 0 ) { + return false; + } + final SortedSet protein_aa_phosphorylation = GoUtils.getAllSuperGoTerms( new GoId( "GO:0006468" ), + goid_to_term_map ); + if ( protein_aa_phosphorylation.size() != 16 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } +} diff --git a/forester/java/src/org/forester/go/etc/MetaOntologizer.java b/forester/java/src/org/forester/go/etc/MetaOntologizer.java new file mode 100644 index 0000000..970f939 --- /dev/null +++ b/forester/java/src/org/forester/go/etc/MetaOntologizer.java @@ -0,0 +1,639 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org + +package org.forester.go.etc; + +import java.awt.Color; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.io.Writer; +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.forester.go.GoId; +import org.forester.go.GoNameSpace; +import org.forester.go.GoTerm; +import org.forester.go.GoUtils; +import org.forester.go.OBOparser; +import org.forester.go.PfamToGoMapping; +import org.forester.surfacing.BasicSpecies; +import org.forester.surfacing.DomainId; +import org.forester.surfacing.Species; +import org.forester.surfacing.SurfacingConstants; +import org.forester.surfacing.SurfacingUtil; +import org.forester.util.ForesterUtil; + +public class MetaOntologizer { + + private final static NumberFormat FORMATER = new DecimalFormat( "0.00E0" ); + private final static Color MIN_COLOR = new Color( 0, 200, 50 ); + private final static Color MAX_COLOR = new Color( 0, 0, 0 ); + final static private String PRG_NAME = "meta_ontologizer"; + private static final boolean VERBOSE = true; + //table-a_41_dollo_all_gains_d-Topology-Elim-Bonferroni.txt: + private final static Pattern PATTERN_ONTOLOGIZER_TABLE_OUTPUT = Pattern.compile( ".*table-(.+)_dollo_.*", + Pattern.CASE_INSENSITIVE ); //TODO this might need some work... 
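    // --- Illustrative sketch (editor's addition, not part of the original patch) ---
    // Shows what PATTERN_ONTOLOGIZER_TABLE_OUTPUT above is expected to capture from the
    // example file name given in the comment preceding it; the method name is hypothetical
    // and only meant as a usage illustration.
    private static void demonstrateSpeciesCapture() {
        final Matcher m = PATTERN_ONTOLOGIZER_TABLE_OUTPUT
                .matcher( "table-a_41_dollo_all_gains_d-Topology-Elim-Bonferroni.txt" );
        if ( m.matches() ) {
            System.out.println( "captured species label: " + m.group( 1 ) ); // prints "a_41"
        }
    }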
+ + private static boolean hasResultsForSpecies( final Map go_id_to_terms, + final SortedMap> species_to_results_map, + final String species, + final GoNameSpace.GoNamespaceType namespace ) { + for( final OntologizerResult ontologizer_result : species_to_results_map.get( species ) ) { + if ( go_id_to_terms.get( ontologizer_result.getGoId() ).getGoNameSpace().getType() == namespace ) { + return true; + } + } + return false; + } + + private static StringBuilder obtainDomainsForGoId( final List pfam_to_go, + final SortedSet domains_per_species, + final Map all_go_terms, + final GoId query_go_id, + final Set found_domain_ids ) { + final StringBuilder sb = new StringBuilder(); + D: for( final DomainId domain_id : domains_per_species ) { + for( final PfamToGoMapping ptg : pfam_to_go ) { + if ( ptg.getKey().equals( domain_id ) ) { + final GoId go_id = ptg.getValue(); + final Set super_ids = new HashSet(); + for( final GoTerm term : GoUtils.getAllSuperGoTerms( go_id, all_go_terms ) ) { + super_ids.add( term.getGoId() ); + } + super_ids.add( go_id ); + if ( super_ids.contains( query_go_id ) ) { + sb.append( "[" + + domain_id + "] " ); + found_domain_ids.add( domain_id ); + continue D; + } + } + } + } + return sb; + } + + private static String obtainSpecies( final File ontologizer_outfile ) { + final Matcher matcher = PATTERN_ONTOLOGIZER_TABLE_OUTPUT.matcher( ontologizer_outfile.getName() ); + String species = null; + if ( matcher.matches() ) { + species = matcher.group( 1 ); + if ( VERBOSE ) { + ForesterUtil + .programMessage( PRG_NAME, "species for [" + ontologizer_outfile + "] is [" + species + "]" ); + } + } + else { + throw new RuntimeException( "pattern [" + PATTERN_ONTOLOGIZER_TABLE_OUTPUT + "] did not match [" + + ontologizer_outfile.getName() + "]" ); + } + return species; + } + + private static SortedMap> parseDomainGainLossFile( final File input ) + throws IOException { + final String error = ForesterUtil.isReadableFile( input ); + if ( !ForesterUtil.isEmpty( error ) ) { + throw new IOException( error ); + } + final SortedMap> speciesto_to_domain_id = new TreeMap>(); + final BufferedReader br = new BufferedReader( new FileReader( input ) ); + String line; + int line_number = 0; + Species current_species = null; + try { + while ( ( line = br.readLine() ) != null ) { + line_number++; + line = line.trim(); + if ( ( ForesterUtil.isEmpty( line ) ) || ( line.startsWith( "##" ) ) ) { + // Ignore. 
+ } + else if ( line.startsWith( "#" ) ) { + current_species = new BasicSpecies( line.substring( 1 ) ); + speciesto_to_domain_id.put( current_species, new TreeSet() ); + } + else { + if ( current_species == null ) { + throw new IOException( "parsing problem [at line " + line_number + "] in [" + input + "]" ); + } + speciesto_to_domain_id.get( current_species ).add( new DomainId( line ) ); + } + } + } + catch ( final Exception e ) { + throw new IOException( "parsing problem [at line " + line_number + "] in [" + input + "]: " + + e.getMessage() ); + } + return speciesto_to_domain_id; + } + + private static void processOneSpecies( final Map go_id_to_terms, + final Writer b_html_writer, + final Writer b_tab_writer, + final Writer c_html_writer, + final Writer c_tab_writer, + final Writer m_html_writer, + final Writer m_tab_writer, + final SortedMap> species_to_results_map, + final String species, + final double p_adjusted_upper_limit, + final SortedSet domains_per_species, + final List pfam_to_go, + final Set domain_ids_with_go_annot ) throws IOException { + final SortedSet ontologizer_results = species_to_results_map.get( species ); + for( final OntologizerResult ontologizer_result : ontologizer_results ) { + final GoTerm go_term = go_id_to_terms.get( ontologizer_result.getGoId() ); + Writer current_html_writer = b_html_writer; + Writer current_tab_writer = b_tab_writer; + switch ( go_term.getGoNameSpace().getType() ) { + case CELLULAR_COMPONENT: + current_html_writer = c_html_writer; + current_tab_writer = c_tab_writer; + break; + case MOLECULAR_FUNCTION: + current_html_writer = m_html_writer; + current_tab_writer = m_tab_writer; + break; + } + writeValuesToTabWriter( species, ontologizer_result, go_term, current_tab_writer ); + writeValuesToHtmlWriter( ontologizer_result, + go_term, + current_html_writer, + p_adjusted_upper_limit, + species, + go_id_to_terms, + domains_per_species, + pfam_to_go, + domain_ids_with_go_annot ); + } + } + + public static void reformat( final File ontologizer_outdir, + final String result_file_prefix, + final File domain_gain_loss_file, + final String outfile_base, + final File obo_file, + final double p_adjusted_upper_limit, + final String comment, + final List pfam_to_go ) throws IOException { + if ( !ontologizer_outdir.exists() ) { + throw new IllegalArgumentException( "[" + ontologizer_outdir + "] does not exist" ); + } + if ( !ontologizer_outdir.isDirectory() ) { + throw new IllegalArgumentException( "[" + ontologizer_outdir + "] is not a directory" ); + } + if ( !obo_file.exists() ) { + throw new IllegalArgumentException( "[" + obo_file + "] does not exist" ); + } + if ( ( p_adjusted_upper_limit < 0.0 ) || ( p_adjusted_upper_limit > 1.0 ) ) { + throw new IllegalArgumentException( "adjusted P values limit [" + p_adjusted_upper_limit + + "] is out of range" ); + } + SortedMap> speciesto_to_domain_id = null; + if ( domain_gain_loss_file != null ) { + if ( !domain_gain_loss_file.exists() ) { + throw new IllegalArgumentException( "[" + domain_gain_loss_file + "] does not exist" ); + } + speciesto_to_domain_id = parseDomainGainLossFile( domain_gain_loss_file ); + if ( VERBOSE ) { + ForesterUtil.programMessage( PRG_NAME, "parsed gain/loss domains for " + speciesto_to_domain_id.size() + + " species from [" + domain_gain_loss_file + "]" ); + } + } + final String[] children = ontologizer_outdir.list(); + final List ontologizer_outfiles = new ArrayList(); + if ( children == null ) { + throw new IllegalArgumentException( "problem with [" + ontologizer_outdir + "]" ); 
+ } + else { + for( final String filename : children ) { + if ( filename.startsWith( result_file_prefix ) ) { + ontologizer_outfiles.add( new File( filename ) ); + } + } + } + if ( VERBOSE ) { + ForesterUtil.programMessage( PRG_NAME, "need to analyze " + ontologizer_outfiles.size() + + " Ontologizer outfiles from [" + ontologizer_outdir + "]" ); + } + final OBOparser parser = new OBOparser( obo_file, OBOparser.ReturnType.BASIC_GO_TERM ); + final List go_terms = parser.parse(); + if ( VERBOSE ) { + ForesterUtil.programMessage( PRG_NAME, "parsed " + go_terms.size() + " GO terms from [" + obo_file + "]" ); + } + final Map go_id_to_terms = GoUtils.createGoIdToGoTermMap( go_terms ); + //FIXME not needed? when doe sthis error arise? + // if ( go_id_to_terms.size() != go_terms.size() ) { + // throw new IllegalArgumentException( "GO terms with non-unique ids found" ); + // } + final String b_file_html = outfile_base + "_B.html"; + final String b_file_txt = outfile_base + "_B.txt"; + final String m_file_html = outfile_base + "_C.html"; + final String m_file_txt = outfile_base + "_C.txt"; + final String c_file_html = outfile_base + "_M.html"; + final String c_file_txt = outfile_base + "_M.txt"; + final Writer b_html_writer = ForesterUtil.createBufferedWriter( b_file_html ); + final Writer b_tab_writer = ForesterUtil.createBufferedWriter( b_file_txt ); + final Writer c_html_writer = ForesterUtil.createBufferedWriter( m_file_html ); + final Writer c_tab_writer = ForesterUtil.createBufferedWriter( m_file_txt ); + final Writer m_html_writer = ForesterUtil.createBufferedWriter( c_file_html ); + final Writer m_tab_writer = ForesterUtil.createBufferedWriter( c_file_txt ); + final SortedMap> species_to_results_map = new TreeMap>(); + for( final File ontologizer_outfile : ontologizer_outfiles ) { + final String species = obtainSpecies( ontologizer_outfile ); + final List ontologizer_results = OntologizerResult.parse( new File( ontologizer_outdir + + ForesterUtil.FILE_SEPARATOR + ontologizer_outfile ) ); + final SortedSet filtered_ontologizer_results = new TreeSet(); + for( final OntologizerResult ontologizer_result : ontologizer_results ) { + if ( ontologizer_result.getPAdjusted() <= p_adjusted_upper_limit ) { + filtered_ontologizer_results.add( ontologizer_result ); + } + } + species_to_results_map.put( species, filtered_ontologizer_results ); + } + writeLabelsToTabWriter( b_tab_writer ); + writeLabelsToTabWriter( c_tab_writer ); + writeLabelsToTabWriter( m_tab_writer ); + String domain_gain_loss_file_full_path_str = null; + if ( domain_gain_loss_file != null ) { + domain_gain_loss_file_full_path_str = domain_gain_loss_file.getAbsolutePath(); + } + writeHtmlHeader( b_html_writer, + GoNameSpace.GoNamespaceType.BIOLOGICAL_PROCESS.toString() + " | Pmax = " + + p_adjusted_upper_limit + " | " + comment, + ontologizer_outdir.getAbsolutePath(), + domain_gain_loss_file_full_path_str ); + writeHtmlHeader( c_html_writer, + GoNameSpace.GoNamespaceType.CELLULAR_COMPONENT.toString() + " | Pmax = " + + p_adjusted_upper_limit + " | " + comment, + ontologizer_outdir.getAbsolutePath(), + domain_gain_loss_file_full_path_str ); + writeHtmlHeader( m_html_writer, + GoNameSpace.GoNamespaceType.MOLECULAR_FUNCTION.toString() + " | Pmax = " + + p_adjusted_upper_limit + " | " + comment, + ontologizer_outdir.getAbsolutePath(), + domain_gain_loss_file_full_path_str ); + for( final String species : species_to_results_map.keySet() ) { + if ( hasResultsForSpecies( go_id_to_terms, + species_to_results_map, + species, + 
GoNameSpace.GoNamespaceType.BIOLOGICAL_PROCESS ) ) { + writeHtmlSpecies( b_html_writer, species ); + } + if ( hasResultsForSpecies( go_id_to_terms, + species_to_results_map, + species, + GoNameSpace.GoNamespaceType.CELLULAR_COMPONENT ) ) { + writeHtmlSpecies( c_html_writer, species ); + } + if ( hasResultsForSpecies( go_id_to_terms, + species_to_results_map, + species, + GoNameSpace.GoNamespaceType.MOLECULAR_FUNCTION ) ) { + writeHtmlSpecies( m_html_writer, species ); + } + SortedSet domains_per_species = null; + if ( ( speciesto_to_domain_id != null ) && ( speciesto_to_domain_id.size() > 0 ) ) { + domains_per_species = speciesto_to_domain_id.get( new BasicSpecies( species ) ); + } + final Set domain_ids_with_go_annot = new HashSet(); + processOneSpecies( go_id_to_terms, + b_html_writer, + b_tab_writer, + c_html_writer, + c_tab_writer, + m_html_writer, + m_tab_writer, + species_to_results_map, + species, + p_adjusted_upper_limit, + domains_per_species, + pfam_to_go, + domain_ids_with_go_annot ); + if ( ( speciesto_to_domain_id != null ) && ( speciesto_to_domain_id.size() > 0 ) ) { + if ( hasResultsForSpecies( go_id_to_terms, + species_to_results_map, + species, + GoNameSpace.GoNamespaceType.BIOLOGICAL_PROCESS ) ) { + writeHtmlDomains( b_html_writer, domains_per_species, domain_ids_with_go_annot ); + } + if ( hasResultsForSpecies( go_id_to_terms, + species_to_results_map, + species, + GoNameSpace.GoNamespaceType.CELLULAR_COMPONENT ) ) { + writeHtmlDomains( c_html_writer, domains_per_species, domain_ids_with_go_annot ); + } + if ( hasResultsForSpecies( go_id_to_terms, + species_to_results_map, + species, + GoNameSpace.GoNamespaceType.MOLECULAR_FUNCTION ) ) { + writeHtmlDomains( m_html_writer, domains_per_species, domain_ids_with_go_annot ); + } + } + } + writeHtmlEnd( b_html_writer ); + writeHtmlEnd( c_html_writer ); + writeHtmlEnd( m_html_writer ); + b_html_writer.close(); + b_tab_writer.close(); + c_html_writer.close(); + c_tab_writer.close(); + m_html_writer.close(); + m_tab_writer.close(); + if ( VERBOSE ) { + ForesterUtil.programMessage( PRG_NAME, "successfully wrote biological process summary to [" + b_file_html + + "]" ); + ForesterUtil.programMessage( PRG_NAME, "successfully wrote biological process summary to [" + b_file_txt + + "]" ); + ForesterUtil.programMessage( PRG_NAME, "successfully wrote molecular function summary to [" + m_file_html + + "]" ); + ForesterUtil.programMessage( PRG_NAME, "successfully wrote molecular function summary to [" + m_file_txt + + "]" ); + ForesterUtil.programMessage( PRG_NAME, "successfully wrote cellular component summary to [" + c_file_html + + "]" ); + ForesterUtil.programMessage( PRG_NAME, "successfully wrote cellular component summary to [" + c_file_txt + + "]" ); + } + } + + private static void writeHtmlDomains( final Writer writer, + final SortedSet domains, + final Set domain_ids_with_go_annot ) throws IOException { + writer.write( "" ); + writer.write( "" ); + if ( domains != null ) { + for( final DomainId domain : domains ) { + if ( !domain_ids_with_go_annot.contains( domain ) ) { + writer.write( "[" + domain + "] " ); + } + } + } + writer.write( "" ); + writer.write( "" ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + } + + private static void writeHtmlEnd( final Writer writer ) throws IOException { + writer.write( "" ); + writer.write( "" ); + writer.write( "" ); + } + + private static void writeHtmlHeader( final Writer w, + final String desc, + final String ontologizer_outdir, + final String domain_gain_loss_file ) throws IOException { 
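+ // Writes the page header: the title and description, followed by a small table listing the analysed Ontologizer output directory and, if provided, the domain gain/loss file.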
+ w.write( "" ); + w.write( "" ); + w.write( desc ); + w.write( "" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( "" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( "" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( "" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( "

" ); + w.write( "meta ontologizer" ); + w.write( "

" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( "

" ); + w.write( desc ); + w.write( "

" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( "" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( "" ); + if ( !ForesterUtil.isEmpty( domain_gain_loss_file ) ) { + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( "" ); + } + w.write( "
" ); + w.write( "ontolgizer output directory analysed:" ); + w.write( "" ); + w.write( ontologizer_outdir ); + w.write( "
" ); + w.write( "domain gain or loss file:" ); + w.write( "" ); + w.write( domain_gain_loss_file ); + w.write( "
" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( "" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( "" ); + w.write( "" ); + w.write( "" ); + w.write( ForesterUtil.LINE_SEPARATOR ); + } + + private static void writeHtmlSpecies( final Writer writer, final String species ) throws IOException { + writer.write( "" ); + writer.write( "" ); + writer.write( "" ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + } + + private static void writeLabelsToTabWriter( final Writer writer ) throws IOException { + writer.write( "#species" ); + writer.write( "\t" ); + writer.write( "GO name" ); + writer.write( "\t" ); + writer.write( "GO id" ); + writer.write( "\t" ); + writer.write( "P adjusted" ); + writer.write( "\t" ); + writer.write( "P" ); + writer.write( "\t" ); + writer.write( "Pop total" ); + writer.write( "\t" ); + writer.write( "Pop term" ); + writer.write( "\t" ); + writer.write( "Study total" ); + writer.write( "\t" ); + writer.write( "Study term" ); + writer.write( "\t" ); + writer.write( "is trivial" ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + } + + private static void writeValuesToHtmlWriter( final OntologizerResult ontologizer_result, + final GoTerm go_term, + final Writer writer, + final double p_adjusted_upper_limit, + final String species, + final Map go_id_to_terms, + final SortedSet domains_per_species, + final List pfam_to_go, + final Set domain_ids_with_go_annot ) throws IOException { + final Color p_adj_color = ForesterUtil.calcColor( ontologizer_result.getPAdjusted(), + 0, + p_adjusted_upper_limit, + MIN_COLOR, + MAX_COLOR ); + final Color p_color = ForesterUtil.calcColor( ontologizer_result.getP(), + 0, + p_adjusted_upper_limit, + MIN_COLOR, + MAX_COLOR ); + writer.write( "" ); + writer.write( "" ); + writer.write( "" ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + } + + private static void writeValuesToTabWriter( final String species, + final OntologizerResult ontologizer_result, + final GoTerm got_term, + final Writer writer ) throws IOException { + writer.write( species ); + writer.write( "\t" ); + writer.write( got_term.getName() ); + writer.write( "\t" ); + writer.write( ontologizer_result.getGoId().getId() ); + writer.write( "\t" ); + writer.write( String.valueOf( ontologizer_result.getPAdjusted() ) ); + writer.write( "\t" ); + writer.write( String.valueOf( ontologizer_result.getP() ) ); + writer.write( "\t" ); + writer.write( String.valueOf( ontologizer_result.getPopTotal() ) ); + writer.write( "\t" ); + writer.write( String.valueOf( ontologizer_result.getPopTerm() ) ); + writer.write( "\t" ); + writer.write( String.valueOf( ontologizer_result.getStudyTotal() ) ); + writer.write( "\t" ); + writer.write( String.valueOf( ontologizer_result.getStudyTerm() ) ); + writer.write( "\t" ); + writer.write( String.valueOf( ontologizer_result.isTrivial() ) ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + } +} diff --git a/forester/java/src/org/forester/go/etc/OntologizerResult.java b/forester/java/src/org/forester/go/etc/OntologizerResult.java new file mode 100644 index 0000000..d7e2be5 --- /dev/null +++ b/forester/java/src/org/forester/go/etc/OntologizerResult.java @@ -0,0 +1,205 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. 
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org + +package org.forester.go.etc; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.forester.go.GoId; +import org.forester.util.ForesterUtil; + +/* + * + * Note: this class has a natural ordering that is inconsistent with equals. + */ +public class OntologizerResult implements Comparable { + + final private GoId _goid; + final private int _pop_total; + final private int _pop_term; + final private int _study_total; + final private int _study_term; + final private int _pop_family; + final private int _study_family; + final private int _nparents; + final private boolean _is_trivial; + final private double _p; + final private double _p_adjusted; + final private double _p_min; + final private TYPE _type; + + private OntologizerResult( final String s ) { + if ( ForesterUtil.isEmpty( s ) ) { + throw new IllegalArgumentException( "result string is null or empty" ); + } + final String[] tokens = s.split( "\t" ); + if ( ( tokens.length != 9 ) && ( tokens.length != 11 ) && ( tokens.length != 12 ) ) { + throw new IllegalArgumentException( "result string [" + s + "] has unexpected format" ); + } + _goid = new GoId( tokens[ 0 ] ); + _pop_total = Integer.parseInt( tokens[ 1 ] ); + _pop_term = Integer.parseInt( tokens[ 2 ] ); + _study_total = Integer.parseInt( tokens[ 3 ] ); + _study_term = Integer.parseInt( tokens[ 4 ] ); + if ( tokens.length == 11 ) { + // Topology Elim + // ID Pop.total Pop.term Study.total Study.term Pop.family Study.family is.trivial p p.adjusted p.min + _type = TYPE.TOPOLOGY; + _pop_family = Integer.parseInt( tokens[ 5 ] ); + _study_family = Integer.parseInt( tokens[ 6 ] ); + _is_trivial = Boolean.parseBoolean( tokens[ 7 ] ); + _p = Double.parseDouble( tokens[ 8 ] ); + _p_adjusted = Double.parseDouble( tokens[ 9 ] ); + _p_min = Double.parseDouble( tokens[ 10 ] ); + _nparents = -1; + } + else if ( tokens.length == 9 ) { + // Term for Term + // ID Pop.total Pop.term Study.total Study.term p p.adjusted p.min name + _type = TYPE.TERM_FOR_TERM; + _pop_family = -1; + _study_family = -1; + _nparents = -1; + _is_trivial = false; + _p = Double.parseDouble( tokens[ 5 ] ); + _p_adjusted = Double.parseDouble( tokens[ 6 ] ); + _p_min = Double.parseDouble( tokens[ 7 ] ); + } + else { + // Parent Child Union + // ID Pop.total Pop.term Study.total Study.term Pop.family Study.family nparents is.trivial p p.adjusted p.min + _type = TYPE.PARENT_CHILD; + _pop_family = Integer.parseInt( tokens[ 5 ] ); + _study_family = Integer.parseInt( tokens[ 6 ] ); + _nparents = 
Integer.parseInt( tokens[ 7 ] ); + _is_trivial = Boolean.parseBoolean( tokens[ 8 ] ); + _p = Double.parseDouble( tokens[ 9 ] ); + _p_adjusted = Double.parseDouble( tokens[ 10 ] ); + _p_min = Double.parseDouble( tokens[ 11 ] ); + } + } + + @Override + public int compareTo( final OntologizerResult o ) { + if ( this == o ) { + return 0; + } + else if ( getPAdjusted() < o.getPAdjusted() ) { + return -1; + } + else if ( getPAdjusted() > o.getPAdjusted() ) { + return 1; + } + else { + return 0; + } + } + + public GoId getGoId() { + return _goid; + } + + public int getNParents() { + return _nparents; + } + + public double getP() { + return _p; + } + + public double getPAdjusted() { + return _p_adjusted; + } + + public double getPMin() { + return _p_min; + } + + public int getPopFamily() { + return _pop_family; + } + + public int getPopTerm() { + return _pop_term; + } + + public int getPopTotal() { + return _pop_total; + } + + public int getStudyFamily() { + return _study_family; + } + + public int getStudyTerm() { + return _study_term; + } + + public int getStudyTotal() { + return _study_total; + } + + public TYPE getType() { + return _type; + } + + public boolean isTrivial() { + return _is_trivial; + } + + public static List parse( final File input ) throws IOException { + final String error = ForesterUtil.isReadableFile( input ); + if ( !ForesterUtil.isEmpty( error ) ) { + throw new IOException( error ); + } + final BufferedReader br = new BufferedReader( new FileReader( input ) ); + String line; + final List results = new ArrayList(); + int line_number = 0; + try { + while ( ( line = br.readLine() ) != null ) { + line_number++; + line = line.trim(); + if ( line.startsWith( "GO:" ) ) { + results.add( new OntologizerResult( line ) ); + } + } + } + catch ( final Exception e ) { + throw new IOException( "parsing problem [at line " + line_number + "] in [" + input + "]: " + + e.getMessage() ); + } + return results; + } + + public static enum TYPE { + TOPOLOGY, TERM_FOR_TERM, PARENT_CHILD; + } +} diff --git a/forester/java/src/org/forester/io/parsers/FastaParser.java b/forester/java/src/org/forester/io/parsers/FastaParser.java new file mode 100644 index 0000000..4c6845c --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/FastaParser.java @@ -0,0 +1,210 @@ +// $Id: +// +// forester -- software libraries and applications +// for genomics and evolutionary biology research. +// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers; + +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.forester.msa.BasicMsa; +import org.forester.msa.Msa; +import org.forester.msa.MsaFormatException; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Accession; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.sequence.BasicSequence; +import org.forester.sequence.Sequence; +import org.forester.util.ForesterUtil; + +public class FastaParser { + + private static final Pattern NAME_REGEX = Pattern.compile( "^\\s*>\\s*(.+)" ); + private static final Pattern SEQ_REGEX = Pattern.compile( "^\\s*(.+)" ); + private static final Pattern ANYTHING_REGEX = Pattern.compile( "[\\d\\s]+" ); + //>gi|71834668|ref|NP_001025424.1| Bcl2 [Danio rerio] + private static final Pattern FASTA_DESC_LINE = Pattern + .compile( ">?\\s*([^|]+)\\|([^|]+)\\S*\\s+(.+)\\s+\\[(.+)\\]" ); + + public static void main( final String[] args ) { + final String a = ">gi|71834668|ref|NP_001025424.1| Bcl2 [Danio rerio]"; + final Matcher name_m = FASTA_DESC_LINE.matcher( a ); + if ( name_m.lookingAt() ) { + System.out.println(); + System.out.println( name_m.group( 1 ) ); + System.out.println( name_m.group( 2 ) ); + System.out.println( name_m.group( 3 ) ); + System.out.println( name_m.group( 4 ) ); + } + else { + System.out.println( "Does not match." ); + } + } + + static public boolean isLikelyFasta( final InputStream is ) throws IOException { + final BufferedReader reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) ); + String line = null; + while ( ( line = reader.readLine() ) != null ) { + final boolean is_name_line = NAME_REGEX.matcher( line ).lookingAt(); + if ( canIgnore( line, true, false ) ) { + continue; + } + else if ( is_name_line ) { + reader.close(); + return true; + } + else if ( SEQ_REGEX.matcher( line ).lookingAt() ) { + reader.close(); + return false; + } + } + reader.close(); + return false; + } + + static public Msa parseMsa( final InputStream is ) throws IOException { + return BasicMsa.createInstance( parse( is ) ); + } + + static public Msa parseMsa( final String s ) throws IOException { + return parseMsa( s.getBytes() ); + } + + static public Msa parseMsa( final byte[] bytes ) throws IOException { + return parseMsa( new ByteArrayInputStream( bytes ) ); + } + + static public List parse( final InputStream is ) throws IOException { + final BufferedReader reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) ); + String line = null; + int line_counter = 0; + boolean saw_first_seq = false; + StringBuilder current_seq = null; + StringBuilder name = null; + final List temp_msa = new ArrayList(); + while ( ( line = reader.readLine() ) != null ) { + ++line_counter; + final Matcher name_m = NAME_REGEX.matcher( line ); + final boolean is_name_line = name_m.lookingAt(); + if ( canIgnore( line, saw_first_seq, is_name_line ) ) { + continue; + } + final Matcher seq_m = SEQ_REGEX.matcher( line ); + if ( is_name_line ) { + saw_first_seq = true; + addSeq( name, current_seq, temp_msa ); + name = new StringBuilder( name_m.group( 1 ).trim() ); + current_seq = new StringBuilder(); + } + else if ( seq_m.lookingAt() ) { + if ( 
name.length() < 1 ) { + reader.close(); + throw new MsaFormatException( "illegally formatted fasta msa (line: " + line_counter + "):\n\"" + + trim( line ) + "\"" ); + } + current_seq.append( seq_m.group( 1 ).replaceAll( "\\s+", "" ) ); + } + else { + reader.close(); + throw new MsaFormatException( "illegally formatted fasta msa (line: " + line_counter + "):\n\"" + + trim( line ) + "\"" ); + } + } + addSeq( name, current_seq, temp_msa ); + reader.close(); + final List seqs = new ArrayList(); + for( int i = 0; i < temp_msa.size(); ++i ) { + seqs.add( BasicSequence.createAaSequence( temp_msa.get( i )[ 0 ].toString(), temp_msa.get( i )[ 1 ] + .toString() ) ); + } + return seqs; + } + + static private boolean canIgnore( final String line, final boolean saw_first_seq, final boolean is_name_line ) { + if ( ( line.length() < 1 ) || ANYTHING_REGEX.matcher( line ).matches() ) { + return true; + } + if ( !saw_first_seq && !is_name_line ) { + return true; + } + return false; + } + + private static void addSeq( final StringBuilder name, final StringBuilder seq, final List temp_msa ) { + if ( ( name != null ) && ( seq != null ) && ( name.length() > 0 ) && ( seq.length() > 0 ) ) { + final StringBuilder[] ary = new StringBuilder[ 2 ]; + ary[ 0 ] = name; + ary[ 1 ] = seq; + temp_msa.add( ary ); + } + } + + private static String trim( final String line ) { + if ( line.length() > 100 ) { + return line.substring( 0, 100 ) + " ..."; + } + return line; + } + + public static void extractFastaInformation( final Phylogeny phy ) { + for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + if ( !ForesterUtil.isEmpty( node.getName() ) ) { + final Matcher name_m = FASTA_DESC_LINE.matcher( node.getName() ); + if ( name_m.lookingAt() ) { + System.out.println(); + // System.out.println( name_m.group( 1 ) ); + // System.out.println( name_m.group( 2 ) ); + // System.out.println( name_m.group( 3 ) ); + // System.out.println( name_m.group( 4 ) ); + final String acc_source = name_m.group( 1 ); + final String acc = name_m.group( 2 ); + final String seq_name = name_m.group( 3 ); + final String tax_sn = name_m.group( 4 ); + if ( !ForesterUtil.isEmpty( acc_source ) && !ForesterUtil.isEmpty( acc ) ) { + ForesterUtil.ensurePresenceOfSequence( node ); + node.getNodeData().getSequence( 0 ).setAccession( new Accession( acc, acc_source ) ); + } + if ( !ForesterUtil.isEmpty( seq_name ) ) { + ForesterUtil.ensurePresenceOfSequence( node ); + node.getNodeData().getSequence( 0 ).setName( seq_name ); + } + if ( !ForesterUtil.isEmpty( tax_sn ) ) { + ForesterUtil.ensurePresenceOfTaxonomy( node ); + node.getNodeData().getTaxonomy( 0 ).setScientificName( tax_sn ); + } + } + } + } + } +} diff --git a/forester/java/src/org/forester/io/parsers/GeneralMsaParser.java b/forester/java/src/org/forester/io/parsers/GeneralMsaParser.java new file mode 100644 index 0000000..4b7e920 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/GeneralMsaParser.java @@ -0,0 +1,186 @@ +// $Id: +// +// forester -- software libraries and applications +// for genomics and evolutionary biology research. 
+// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.forester.msa.BasicMsa; +import org.forester.msa.Msa; +import org.forester.msa.MsaFormatException; +import org.forester.sequence.BasicSequence; +import org.forester.sequence.Sequence; + +public final class GeneralMsaParser { + + private static final Pattern NAME_SEQ_PATTERN = Pattern.compile( "(\\S+)\\s+(\\S+)\\s*" ); + private static final Pattern INDENTED_SEQ_PATTERN = Pattern.compile( "\\s+(\\S+)\\s*" ); + private static final Pattern NON_INDENTED_SEQ_PATTERN = Pattern.compile( "(\\S+).*" ); + private static final Pattern PROBCONS_REGEX = Pattern.compile( "^CLUSTAL\\s" ); + private static final Pattern MUSCLE_REGEX = Pattern.compile( "^MUSCLE\\s\\(" ); + private static final Pattern CLUSTAL_REGEX = Pattern.compile( "^PROBCONS\\s" ); + private static final Pattern ANYTHING_REGEX = Pattern.compile( "[\\d\\s]+" ); + private static final Pattern SELEX_SPECIAL_LINES_REGEX = Pattern.compile( "\\s+[*\\.:\\s]+" ); + private static final Pattern SPECIAL_LINES_REGEX = Pattern.compile( "^\\s*(#|%|//|!!)" ); + private static final Pattern ERROR_REGEX = Pattern.compile( "\\S+\\s+\\S+\\s+\\S+" ); + + static private boolean canIgnore( final String line ) { + if ( ( line.length() < 1 ) || ANYTHING_REGEX.matcher( line ).matches() ) { + return true; + } + return ( SELEX_SPECIAL_LINES_REGEX.matcher( line ).matches() || SPECIAL_LINES_REGEX.matcher( line ).lookingAt() ); + } + + static private boolean isProgramNameLine( final String line ) { + return ( PROBCONS_REGEX.matcher( line ).lookingAt() || CLUSTAL_REGEX.matcher( line ).lookingAt() || MUSCLE_REGEX + .matcher( line ).lookingAt() ); + } + + static public Msa parse( final InputStream is ) throws IOException { + int block = -1; + int current_seq_index_per_block = -1; + String current_name = null; + boolean saw_ignorable = true; + boolean is_first = true; + final Map temp_msa = new HashMap(); + final List names_in_order = new ArrayList(); + final BufferedReader reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) ); + String line = null; + int line_counter = 0; + while ( ( line = reader.readLine() ) != null ) { + ++line_counter; + if ( canIgnore( line ) ) { + saw_ignorable = true; + } + else if ( !( is_first && isProgramNameLine( line ) ) ) { + if ( 
ERROR_REGEX.matcher( line ).lookingAt() ) { + throw new MsaFormatException( "unrecognized msa format (line: " + line_counter + "):\n\"" + + trim( line ) + "\"" ); + } + if ( canIgnore( line ) ) { + saw_ignorable = true; + } + final Matcher name_seq_m = NAME_SEQ_PATTERN.matcher( line ); + Matcher ind_seq_m = null; + Matcher non_ind_seq_m = null; + boolean ind_seq_m_matches = false; + boolean non_ind_seq_m_matches = false; + final boolean name_seq_m_matches = name_seq_m.matches(); + if ( !name_seq_m_matches ) { + ind_seq_m = INDENTED_SEQ_PATTERN.matcher( line ); + ind_seq_m_matches = ind_seq_m.matches(); + if ( !ind_seq_m_matches ) { + non_ind_seq_m = NON_INDENTED_SEQ_PATTERN.matcher( line ); + non_ind_seq_m_matches = non_ind_seq_m.lookingAt(); + } + } + if ( name_seq_m_matches || ind_seq_m_matches || non_ind_seq_m_matches ) { + if ( saw_ignorable ) { + ++block; + current_seq_index_per_block = -1; + saw_ignorable = false; + } + ++current_seq_index_per_block; + if ( name_seq_m_matches ) { + final String name = name_seq_m.group( 1 ); + final String seq = name_seq_m.group( 2 ); + if ( temp_msa.containsKey( name ) ) { + temp_msa.get( name ).append( seq ); + } + else { + temp_msa.put( name, new StringBuilder( seq ) ); + names_in_order.add( name ); + } + current_name = name; + } + else if ( ind_seq_m_matches ) { + if ( temp_msa.containsKey( current_name ) ) { + temp_msa.get( current_name ).append( ind_seq_m.group( 1 ) ); + } + else { + throw new MsaFormatException( "illegal msa format (line: " + line_counter + "):\n\"" + + trim( line ) + "\"" ); + } + } + else if ( non_ind_seq_m_matches ) { + if ( block == 0 ) { + throw new MsaFormatException( "illegal msa format: first block cannot contain un-named sequence (line: " + + line_counter + "):\n\"" + trim( line ) + "\"" ); + } + else { + String name = ""; + try { + name = names_in_order.get( current_seq_index_per_block ); + } + catch ( final IndexOutOfBoundsException e ) { + throw new MsaFormatException( "illegalmsa format (line: " + line_counter + "):\n\"" + + trim( line ) + "\"" ); + } + if ( temp_msa.containsKey( name ) ) { + temp_msa.get( name ).append( non_ind_seq_m.group( 1 ) ); + } + else { + throw new MsaFormatException( "illegal msa format (line: " + line_counter + "):\n\"" + + trim( line ) + "\"" ); + } + } + current_name = null; + } + } + else { + throw new MsaFormatException( "illegal msa format (line: " + line_counter + "):\n\"" + trim( line ) + + "\"" ); + } + if ( is_first ) { + is_first = false; + } + } + } // while ( ( line = reader.readLine() ) != null ) + final List seqs = new ArrayList(); + for( int i = 0; i < names_in_order.size(); ++i ) { + seqs.add( BasicSequence.createAaSequence( names_in_order.get( i ), temp_msa.get( names_in_order.get( i ) ) + .toString() ) ); + } + final Msa msa = BasicMsa.createInstance( seqs ); + return msa; + } + + private static String trim( final String line ) { + if ( line.length() > 100 ) { + return line.substring( 0, 100 ) + " ..."; + } + return line; + } +} diff --git a/forester/java/src/org/forester/io/parsers/HmmPfamOutputParser.java b/forester/java/src/org/forester/io/parsers/HmmPfamOutputParser.java new file mode 100644 index 0000000..42f94ee --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/HmmPfamOutputParser.java @@ -0,0 +1,689 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. 
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedSet; +import java.util.TreeMap; +import java.util.TreeSet; + +import org.forester.surfacing.BasicDomain; +import org.forester.surfacing.BasicProtein; +import org.forester.surfacing.Domain; +import org.forester.surfacing.DomainId; +import org.forester.surfacing.Protein; +import org.forester.surfacing.SurfacingUtil; +import org.forester.util.ForesterUtil; + +public final class HmmPfamOutputParser { + + private static final String RETRO = "RETRO"; + private static final String PHAGE = "PHAGE"; + private static final String VIR = "VIR"; + private static final String TRANSPOS = "TRANSPOS"; + private static final String RV = "RV"; + private static final String GAG = "GAG_"; + private static final String HCV = "HCV_"; // New. Added on Jun 11, after 1st submission. + private static final String HERPES = "Herpes_"; // New. Added on Jun 11, after 1st submission. 
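+ // These identifier fragments (VIR, PHAGE, RETRO, TRANSPOS, RV, GAG_, HCV_, Herpes_) are used in parse() to skip virus-, phage-, retro-, or transposon-like domain hits when isIgnoreVirusLikeIds() is enabled.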
+ private static final int E_VALUE_MAXIMUM_DEFAULT = -1; + private static final ReturnType RETURN_TYPE_DEFAULT = ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN; + private static final boolean IGNORE_DUFS_DEFAULT = false; + private static final int MAX_ALLOWED_OVERLAP_DEFAULT = -1; + private final Set _filter; + private final FilterType _filter_type; + private final File _input_file; + private final String _species; + private final String _model_type; + private double _e_value_maximum; + private Map _individual_domain_score_cutoffs; + private boolean _ignore_dufs; + private boolean _ignore_virus_like_ids; + private boolean _allow_non_unique_query; + private boolean _verbose; + private int _max_allowed_overlap; + private boolean _ignore_engulfed_domains; + private ReturnType _return_type; + private int _proteins_encountered; + private int _proteins_ignored_due_to_filter; + private int _proteins_stored; + private int _domains_encountered; + private int _domains_ignored_due_to_duf; + private int _domains_ignored_due_to_overlap; + private int _domains_ignored_due_to_e_value; + private int _domains_ignored_due_to_individual_score_cutoff; + private int _domains_stored; + private SortedSet _domains_stored_set; + private long _time; + private int _domains_ignored_due_to_negative_domain_filter; + private Map _domains_ignored_due_to_negative_domain_filter_counts_map; + private int _domains_ignored_due_to_virus_like_id; + private Map _domains_ignored_due_to_virus_like_id_counts_map; + + public HmmPfamOutputParser( final File input_file, final String species, final String model_type ) { + _input_file = input_file; + _species = species; + _model_type = model_type; + _filter = null; + _filter_type = FilterType.NONE; + init(); + } + + public HmmPfamOutputParser( final File input_file, + final String species, + final String model_type, + final Set filter, + final FilterType filter_type ) { + _input_file = input_file; + _species = species; + _model_type = model_type; + _filter = filter; + _filter_type = filter_type; + init(); + } + + private void actuallyAddProtein( final List proteins, final Protein current_protein ) { + final List l = current_protein.getProteinDomains(); + for( final Domain d : l ) { + getDomainsStoredSet().add( d.getDomainId() ); + } + proteins.add( current_protein ); + ++_proteins_stored; + } + + private void addProtein( final List proteins, final Protein current_protein ) { + if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN ) || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) { + final Set domain_ids_in_protein = new HashSet(); + for( final Domain d : current_protein.getProteinDomains() ) { + domain_ids_in_protein.add( d.getDomainId() ); + } + domain_ids_in_protein.retainAll( getFilter() ); + if ( getFilterType() == FilterType.POSITIVE_PROTEIN ) { + if ( domain_ids_in_protein.size() > 0 ) { + actuallyAddProtein( proteins, current_protein ); + } + else { + ++_proteins_ignored_due_to_filter; + } + } + else { + if ( domain_ids_in_protein.size() < 1 ) { + actuallyAddProtein( proteins, current_protein ); + } + else { + ++_proteins_ignored_due_to_filter; + } + } + } + else { + actuallyAddProtein( proteins, current_protein ); + } + } + + public int getDomainsEncountered() { + return _domains_encountered; + } + + public int getDomainsIgnoredDueToDuf() { + return _domains_ignored_due_to_duf; + } + + public int getDomainsIgnoredDueToEval() { + return _domains_ignored_due_to_e_value; + } + + public int getDomainsIgnoredDueToIndividualScoreCutoff() { + return 
_domains_ignored_due_to_individual_score_cutoff; + } + + public int getDomainsIgnoredDueToNegativeDomainFilter() { + return _domains_ignored_due_to_negative_domain_filter; + } + + public Map getDomainsIgnoredDueToNegativeDomainFilterCountsMap() { + return _domains_ignored_due_to_negative_domain_filter_counts_map; + } + + public int getDomainsIgnoredDueToOverlap() { + return _domains_ignored_due_to_overlap; + } + + public Map getDomainsIgnoredDueToVirusLikeIdCountsMap() { + return _domains_ignored_due_to_virus_like_id_counts_map; + } + + public int getDomainsIgnoredDueToVirusLikeIds() { + return _domains_ignored_due_to_virus_like_id; + } + + public int getDomainsStored() { + return _domains_stored; + } + + public SortedSet getDomainsStoredSet() { + return _domains_stored_set; + } + + private double getEValueMaximum() { + return _e_value_maximum; + } + + private Set getFilter() { + return _filter; + } + + private FilterType getFilterType() { + return _filter_type; + } + + private Map getIndividualDomainScoreCutoffs() { + return _individual_domain_score_cutoffs; + } + + private File getInputFile() { + return _input_file; + } + + private int getMaxAllowedOverlap() { + return _max_allowed_overlap; + } + + private String getModelType() { + return _model_type; + } + + public int getProteinsEncountered() { + return _proteins_encountered; + } + + public int getProteinsIgnoredDueToFilter() { + return _proteins_ignored_due_to_filter; + } + + public int getProteinsStored() { + return _proteins_stored; + } + + private ReturnType getReturnType() { + return _return_type; + } + + private String getSpecies() { + return _species; + } + + public long getTime() { + return _time; + } + + private void init() { + _e_value_maximum = HmmPfamOutputParser.E_VALUE_MAXIMUM_DEFAULT; + setIgnoreDufs( HmmPfamOutputParser.IGNORE_DUFS_DEFAULT ); + setReturnType( HmmPfamOutputParser.RETURN_TYPE_DEFAULT ); + _max_allowed_overlap = HmmPfamOutputParser.MAX_ALLOWED_OVERLAP_DEFAULT; + setIndividualDomainScoreCutoffs( null ); + setIgnoreEngulfedDomains( false ); + setIgnoreVirusLikeIds( false ); + setAllowNonUniqueQuery( false ); + setVerbose( false ); + intitCounts(); + } + + private void intitCounts() { + setDomainsStoredSet( new TreeSet() ); + setDomainsEncountered( 0 ); + setProteinsEncountered( 0 ); + setProteinsIgnoredDueToFilter( 0 ); + setDomainsIgnoredDueToNegativeFilter( 0 ); + setDomainsIgnoredDueToDuf( 0 ); + setDomainsIgnoredDueToEval( 0 ); + setDomainsIgnoredDueToIndividualScoreCutoff( 0 ); + setDomainsIgnoredDueToVirusLikeId( 0 ); + setDomainsIgnoredDueToOverlap( 0 ); + setDomainsStored( 0 ); + setProteinsStored( 0 ); + setTime( 0 ); + setDomainsIgnoredDueToVirusLikeIdCountsMap( new TreeMap() ); + setDomainsIgnoredDueToNegativeDomainFilterCountsMap( new TreeMap() ); + } + + private boolean isAllowNonUniqueQuery() { + return _allow_non_unique_query; + } + + private boolean isIgnoreDufs() { + return _ignore_dufs; + } + + private boolean isIgnoreEngulfedDomains() { + return _ignore_engulfed_domains; + } + + private boolean isIgnoreVirusLikeIds() { + return _ignore_virus_like_ids; + } + + private boolean isVerbose() { + return _verbose; + } + + public List parse() throws IOException { + intitCounts(); + final Set queries = new HashSet(); + final String error = ForesterUtil.isReadableFile( getInputFile() ); + if ( !ForesterUtil.isEmpty( error ) ) { + throw new IOException( error ); + } + final BufferedReader br = new BufferedReader( new FileReader( getInputFile() ) ); + String line; + final List proteins = new 
ArrayList(); + Protein current_protein = null; + int line_number = 0; + boolean saw_double_slash = true; + boolean can_parse_domains = false; + boolean saw_parsed_for_domains = false; + boolean saw_query_sequence = false; + boolean was_not_unique = false; + final long start_time = new Date().getTime(); + while ( ( line = br.readLine() ) != null ) { + line_number++; + if ( line.length() < 1 ) { + continue; + } + else if ( line.startsWith( "Query sequence:" ) ) { + ++_proteins_encountered; + if ( !saw_double_slash ) { + throw new IOException( "unexpected format [line " + line_number + "] in [" + + getInputFile().getCanonicalPath() + "]" ); + } + saw_double_slash = false; + saw_query_sequence = true; + was_not_unique = false; + final String query = line.substring( 16 ).trim(); + if ( ForesterUtil.isEmpty( query ) ) { + throw new IOException( "query sequence cannot be empty [line " + line_number + "] in [" + + getInputFile().getCanonicalPath() + "]" ); + } + if ( queries.contains( query ) ) { + if ( !isAllowNonUniqueQuery() ) { + throw new IOException( "query \"" + query + "\" is not unique [line " + line_number + "] in [" + + getInputFile().getCanonicalPath() + "]" ); + } + else if ( isVerbose() ) { + ForesterUtil.printWarningMessage( getClass().getName(), "query \"" + query + + "\" is not unique [line " + line_number + "] in [" + + getInputFile().getCanonicalPath() + "]" ); + } + } + else { + queries.add( query ); + } + if ( current_protein != null ) { + throw new IOException( "unexpected format [line " + line_number + "] in [" + + getInputFile().getCanonicalPath() + "]" ); + } + if ( getReturnType() == ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ) { + current_protein = new BasicProtein( query, getSpecies() ); + } + else { + throw new IllegalArgumentException( "unknown return type" ); + } + } + else if ( line.startsWith( "Accession:" ) ) { + if ( !saw_query_sequence || ( current_protein == null ) ) { + throw new IOException( "unexpected format [line " + line_number + "] in [" + + getInputFile().getCanonicalPath() + "]" ); + } + ( ( BasicProtein ) current_protein ).setAccession( line.substring( 11 ).trim() ); + } + else if ( line.startsWith( "Description:" ) ) { + if ( !saw_query_sequence || ( current_protein == null ) ) { + throw new IOException( "unexpected format [line " + line_number + "] in [" + + getInputFile().getCanonicalPath() + "]" ); + } + if ( was_not_unique ) { + if ( getReturnType() == ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ) { + current_protein = new BasicProtein( current_protein.getProteinId() + " " + + line.substring( 13 ).trim(), getSpecies() ); + } + } + else { + ( ( BasicProtein ) current_protein ).setDescription( line.substring( 13 ).trim() ); + } + } + else if ( line.startsWith( "Parsed for domains:" ) ) { + if ( !saw_query_sequence ) { + throw new IOException( "unexpected format [line " + line_number + "] in [" + + getInputFile().getCanonicalPath() + "]" ); + } + saw_query_sequence = false; + saw_parsed_for_domains = true; + } + else if ( saw_parsed_for_domains && line.startsWith( "--------" ) ) { + can_parse_domains = true; + saw_parsed_for_domains = false; + } + else if ( line.startsWith( "Alignments of top-scoring domains:" ) ) { + if ( !can_parse_domains ) { + throw new IOException( "unexpected format [line " + line_number + "] in [" + + getInputFile().getCanonicalPath() + "]" ); + } + can_parse_domains = false; + } + else if ( line.startsWith( "//" ) ) { + can_parse_domains = false; + saw_double_slash = true; + if ( 
current_protein.getProteinDomains().size() > 0 ) { + if ( ( getMaxAllowedOverlap() != HmmPfamOutputParser.MAX_ALLOWED_OVERLAP_DEFAULT ) + || isIgnoreEngulfedDomains() ) { + final int domains_count = current_protein.getNumberOfProteinDomains(); + current_protein = SurfacingUtil.removeOverlappingDomains( getMaxAllowedOverlap(), + isIgnoreEngulfedDomains(), + current_protein ); + final int domains_removed = domains_count - current_protein.getNumberOfProteinDomains(); + _domains_stored -= domains_removed; + _domains_ignored_due_to_overlap += domains_removed; + } + addProtein( proteins, current_protein ); + } + current_protein = null; + } + else if ( can_parse_domains && ( line.indexOf( "[no hits above thresholds]" ) == -1 ) ) { + final String[] s = line.split( "\\s+" ); + if ( s.length != 10 ) { + throw new IOException( "unexpected format in hmmpfam output: \"" + line + "\" [line " + + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" ); + } + final String id = s[ 0 ]; + final String domain_count_str = s[ 1 ]; + final String from_str = s[ 2 ]; + final String to_str = s[ 3 ]; + final String query_match_str = s[ 4 ]; + final String hmm_match_str = s[ 7 ]; + final String score_str = s[ 8 ]; + final String e_value_str = s[ 9 ]; + int from = -1; + int to = -1; + double e_value = -1; + double score = -1; + boolean is_complete_hmm_match = false; + boolean is_complete_query_match = false; + try { + from = Integer.valueOf( from_str ).intValue(); + } + catch ( final NumberFormatException e ) { + throw new IOException( "could not parse seq-f from \"" + line + "\" [line " + line_number + + "] in [" + getInputFile().getCanonicalPath() + "]" ); + } + try { + to = Integer.valueOf( to_str ).intValue(); + } + catch ( final NumberFormatException e ) { + throw new IOException( "could not parse seq-t from \"" + line + "\" [line " + line_number + + "] in [" + getInputFile().getCanonicalPath() + "]" ); + } + try { + score = Double.valueOf( score_str ).doubleValue(); + } + catch ( final NumberFormatException e ) { + throw new IOException( "could not parse score from \"" + line + "\" [line " + line_number + + "] in [" + getInputFile().getCanonicalPath() + "]" ); + } + try { + e_value = Double.valueOf( e_value_str ).doubleValue(); + } + catch ( final NumberFormatException e ) { + throw new IOException( "could not parse E-value from \"" + line + "\" [line " + line_number + + "] in [" + getInputFile().getCanonicalPath() + "]" ); + } + if ( hmm_match_str.equals( "[]" ) ) { + is_complete_hmm_match = true; + } + else if ( !( hmm_match_str.equals( ".]" ) || hmm_match_str.equals( "[." ) || hmm_match_str + .equals( ".." ) ) ) { + throw new IOException( "unexpected format in hmmpfam output: \"" + line + "\" [line " + + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" ); + } + if ( query_match_str.equals( ".." ) ) { + is_complete_query_match = true; + } + else if ( !( query_match_str.equals( ".]" ) || query_match_str.equals( "[." 
) || query_match_str + .equals( "[]" ) ) ) { + throw new IOException( "unexpected format in hmmpfam output: \"" + line + "\" [line " + + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" ); + } + final String[] ct = domain_count_str.split( "/" ); + if ( ct.length != 2 ) { + throw new IOException( "unexpected format in hmmpfam output: \"" + line + "\" [line " + + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" ); + } + final String number_str = ct[ 0 ]; + final String total_str = ct[ 1 ]; + int number = -1; + int total = -1; + try { + number = Integer.valueOf( ( number_str ) ).intValue(); + } + catch ( final NumberFormatException e ) { + throw new IOException( "could not parse domain number from \"" + line + "\" [line " + line_number + + "] in [" + getInputFile().getCanonicalPath() + "]" ); + } + try { + total = Integer.valueOf( ( total_str ) ).intValue(); + } + catch ( final NumberFormatException e ) { + throw new IOException( "could not parse domain count from \"" + line + "\" [line " + line_number + + "] in [" + getInputFile().getCanonicalPath() + "]" ); + } + ++_domains_encountered; + boolean failed_cutoff = false; + if ( getIndividualDomainScoreCutoffs() != null ) { + if ( getIndividualDomainScoreCutoffs().containsKey( id ) ) { + final double cutoff = Double.parseDouble( getIndividualDomainScoreCutoffs().get( id ) ); + if ( score < cutoff ) { + failed_cutoff = true; + } + } + else { + throw new IOException( "could not find a score cutoff value for domain id \"" + id + + "\" [line " + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" ); + } + } + final String uc_id = id.toUpperCase(); + if ( failed_cutoff ) { + ++_domains_ignored_due_to_individual_score_cutoff; + } + else if ( ( getEValueMaximum() != HmmPfamOutputParser.E_VALUE_MAXIMUM_DEFAULT ) + && ( e_value > getEValueMaximum() ) ) { + ++_domains_ignored_due_to_e_value; + } + else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) { + ++_domains_ignored_due_to_duf; + } + else if ( isIgnoreVirusLikeIds() + && ( uc_id.contains( VIR ) || uc_id.contains( PHAGE ) || uc_id.contains( RETRO ) + || uc_id.contains( TRANSPOS ) || uc_id.startsWith( RV ) || uc_id.startsWith( GAG ) + || uc_id.startsWith( HCV ) || uc_id.startsWith( HERPES ) ) ) { + ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToVirusLikeIdCountsMap(), id ); + ++_domains_ignored_due_to_virus_like_id; + } + else if ( ( getFilterType() == FilterType.NEGATIVE_DOMAIN ) + && getFilter().contains( new DomainId( id ) ) ) { + ++_domains_ignored_due_to_negative_domain_filter; + ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToNegativeDomainFilterCountsMap(), id ); + } + else { + final BasicDomain pd = new BasicDomain( id, + from, + to, + ( short ) number, + ( short ) total, + e_value, + score ); + current_protein.addProteinDomain( pd ); + ++_domains_stored; + } + } + } // while ( ( line = br.readLine() ) != null ) + setTime( new Date().getTime() - start_time ); + if ( !saw_double_slash ) { + throw new IOException( "file ends unexpectedly [line " + line_number + "]" ); + } + return proteins; + } + + public void setAllowNonUniqueQuery( final boolean allow_non_unique_query ) { + _allow_non_unique_query = allow_non_unique_query; + } + + private void setDomainsEncountered( final int domains_encountered ) { + _domains_encountered = domains_encountered; + } + + private void setDomainsIgnoredDueToDuf( final int domains_ignored_due_to_duf ) { + _domains_ignored_due_to_duf = domains_ignored_due_to_duf; + } + + public void 
setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) { + _domains_ignored_due_to_e_value = domains_ignored_due_to_e_value; + } + + public void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) { + _domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff; + } + + private void setDomainsIgnoredDueToNegativeDomainFilterCountsMap( final Map domains_ignored_due_to_negative_domain_filter_counts_map ) { + _domains_ignored_due_to_negative_domain_filter_counts_map = domains_ignored_due_to_negative_domain_filter_counts_map; + } + + private void setDomainsIgnoredDueToNegativeFilter( final int domains_ignored_due_to_negative_domain_filter ) { + _domains_ignored_due_to_negative_domain_filter = domains_ignored_due_to_negative_domain_filter; + } + + private void setDomainsIgnoredDueToOverlap( final int domains_ignored_due_to_overlap ) { + _domains_ignored_due_to_overlap = domains_ignored_due_to_overlap; + } + + private void setDomainsIgnoredDueToVirusLikeId( final int i ) { + _domains_ignored_due_to_virus_like_id = i; + } + + private void setDomainsIgnoredDueToVirusLikeIdCountsMap( final Map domains_ignored_due_to_virus_like_id_counts_map ) { + _domains_ignored_due_to_virus_like_id_counts_map = domains_ignored_due_to_virus_like_id_counts_map; + } + + private void setDomainsStored( final int domains_stored ) { + _domains_stored = domains_stored; + } + + private void setDomainsStoredSet( final SortedSet _storeddomains_stored ) { + _domains_stored_set = _storeddomains_stored; + } + + public void setEValueMaximum( final double e_value_maximum ) { + if ( e_value_maximum < 0.0 ) { + throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" ); + } + _e_value_maximum = e_value_maximum; + } + + public void setIgnoreDufs( final boolean ignore_dufs ) { + _ignore_dufs = ignore_dufs; + } + + /** + * To ignore domains which are completely engulfed by domains (individual + * ones or stretches of overlapping ones) with better support values. + * + * + * @param ignored_engulfed_domains + */ + public void setIgnoreEngulfedDomains( final boolean ignore_engulfed_domains ) { + _ignore_engulfed_domains = ignore_engulfed_domains; + } + + public void setIgnoreVirusLikeIds( final boolean ignore_virus_like_ids ) { + _ignore_virus_like_ids = ignore_virus_like_ids; + } + + /** + * Sets the individual domain score cutoff values (for example, gathering + * thresholds from Pfam). Domain ids are the keys, cutoffs the values. + * + * @param individual_domain_score_cutoffs + */ + public void setIndividualDomainScoreCutoffs( final Map individual_domain_score_cutoffs ) { + _individual_domain_score_cutoffs = individual_domain_score_cutoffs; + } + + public void setMaxAllowedOverlap( final int max_allowed_overlap ) { + if ( max_allowed_overlap < 0 ) { + throw new IllegalArgumentException( "Attempt to set max allowed overlap to less than zero." 
); + } + _max_allowed_overlap = max_allowed_overlap; + } + + private void setProteinsEncountered( final int proteins_encountered ) { + _proteins_encountered = proteins_encountered; + } + + private void setProteinsIgnoredDueToFilter( final int proteins_ignored_due_to_filter ) { + _proteins_ignored_due_to_filter = proteins_ignored_due_to_filter; + } + + private void setProteinsStored( final int proteins_stored ) { + _proteins_stored = proteins_stored; + } + + public void setReturnType( final ReturnType return_type ) { + _return_type = return_type; + } + + private void setTime( final long time ) { + _time = time; + } + + public void setVerbose( final boolean verbose ) { + _verbose = verbose; + } + + public static enum FilterType { + NONE, POSITIVE_PROTEIN, NEGATIVE_PROTEIN, NEGATIVE_DOMAIN + } + + public static enum ReturnType { + UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN + } +} diff --git a/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java b/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java new file mode 100644 index 0000000..a63ba50 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java @@ -0,0 +1,595 @@ +// $Id: +// $ +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedSet; +import java.util.TreeMap; +import java.util.TreeSet; + +import org.forester.surfacing.BasicDomain; +import org.forester.surfacing.BasicProtein; +import org.forester.surfacing.Domain; +import org.forester.surfacing.DomainId; +import org.forester.surfacing.Protein; +import org.forester.surfacing.SurfacingUtil; +import org.forester.util.ForesterUtil; + +public final class HmmscanPerDomainTableParser { + + private static final String RETRO = "RETRO"; + private static final String PHAGE = "PHAGE"; + private static final String VIR = "VIR"; + private static final String TRANSPOS = "TRANSPOS"; + private static final String RV = "RV"; + private static final String GAG = "GAG_"; + private static final String HCV = "HCV_"; + private static final String HERPES = "HERPES_"; + private static final String BACULO = "BACULO_"; + private static final int E_VALUE_MAXIMUM_DEFAULT = -1; + private static final ReturnType RETURN_TYPE_DEFAULT = ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN; + private static final boolean IGNORE_DUFS_DEFAULT = false; + private static final int MAX_ALLOWED_OVERLAP_DEFAULT = -1; + private final Set _filter; + private final FilterType _filter_type; + private final File _input_file; + private final String _species; + private double _e_value_maximum; + private Map _individual_score_cutoffs; + private boolean _ignore_dufs; + private boolean _ignore_virus_like_ids; + private int _max_allowed_overlap; + private boolean _ignore_engulfed_domains; + private ReturnType _return_type; + private int _proteins_encountered; + private int _proteins_ignored_due_to_filter; + private int _proteins_stored; + private int _domains_encountered; + private int _domains_ignored_due_to_duf; + private int _domains_ignored_due_to_overlap; + private int _domains_ignored_due_to_e_value; + private int _domains_ignored_due_to_individual_score_cutoff; + private int _domains_stored; + private SortedSet _domains_stored_set; + private long _time; + private int _domains_ignored_due_to_negative_domain_filter; + private Map _domains_ignored_due_to_negative_domain_filter_counts_map; + private int _domains_ignored_due_to_virus_like_id; + private Map _domains_ignored_due_to_virus_like_id_counts_map; + private final INDIVIDUAL_SCORE_CUTOFF _ind_cutoff; + + public HmmscanPerDomainTableParser( final File input_file, + final String species, + final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to ) { + _input_file = input_file; + _species = species; + _filter = null; + _filter_type = FilterType.NONE; + _ind_cutoff = individual_cutoff_applies_to; + init(); + } + + public HmmscanPerDomainTableParser( final File input_file, + final String species, + final Set filter, + final FilterType filter_type, + final INDIVIDUAL_SCORE_CUTOFF individual_cutoff_applies_to ) { + _input_file = input_file; + _species = species; + _filter = filter; + _filter_type = filter_type; + _ind_cutoff = individual_cutoff_applies_to; + init(); + } + + private void actuallyAddProtein( final List proteins, final Protein current_protein ) { + final List l = current_protein.getProteinDomains(); + for( final Domain d : l ) { + getDomainsStoredSet().add( d.getDomainId() ); + } + 
proteins.add( current_protein ); + ++_proteins_stored; + } + + private void addProtein( final List proteins, Protein current_protein ) { + if ( ( getMaxAllowedOverlap() != HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT ) + || isIgnoreEngulfedDomains() ) { + final int domains_count = current_protein.getNumberOfProteinDomains(); + current_protein = SurfacingUtil.removeOverlappingDomains( getMaxAllowedOverlap(), + isIgnoreEngulfedDomains(), + current_protein ); + final int domains_removed = domains_count - current_protein.getNumberOfProteinDomains(); + _domains_stored -= domains_removed; + _domains_ignored_due_to_overlap += domains_removed; + } + if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN ) || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) { + final Set domain_ids_in_protein = new HashSet(); + for( final Domain d : current_protein.getProteinDomains() ) { + domain_ids_in_protein.add( d.getDomainId() ); + } + domain_ids_in_protein.retainAll( getFilter() ); + if ( getFilterType() == FilterType.POSITIVE_PROTEIN ) { + if ( domain_ids_in_protein.size() > 0 ) { + actuallyAddProtein( proteins, current_protein ); + } + else { + ++_proteins_ignored_due_to_filter; + } + } + else { + if ( domain_ids_in_protein.size() < 1 ) { + actuallyAddProtein( proteins, current_protein ); + } + else { + ++_proteins_ignored_due_to_filter; + } + } + } + else { + actuallyAddProtein( proteins, current_protein ); + } + } + + public int getDomainsEncountered() { + return _domains_encountered; + } + + public int getDomainsIgnoredDueToDuf() { + return _domains_ignored_due_to_duf; + } + + public int getDomainsIgnoredDueToEval() { + return _domains_ignored_due_to_e_value; + } + + public int getDomainsIgnoredDueToIndividualScoreCutoff() { + return _domains_ignored_due_to_individual_score_cutoff; + } + + public int getDomainsIgnoredDueToNegativeDomainFilter() { + return _domains_ignored_due_to_negative_domain_filter; + } + + public Map getDomainsIgnoredDueToNegativeDomainFilterCountsMap() { + return _domains_ignored_due_to_negative_domain_filter_counts_map; + } + + public int getDomainsIgnoredDueToOverlap() { + return _domains_ignored_due_to_overlap; + } + + public Map getDomainsIgnoredDueToVirusLikeIdCountsMap() { + return _domains_ignored_due_to_virus_like_id_counts_map; + } + + public int getDomainsIgnoredDueToVirusLikeIds() { + return _domains_ignored_due_to_virus_like_id; + } + + public int getDomainsStored() { + return _domains_stored; + } + + public SortedSet getDomainsStoredSet() { + return _domains_stored_set; + } + + private double getEValueMaximum() { + return _e_value_maximum; + } + + private Set getFilter() { + return _filter; + } + + private FilterType getFilterType() { + return _filter_type; + } + + public INDIVIDUAL_SCORE_CUTOFF getIndividualCutoffAppliesTo() { + return _ind_cutoff; + } + + private Map getIndividualScoreCutoffs() { + return _individual_score_cutoffs; + } + + private File getInputFile() { + return _input_file; + } + + private int getMaxAllowedOverlap() { + return _max_allowed_overlap; + } + + public int getProteinsEncountered() { + return _proteins_encountered; + } + + public int getProteinsIgnoredDueToFilter() { + return _proteins_ignored_due_to_filter; + } + + public int getProteinsStored() { + return _proteins_stored; + } + + private ReturnType getReturnType() { + return _return_type; + } + + private String getSpecies() { + return _species; + } + + public long getTime() { + return _time; + } + + private void init() { + _e_value_maximum = 
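// Standalone illustration of the protein-level filter decision used in addProtein() above:
// a POSITIVE_PROTEIN filter keeps a protein if it shares at least one domain id with the
// filter set, a NEGATIVE_PROTEIN filter keeps it only if it shares none. Plain strings are
// used instead of DomainId purely to keep the sketch self-contained.

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

final class ProteinFilterSketch {

    static boolean keep( final Set<String> domains_in_protein,
                         final Set<String> filter,
                         final boolean positive ) {
        final Set<String> shared = new HashSet<String>( domains_in_protein );
        shared.retainAll( filter ); // same set operation as in addProtein()
        return positive ? !shared.isEmpty() : shared.isEmpty();
    }

    public static void main( final String[] args ) {
        final Set<String> protein = new HashSet<String>( Arrays.asList( "Ion_trans", "PKD_channel" ) );
        final Set<String> filter = new HashSet<String>( Arrays.asList( "Ion_trans" ) );
        System.out.println( keep( protein, filter, true ) );  // true: shares Ion_trans
        System.out.println( keep( protein, filter, false ) ); // false: negative filter rejects it
    }
}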
HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT; + setIgnoreDufs( HmmscanPerDomainTableParser.IGNORE_DUFS_DEFAULT ); + setReturnType( HmmscanPerDomainTableParser.RETURN_TYPE_DEFAULT ); + _max_allowed_overlap = HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT; + setIndividualScoreCutoffs( null ); + setIgnoreEngulfedDomains( false ); + setIgnoreVirusLikeIds( false ); + intitCounts(); + } + + private void intitCounts() { + setDomainsStoredSet( new TreeSet() ); + setDomainsEncountered( 0 ); + setProteinsEncountered( 0 ); + setProteinsIgnoredDueToFilter( 0 ); + setDomainsIgnoredDueToNegativeFilter( 0 ); + setDomainsIgnoredDueToDuf( 0 ); + setDomainsIgnoredDueToEval( 0 ); + setDomainsIgnoredDueToIndividualScoreCutoff( 0 ); + setDomainsIgnoredDueToVirusLikeId( 0 ); + setDomainsIgnoredDueToOverlap( 0 ); + setDomainsStored( 0 ); + setProteinsStored( 0 ); + setTime( 0 ); + setDomainsIgnoredDueToVirusLikeIdCountsMap( new TreeMap() ); + setDomainsIgnoredDueToNegativeDomainFilterCountsMap( new TreeMap() ); + } + + private boolean isIgnoreDufs() { + return _ignore_dufs; + } + + private boolean isIgnoreEngulfedDomains() { + return _ignore_engulfed_domains; + } + + private boolean isIgnoreVirusLikeIds() { + return _ignore_virus_like_ids; + } + + public List parse() throws IOException { + if ( ( getIndividualCutoffAppliesTo() != INDIVIDUAL_SCORE_CUTOFF.NONE ) + && ( ( getIndividualScoreCutoffs() == null ) || ( getIndividualScoreCutoffs().size() < 1 ) ) ) { + throw new RuntimeException( "attempt to use individual cuttoffs with having set them" ); + } + intitCounts(); + final Set prev_queries = new HashSet(); + final String error = ForesterUtil.isReadableFile( getInputFile() ); + if ( !ForesterUtil.isEmpty( error ) ) { + throw new IOException( error ); + } + final BufferedReader br = new BufferedReader( new FileReader( getInputFile() ) ); + String line; + final List proteins = new ArrayList(); + Protein current_protein = null; + int line_number = 0; + final long start_time = new Date().getTime(); + String prev_query = ""; + int prev_qlen = -1; + while ( ( line = br.readLine() ) != null ) { + line_number++; + if ( ForesterUtil.isEmpty( line ) || line.startsWith( "#" ) ) { + continue; + } + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 + // # --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord + // # target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of target + // #------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- --------------------- + // Ion_trans PF00520.24 201 jgi|Nemve1|7|gw.28.1.1 - 1604 6.3e-169 557.4 95.3 1 4 1.5e-41 3e-38 130.8 11.1 3 171 140 307 139 346 0.81 Ion transport protein + // Ion_trans PF00520.24 201 jgi|Nemve1|7|gw.28.1.1 - 1604 6.3e-169 557.4 95.3 2 4 9.1e-45 1.8e-41 141.3 13.1 4 200 479 664 476 665 0.97 Ion transport protein + // Ion_trans PF00520.24 201 jgi|Nemve1|7|gw.28.1.1 - 1604 6.3e-169 557.4 95.3 3 4 5.2e-45 1e-41 142.1 14.0 1 201 900 1117 900 1117 0.96 Ion transport protein + // Ion_trans PF00520.24 201 jgi|Nemve1|7|gw.28.1.1 - 1604 6.3e-169 557.4 95.3 4 4 9.2e-51 1.8e-47 160.9 11.3 1 201 1217 1423 1217 1423 0.97 Ion transport protein + // PKD_channel PF08016.5 426 jgi|Nemve1|7|gw.28.1.1 - 1604 5.9e-19 67.4 70.5 1 8 0.00053 1.1 7.3 0.4 220 264 142 191 134 200 0.73 Polycystin cation channel + 
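// The per-domain table is whitespace delimited; the code below splits each data line and
// reads fixed token positions. This sketch splits the first example line quoted above and
// prints the columns the parser actually uses (indices as in the code that follows).

final class DomtbloutColumnsSketch {

    public static void main( final String[] args ) {
        final String line = "Ion_trans PF00520.24 201 jgi|Nemve1|7|gw.28.1.1 - 1604 6.3e-169 557.4 95.3 "
                + "1 4 1.5e-41 3e-38 130.8 11.1 3 171 140 307 139 346 0.81 Ion transport protein";
        final String[] tokens = line.split( "\\s+" );
        System.out.println( "target id      : " + tokens[ 0 ] );  // Ion_trans
        System.out.println( "query          : " + tokens[ 3 ] );  // jgi|Nemve1|7|gw.28.1.1
        System.out.println( "qlen           : " + tokens[ 5 ] );  // 1604
        System.out.println( "full-seq E-val : " + tokens[ 6 ] );  // 6.3e-169
        System.out.println( "full-seq score : " + tokens[ 7 ] );  // 557.4
        System.out.println( "domain # / of  : " + tokens[ 9 ] + "/" + tokens[ 10 ] ); // 1/4
        System.out.println( "i-Evalue       : " + tokens[ 12 ] ); // 3e-38
        System.out.println( "domain score   : " + tokens[ 13 ] ); // 130.8
        System.out.println( "ali from-to    : " + tokens[ 17 ] + "-" + tokens[ 18 ] ); // 140-307
    }
}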
final String tokens[] = line.split( "\\s+" ); + final String target_id = tokens[ 0 ]; + final String target_acc = tokens[ 1 ]; + final int tlen = parseInt( tokens[ 2 ], line_number, "tlen" ); + final String query = tokens[ 3 ]; + final String query_acc = tokens[ 4 ]; + final int qlen = parseInt( tokens[ 5 ], line_number, "qlen" ); + final double fs_e_value = parseDouble( tokens[ 6 ], line_number, "E-value" ); + final double fs_score = parseDouble( tokens[ 7 ], line_number, "score" ); + final int domain_number = parseInt( tokens[ 9 ], line_number, "count" ); + final int total_domains = parseInt( tokens[ 10 ], line_number, "total" ); + final double c_e_value = parseDouble( tokens[ 11 ], line_number, "c-Evalue" ); + final double i_e_value = parseDouble( tokens[ 12 ], line_number, "i-Evalue" ); + final double domain_score = parseDouble( tokens[ 13 ], line_number, "score" ); + final int hmm_from = parseInt( tokens[ 15 ], line_number, "hmm from" ); + final int hmm_to = parseInt( tokens[ 16 ], line_number, "hmm to" ); + final int ali_from = parseInt( tokens[ 17 ], line_number, "ali from" ); + final int ali_to = parseInt( tokens[ 18 ], line_number, "ali to" ); + final int env_from = parseInt( tokens[ 19 ], line_number, "env from" ); + final int env_to = parseInt( tokens[ 20 ], line_number, "env to" ); + ++_domains_encountered; + if ( !query.equals( prev_query ) || ( qlen != prev_qlen ) ) { + if ( query.equals( prev_query ) ) { + throw new IOException( "more than one protein named [" + query + "]" + " lengths: " + qlen + ", " + + prev_qlen ); + } + if ( prev_queries.contains( query ) ) { + throw new IOException( "more than one protein named [" + query + "]" ); + } + prev_query = query; + prev_qlen = qlen; + prev_queries.add( query ); + if ( ( current_protein != null ) && ( current_protein.getProteinDomains().size() > 0 ) ) { + addProtein( proteins, current_protein ); + } + if ( getReturnType() == ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ) { + current_protein = new BasicProtein( query, getSpecies() ); + } + else { + throw new IllegalArgumentException( "unknown return type" ); + } + } + boolean failed_cutoff = false; + if ( getIndividualCutoffAppliesTo() != INDIVIDUAL_SCORE_CUTOFF.NONE ) { + if ( getIndividualScoreCutoffs().containsKey( target_id ) ) { + final double cutoff = getIndividualScoreCutoffs().get( target_id ); + if ( getIndividualCutoffAppliesTo() != INDIVIDUAL_SCORE_CUTOFF.FULL_SEQUENCE ) { + if ( fs_score < cutoff ) { + failed_cutoff = true; + } + } + else if ( getIndividualCutoffAppliesTo() != INDIVIDUAL_SCORE_CUTOFF.DOMAIN ) { + if ( domain_score < cutoff ) { + failed_cutoff = true; + } + } + } + else { + throw new IOException( "could not find a score cutoff value for domain id \"" + target_id + + "\" [line " + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" ); + } + } + final String uc_id = target_id.toUpperCase(); + if ( failed_cutoff ) { + ++_domains_ignored_due_to_individual_score_cutoff; + } + else if ( ali_from == ali_to ) { + //Ignore + } + else if ( ( getEValueMaximum() != HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT ) + && ( fs_e_value > getEValueMaximum() ) ) { + ++_domains_ignored_due_to_e_value; + } + else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) { + ++_domains_ignored_due_to_duf; + } + else if ( isIgnoreVirusLikeIds() + && ( uc_id.contains( VIR ) || uc_id.contains( PHAGE ) || uc_id.contains( RETRO ) + || uc_id.contains( TRANSPOS ) || uc_id.startsWith( RV ) || uc_id.startsWith( GAG ) + || uc_id.startsWith( HCV ) || 
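// Compact sketch of the individual score cutoff decision made above. It is written with
// '==' comparisons on the cutoff mode, which appears to be the intent suggested by the enum
// names (the committed code compares with '!='). The cutoff map and values are illustrative,
// for example Pfam gathering thresholds keyed by model name.

import java.util.HashMap;
import java.util.Map;

import org.forester.io.parsers.HmmscanPerDomainTableParser.INDIVIDUAL_SCORE_CUTOFF;

final class ScoreCutoffSketch {

    static boolean failsCutoff( final INDIVIDUAL_SCORE_CUTOFF mode,
                                final Map<String, Double> cutoffs,
                                final String model,
                                final double full_seq_score,
                                final double domain_score ) {
        if ( mode == INDIVIDUAL_SCORE_CUTOFF.NONE ) {
            return false;
        }
        final double cutoff = cutoffs.get( model );
        final double score = ( mode == INDIVIDUAL_SCORE_CUTOFF.FULL_SEQUENCE ) ? full_seq_score : domain_score;
        return score < cutoff;
    }

    public static void main( final String[] args ) {
        final Map<String, Double> cutoffs = new HashMap<String, Double>();
        cutoffs.put( "Ion_trans", 21.1 ); // illustrative per-model cutoff
        System.out.println( failsCutoff( INDIVIDUAL_SCORE_CUTOFF.DOMAIN, cutoffs, "Ion_trans", 557.4, 130.8 ) ); // false
    }
}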
uc_id.startsWith( HERPES ) || uc_id.startsWith( BACULO ) ) ) { + ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToVirusLikeIdCountsMap(), target_id ); + ++_domains_ignored_due_to_virus_like_id; + } + else if ( ( getFilterType() == FilterType.NEGATIVE_DOMAIN ) + && getFilter().contains( new DomainId( target_id ) ) ) { + ++_domains_ignored_due_to_negative_domain_filter; + ForesterUtil.increaseCountingMap( getDomainsIgnoredDueToNegativeDomainFilterCountsMap(), target_id ); + } + else { + try { + final Domain pd = new BasicDomain( target_id, + ali_from, + ali_to, + ( short ) domain_number, + ( short ) total_domains, + fs_e_value, + fs_score, + i_e_value, + domain_score ); + current_protein.addProteinDomain( pd ); + } + catch ( final IllegalArgumentException e ) { + throw new IOException( "problem with domain parsing at line " + line_number + "[" + line + "]: " + + e.getMessage() ); + } + ++_domains_stored; + } + } // while ( ( line = br.readLine() ) != null ) + if ( ( current_protein != null ) && ( current_protein.getProteinDomains().size() > 0 ) ) { + addProtein( proteins, current_protein ); + } + setProteinsEncountered( prev_queries.size() ); + setTime( new Date().getTime() - start_time ); + return proteins; + } + + private double parseDouble( final String double_str, final int line_number, final String label ) throws IOException { + double d = -1; + try { + d = Double.valueOf( double_str ).doubleValue(); + } + catch ( final NumberFormatException e ) { + throw new IOException( "could not parse \" +label + \" from \"" + double_str + "\" [line " + line_number + + "] in [" + getInputFile().getCanonicalPath() + "]" ); + } + return d; + } + + private int parseInt( final String double_str, final int line_number, final String label ) throws IOException { + int i = -1; + try { + i = Integer.valueOf( double_str ).intValue(); + } + catch ( final NumberFormatException e ) { + throw new IOException( "could not parse \"" + label + "\" from \"" + double_str + "\" [line " + line_number + + "] in [" + getInputFile().getCanonicalPath() + "]" ); + } + return i; + } + + private void setDomainsEncountered( final int domains_encountered ) { + _domains_encountered = domains_encountered; + } + + private void setDomainsIgnoredDueToDuf( final int domains_ignored_due_to_duf ) { + _domains_ignored_due_to_duf = domains_ignored_due_to_duf; + } + + public void setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) { + _domains_ignored_due_to_e_value = domains_ignored_due_to_e_value; + } + + public void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) { + _domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff; + } + + private void setDomainsIgnoredDueToNegativeDomainFilterCountsMap( final Map domains_ignored_due_to_negative_domain_filter_counts_map ) { + _domains_ignored_due_to_negative_domain_filter_counts_map = domains_ignored_due_to_negative_domain_filter_counts_map; + } + + private void setDomainsIgnoredDueToNegativeFilter( final int domains_ignored_due_to_negative_domain_filter ) { + _domains_ignored_due_to_negative_domain_filter = domains_ignored_due_to_negative_domain_filter; + } + + private void setDomainsIgnoredDueToOverlap( final int domains_ignored_due_to_overlap ) { + _domains_ignored_due_to_overlap = domains_ignored_due_to_overlap; + } + + private void setDomainsIgnoredDueToVirusLikeId( final int i ) { + _domains_ignored_due_to_virus_like_id = i; + } + + private void 
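// Minimal standalone sketch of the numeric parsing helpers above, with the error message
// built the way parseInt() does it (in parseDouble() the label accidentally ends up inside
// the string literal instead of being concatenated). The label and values are illustrative.

import java.io.IOException;

final class NumberParseSketch {

    static double parseDouble( final String s, final int line_number, final String label ) throws IOException {
        try {
            return Double.parseDouble( s );
        }
        catch ( final NumberFormatException e ) {
            throw new IOException( "could not parse \"" + label + "\" from \"" + s + "\" [line " + line_number
                    + "]" );
        }
    }

    public static void main( final String[] args ) throws IOException {
        System.out.println( parseDouble( "6.3e-169", 7, "E-value" ) ); // 6.3E-169
    }
}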
setDomainsIgnoredDueToVirusLikeIdCountsMap( final Map domains_ignored_due_to_virus_like_id_counts_map ) { + _domains_ignored_due_to_virus_like_id_counts_map = domains_ignored_due_to_virus_like_id_counts_map; + } + + private void setDomainsStored( final int domains_stored ) { + _domains_stored = domains_stored; + } + + private void setDomainsStoredSet( final SortedSet _storeddomains_stored ) { + _domains_stored_set = _storeddomains_stored; + } + + public void setEValueMaximum( final double e_value_maximum ) { + if ( e_value_maximum < 0.0 ) { + throw new IllegalArgumentException( "attempt to set the maximum E-value to a negative value" ); + } + _e_value_maximum = e_value_maximum; + } + + public void setIgnoreDufs( final boolean ignore_dufs ) { + _ignore_dufs = ignore_dufs; + } + + /** + * To ignore domains which are completely engulfed by domains (individual + * ones or stretches of overlapping ones) with better support values. + * + * + * @param ignored_engulfed_domains + */ + public void setIgnoreEngulfedDomains( final boolean ignore_engulfed_domains ) { + _ignore_engulfed_domains = ignore_engulfed_domains; + } + + public void setIgnoreVirusLikeIds( final boolean ignore_virus_like_ids ) { + _ignore_virus_like_ids = ignore_virus_like_ids; + } + + /** + * Sets the individual score cutoff values (for example, gathering + * thresholds from Pfam). Domain ids are the keys, cutoffs the values. + * + * @param individual_score_cutoffs + */ + public void setIndividualScoreCutoffs( final Map individual_score_cutoffs ) { + _individual_score_cutoffs = individual_score_cutoffs; + } + + public void setMaxAllowedOverlap( final int max_allowed_overlap ) { + if ( max_allowed_overlap < 0 ) { + throw new IllegalArgumentException( "Attempt to set max allowed overlap to less than zero." ); + } + _max_allowed_overlap = max_allowed_overlap; + } + + private void setProteinsEncountered( final int proteins_encountered ) { + _proteins_encountered = proteins_encountered; + } + + private void setProteinsIgnoredDueToFilter( final int proteins_ignored_due_to_filter ) { + _proteins_ignored_due_to_filter = proteins_ignored_due_to_filter; + } + + private void setProteinsStored( final int proteins_stored ) { + _proteins_stored = proteins_stored; + } + + public void setReturnType( final ReturnType return_type ) { + _return_type = return_type; + } + + private void setTime( final long time ) { + _time = time; + } + + public static enum FilterType { + NONE, POSITIVE_PROTEIN, NEGATIVE_PROTEIN, NEGATIVE_DOMAIN + } + + static public enum INDIVIDUAL_SCORE_CUTOFF { + FULL_SEQUENCE, DOMAIN, NONE; + } + + public static enum ReturnType { + UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN + } +} diff --git a/forester/java/src/org/forester/io/parsers/PhylogenyParser.java b/forester/java/src/org/forester/io/parsers/PhylogenyParser.java new file mode 100644 index 0000000..d319d96 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/PhylogenyParser.java @@ -0,0 +1,44 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. 
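// End-to-end usage sketch for HmmscanPerDomainTableParser as defined above: configure the
// optional switches, parse, and read back the bookkeeping counters. The List element type
// (Protein) is an assumption based on how the list is filled in parse(); the file name,
// species code and thresholds are illustrative.

import java.io.File;
import java.io.IOException;
import java.util.List;

import org.forester.io.parsers.HmmscanPerDomainTableParser;
import org.forester.io.parsers.HmmscanPerDomainTableParser.INDIVIDUAL_SCORE_CUTOFF;
import org.forester.surfacing.Protein;

final class HmmscanParserUsageSketch {

    public static void main( final String[] args ) throws IOException {
        final HmmscanPerDomainTableParser parser =
                new HmmscanPerDomainTableParser( new File( "Nemve1.domtblout" ), "NEMVE", INDIVIDUAL_SCORE_CUTOFF.NONE );
        parser.setEValueMaximum( 1e-5 );       // drop weak full-sequence hits
        parser.setIgnoreDufs( true );          // skip domains of unknown function
        parser.setMaxAllowedOverlap( 10 );     // resolve overlapping domains
        parser.setIgnoreEngulfedDomains( true );
        final List<Protein> proteins = parser.parse();
        System.out.println( "proteins encountered: " + parser.getProteinsEncountered() );
        System.out.println( "proteins stored     : " + parser.getProteinsStored() );
        System.out.println( "domains stored      : " + parser.getDomainsStored() );
    }
}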
+// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers; + +import java.io.IOException; + +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.Phylogeny; + +/* + * @author Christian Zmasek + * + * TODO To change the template for this generated type comment go to Window - + * Preferences - Java - Code Style - Code Templates + */ +public interface PhylogenyParser { + + public Phylogeny[] parse() throws IOException; + + public void setSource( Object source ) throws PhylogenyParserException, IOException; +} diff --git a/forester/java/src/org/forester/io/parsers/SymmetricalDistanceMatrixParser.java b/forester/java/src/org/forester/io/parsers/SymmetricalDistanceMatrixParser.java new file mode 100644 index 0000000..b9df246 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/SymmetricalDistanceMatrixParser.java @@ -0,0 +1,196 @@ +// $Id: +// Exp $ +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers; + +import java.io.IOException; +import java.util.List; + +import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix; +import org.forester.evoinference.matrix.distance.DistanceMatrix; +import org.forester.util.BasicTable; +import org.forester.util.BasicTableParser; +import org.forester.util.ForesterUtil; + +/* + * This can read full, lower triangular, and upper triangular distance matrices. + * In the case of a full matrix, the lower triangular values are used. Format + * (by example): id1 0 id2 0.3 0 id3 0.4 0.4 0 + * + * OR + * + * id1 id2 0.3 id3 0.4 0.4 + * + * Numbers before are after the data are ignored. 
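// The PhylogenyParser contract above is just two methods: accept a source object, then
// return all phylogenies found in it. This hypothetical single-node parser illustrates the
// shape of an implementation; it is not one of the parsers in this commit.

import java.io.IOException;

import org.forester.io.parsers.PhylogenyParser;
import org.forester.io.parsers.util.PhylogenyParserException;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyNode;

final class SingleNodeParserSketch implements PhylogenyParser {

    private Object _source;

    public void setSource( final Object source ) throws PhylogenyParserException, IOException {
        if ( source == null ) {
            throw new PhylogenyParserException( "attempt to parse null object" );
        }
        _source = source;
    }

    public Phylogeny[] parse() throws IOException {
        // Treat the source's string form as the name of a single external node.
        final Phylogeny p = new Phylogeny();
        final PhylogenyNode root = new PhylogenyNode();
        root.setName( _source.toString() );
        p.setRoot( root );
        return new Phylogeny[] { p };
    }
}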
+ * + * + * + * + * @author Christian M Zmasek + */ +public class SymmetricalDistanceMatrixParser { + + private final static InputMatrixType INPUT_MATRIX_TYPE_DEFAULT = InputMatrixType.LOWER_TRIANGLE; + private final static String COMMENT = "#"; + private final static String VALUE_SEPARATOR = " "; + private int _matrix_size; + private InputMatrixType _input_matrix_type; + + private SymmetricalDistanceMatrixParser() { + init(); + } + + private void checkValueIsZero( final BasicTable table, final int row, final int i, final int start_row ) + throws IOException { + double d = 0.0; + final String table_value = table.getValue( i, row + start_row ); + if ( ForesterUtil.isEmpty( table_value ) ) { + throw new IOException( "value is null or empty at [" + ( i - 1 ) + ", " + row + "]" ); + } + try { + d = Double.parseDouble( table_value ); + } + catch ( final NumberFormatException e ) { + throw new IOException( "illegal format for distance [" + table_value + "] at [" + ( i - 1 ) + ", " + row + + "]" ); + } + if ( !ForesterUtil.isEqual( 0.0, d ) ) { + throw new IOException( "attempt to use non-zero diagonal value [" + table_value + "] at [" + ( i - 1 ) + + ", " + row + "]" ); + } + } + + private InputMatrixType getInputMatrixType() { + return _input_matrix_type; + } + + private int getMatrixSize() { + return _matrix_size; + } + + private void init() { + setInputMatrixType( INPUT_MATRIX_TYPE_DEFAULT ); + reset(); + } + + public DistanceMatrix[] parse( final Object source ) throws IOException { + reset(); + final List> tables = BasicTableParser.parse( source, VALUE_SEPARATOR, false, COMMENT, true ); + final DistanceMatrix[] distance_matrices = new DistanceMatrix[ tables.size() ]; + int i = 0; + for( final BasicTable table : tables ) { + distance_matrices[ i++ ] = transform( table ); + } + return distance_matrices; + } + + private void reset() { + setMatrixSize( -1 ); + } + + public void setInputMatrixType( final InputMatrixType input_matrix_type ) { + _input_matrix_type = input_matrix_type; + } + + private void setMatrixSize( final int matrix_size ) { + _matrix_size = matrix_size; + } + + private void transferValue( final BasicTable table, + final DistanceMatrix distance_matrix, + final int row, + final int col, + final int start_row, + final int col_offset ) throws IOException { + double d = 0.0; + final String table_value = table.getValue( col, row + start_row ); + if ( ForesterUtil.isEmpty( table_value ) ) { + throw new IOException( "value is null or empty at [" + ( col - 1 ) + ", " + row + "]" ); + } + try { + d = Double.parseDouble( table_value ); + } + catch ( final NumberFormatException e ) { + throw new IOException( "illegal format for distance [" + table_value + "] at [" + ( col - 1 ) + ", " + row + + "]" ); + } + distance_matrix.setValue( col - 1 + col_offset, row, d ); + } + + private DistanceMatrix transform( final BasicTable table ) throws IllegalArgumentException, IOException { + boolean first_line_is_size = false; + if ( table.getNumberOfColumns() < 3 ) { + throw new IllegalArgumentException( "attempt to create distance matrix with with less than 3 columns [columns: " + + table.getNumberOfColumns() + ", rows: " + table.getNumberOfRows() + "]" ); + } + if ( table.getNumberOfColumns() == table.getNumberOfRows() ) { + first_line_is_size = true; + } + else if ( table.getNumberOfColumns() != table.getNumberOfRows() + 1 ) { + throw new IllegalArgumentException( "attempt to create distance matrix with illegal dimensions [columns: " + + table.getNumberOfColumns() + ", rows: " + 
table.getNumberOfRows() + "]" ); + } + final DistanceMatrix distance_matrix = new BasicSymmetricalDistanceMatrix( table.getNumberOfColumns() - 1 ); + int start_row = 0; + if ( first_line_is_size ) { + start_row = 1; + } + for( int row = 0; row < table.getNumberOfRows() - start_row; row++ ) { + distance_matrix.setIdentifier( row, table.getValue( 0, row + start_row ) ); + switch ( getInputMatrixType() ) { + case LOWER_TRIANGLE: + for( int col = 1; col <= row; ++col ) { + transferValue( table, distance_matrix, row, col, start_row, 0 ); + } + checkValueIsZero( table, row, row + 1, start_row ); + break; + case UPPER_TRIANGLE: + for( int col = 1; col < ( table.getNumberOfColumns() - row ); ++col ) { + transferValue( table, distance_matrix, row, col, start_row, row ); + } + break; + default: + throw new AssertionError( "unkwnown input matrix type [" + getInputMatrixType() + "]" ); + } + } + if ( getMatrixSize() < 1 ) { + setMatrixSize( distance_matrix.getSize() ); + } + else if ( getMatrixSize() != distance_matrix.getSize() ) { + throw new IOException( "attempt to use matrices of unequal size: [" + getMatrixSize() + "] vs [" + + distance_matrix.getSize() + "]" ); + } + return distance_matrix; + } + + public static SymmetricalDistanceMatrixParser createInstance() { + return new SymmetricalDistanceMatrixParser(); + } + + public enum InputMatrixType { + UPPER_TRIANGLE, LOWER_TRIANGLE + } +} diff --git a/forester/java/src/org/forester/io/parsers/nexus/NexusBinaryStatesMatrixParser.java b/forester/java/src/org/forester/io/parsers/nexus/NexusBinaryStatesMatrixParser.java new file mode 100644 index 0000000..a9d9d3d --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/nexus/NexusBinaryStatesMatrixParser.java @@ -0,0 +1,167 @@ +// $Id: +// Exp $ +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2009-2010 Christian M. Zmasek +// Copyright (C) 2009-2010 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
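// Usage sketch for SymmetricalDistanceMatrixParser as defined above, on the lower-triangle
// example given in its class comment. Passing a File as the source is an assumption (the
// source object is handed to BasicTableParser unchanged); the file name is illustrative.

import java.io.File;
import java.io.IOException;

import org.forester.evoinference.matrix.distance.DistanceMatrix;
import org.forester.io.parsers.SymmetricalDistanceMatrixParser;

final class DistanceMatrixParseSketch {

    public static void main( final String[] args ) throws IOException {
        // distances.txt (lower triangle, zero diagonal), for example:
        //   id1 0
        //   id2 0.3 0
        //   id3 0.4 0.4 0
        final SymmetricalDistanceMatrixParser parser = SymmetricalDistanceMatrixParser.createInstance();
        parser.setInputMatrixType( SymmetricalDistanceMatrixParser.InputMatrixType.LOWER_TRIANGLE );
        final DistanceMatrix[] matrices = parser.parse( new File( "distances.txt" ) );
        System.out.println( matrices[ 0 ].getSize() ); // 3
    }
}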
com +// WWW: www.phylosoft.org/forester/ + +package org.forester.io.parsers.nexus; + +import java.io.BufferedReader; +import java.io.IOException; + +import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix; +import org.forester.evoinference.matrix.character.CharacterStateMatrix; +import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates; +import org.forester.io.parsers.util.ParserUtils; +import org.forester.io.parsers.util.PhylogenyParserException; + +public class NexusBinaryStatesMatrixParser { + + private Object _nexus_source; + private CharacterStateMatrix _matrix; + private int _nchar; + private int _ntax; + + public CharacterStateMatrix getMatrix() { + return _matrix; + } + + public int getNChar() { + return _nchar; + } + + private Object getNexusSource() { + return _nexus_source; + } + + public int getNTax() { + return _ntax; + } + + public void parse() throws IOException { + reset(); + final BufferedReader reader = ParserUtils.createReader( getNexusSource() ); + String line; + boolean in_matrix = false; + int identifier_index = 0; + int max_character_index = -1; + while ( ( line = reader.readLine() ) != null ) { + line = line.trim(); + if ( ( line.length() > 0 ) && !line.startsWith( "#" ) && !line.startsWith( ">" ) ) { + if ( line.toLowerCase().indexOf( NexusConstants.NCHAR.toLowerCase() ) >= 0 ) { + final int i = line.toLowerCase().indexOf( NexusConstants.NCHAR.toLowerCase() ); + String s = line.toLowerCase().substring( i + 6 ); + s = s.replace( ';', ' ' ).trim(); + setNChar( Integer.parseInt( s ) ); + } + else if ( line.toLowerCase().indexOf( NexusConstants.NTAX.toLowerCase() ) >= 0 ) { + final int i = line.toLowerCase().indexOf( NexusConstants.NTAX.toLowerCase() ); + String s = line.toLowerCase().substring( i + 5 ); + s = s.replace( ';', ' ' ).trim(); + setNTax( Integer.parseInt( s ) ); + } + else if ( line.toLowerCase().startsWith( NexusConstants.MATRIX.toLowerCase() ) ) { + in_matrix = true; + if ( getNTax() < 1 ) { + throw new NexusFormatException( "did not encounter " + NexusConstants.NTAX ); + } + if ( getNChar() < 1 ) { + throw new NexusFormatException( "did not encounter " + NexusConstants.NCHAR ); + } + if ( getMatrix() != null ) { + throw new NexusFormatException( "more than one matrix present" ); + } + setMatrix( new BasicCharacterStateMatrix( getNTax(), getNChar() ) ); + } + else if ( line.toLowerCase().startsWith( NexusConstants.END.toLowerCase() ) ) { + in_matrix = false; + } + else if ( in_matrix ) { + final String[] line_ary = line.split( "\\s+" ); + final String label = line_ary[ 0 ].trim(); + String states_str = line_ary[ 1 ].trim(); + if ( states_str.endsWith( ";" ) ) { + in_matrix = false; + states_str = states_str.substring( 0, states_str.length() - 1 ); + } + final char[] states = states_str.toCharArray(); + getMatrix().setIdentifier( identifier_index, label ); + int character_index = 0; + for( final char state : states ) { + if ( state == BinaryStates.PRESENT.toChar() ) { + try { + getMatrix().setState( identifier_index, character_index, BinaryStates.PRESENT ); + } + catch ( final ArrayIndexOutOfBoundsException ex ) { + throw new NexusFormatException( "problem at line " + line + " [" + ex + "]" ); + } + } + else if ( state == BinaryStates.ABSENT.toChar() ) { + try { + getMatrix().setState( identifier_index, character_index, BinaryStates.ABSENT ); + } + catch ( final ArrayIndexOutOfBoundsException ex ) { + throw new NexusFormatException( "problem at line " + line + " [" + ex + "]" ); + } + } + else { + throw new 
NexusFormatException( "illegal state " + state ); + } + ++character_index; + } + if ( ( max_character_index > 0 ) && ( max_character_index != character_index ) ) { + throw new NexusFormatException( "unequal number of characters at line " + line ); + } + max_character_index = character_index; + ++identifier_index; + } + } + } + } + + private void reset() { + setMatrix( null ); + setNChar( -1 ); + setNTax( -1 ); + } + + private void setMatrix( final CharacterStateMatrix matrix ) { + _matrix = matrix; + } + + private void setNChar( final int nchar ) { + _nchar = nchar; + } + + private void setNTax( final int ntax ) { + _ntax = ntax; + } + + public void setSource( final Object nexus_source ) throws PhylogenyParserException, IOException { + if ( nexus_source == null ) { + throw new PhylogenyParserException( getClass() + ": attempt to parse null object." ); + } + _nexus_source = nexus_source; + } +} diff --git a/forester/java/src/org/forester/io/parsers/nexus/NexusCharactersParser.java b/forester/java/src/org/forester/io/parsers/nexus/NexusCharactersParser.java new file mode 100644 index 0000000..5d0ed60 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/nexus/NexusCharactersParser.java @@ -0,0 +1,117 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
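// Usage sketch for NexusBinaryStatesMatrixParser as defined above, with the kind of NEXUS
// characters block it expects (NTax and NChar declared before the Matrix, one declaration
// per line, matching how the parser scans for them). Passing a File as the source is an
// assumption; the file name and content shown in the comment are illustrative.

import java.io.File;
import java.io.IOException;

import org.forester.io.parsers.nexus.NexusBinaryStatesMatrixParser;
import org.forester.io.parsers.util.PhylogenyParserException;

final class BinaryStatesMatrixSketch {

    public static void main( final String[] args ) throws IOException, PhylogenyParserException {
        // states.nex, for example:
        //   Begin Characters;
        //    Dimensions NTax=2;
        //    Dimensions NChar=4;
        //    Matrix
        //     taxon_a 0101
        //     taxon_b 1100;
        //   End;
        final NexusBinaryStatesMatrixParser parser = new NexusBinaryStatesMatrixParser();
        parser.setSource( new File( "states.nex" ) );
        parser.parse();
        System.out.println( parser.getNTax() + " taxa, " + parser.getNChar() + " characters" );
    }
}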
com +// WWW: www.phylosoft.org/ + +package org.forester.io.parsers.nexus; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.forester.io.parsers.util.ParserUtils; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.util.ForesterUtil; + +public class NexusCharactersParser { + + final private static String charstatelabels = NexusConstants.CHARSTATELABELS.toLowerCase(); + private Object _nexus_source; + private String[] _char_state_labels; + + public String[] getCharStateLabels() { + return _char_state_labels; + } + + private Object getNexusSource() { + return _nexus_source; + } + + public void parse() throws IOException { + reset(); + final BufferedReader reader = ParserUtils.createReader( getNexusSource() ); + String line; + boolean in_charstatelabels = false; + final List labels_list = new ArrayList(); + int counter = 1; + while ( ( line = reader.readLine() ) != null ) { + line = line.trim(); + if ( ( line.length() > 0 ) && !line.startsWith( "#" ) && !line.startsWith( ">" ) ) { + if ( line.toLowerCase().startsWith( charstatelabels ) ) { + in_charstatelabels = true; + } + else if ( in_charstatelabels ) { + String label = line; + if ( label.indexOf( ' ' ) > 0 ) { + final String[] s = label.split( "\\s+" ); + label = s[ 1 ]; + int count = -1; + try { + count = Integer.parseInt( s[ 0 ] ); + } + catch ( final NumberFormatException ex ) { + throw new NexusFormatException( "failed to parse character label number from: " + line ); + } + if ( count != counter ) { + throw new NexusFormatException( "character label numbers are not in order, current line: " + + line ); + } + } + ++counter; + label = label.replaceAll( "[\\s;\"',]+", "" ); + if ( !ForesterUtil.isEmpty( label ) ) { + if ( labels_list.contains( label ) ) { + throw new NexusFormatException( "character label [" + label + "] is not unique" ); + } + labels_list.add( label ); + } + } + if ( line.endsWith( ";" ) ) { + in_charstatelabels = false; + } + } + } + setCharStateLabels( new String[ labels_list.size() ] ); + int i = 0; + for( final String label : labels_list ) { + getCharStateLabels()[ i++ ] = label; + } + } + + private void reset() { + setCharStateLabels( new String[ 0 ] ); + } + + private void setCharStateLabels( final String[] char_state_labels ) { + _char_state_labels = char_state_labels; + } + + public void setSource( final Object nexus_source ) throws PhylogenyParserException, IOException { + if ( nexus_source == null ) { + throw new PhylogenyParserException( getClass() + ": attempt to parse null object." ); + } + _nexus_source = nexus_source; + } +} diff --git a/forester/java/src/org/forester/io/parsers/nexus/NexusConstants.java b/forester/java/src/org/forester/io/parsers/nexus/NexusConstants.java new file mode 100644 index 0000000..67512d7 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/nexus/NexusConstants.java @@ -0,0 +1,48 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. 
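// Usage sketch for NexusCharactersParser as defined above, reading character labels from a
// CharStateLabels block. Passing a File as the source is an assumption; the file name and
// content shown in the comment are illustrative.

import java.io.File;
import java.io.IOException;

import org.forester.io.parsers.nexus.NexusCharactersParser;
import org.forester.io.parsers.util.PhylogenyParserException;

final class CharStateLabelsSketch {

    public static void main( final String[] args ) throws IOException, PhylogenyParserException {
        // characters.nex, for example:
        //   CharStateLabels
        //    1 Ion_trans,
        //    2 PKD_channel;
        final NexusCharactersParser parser = new NexusCharactersParser();
        parser.setSource( new File( "characters.nex" ) );
        parser.parse();
        for( final String label : parser.getCharStateLabels() ) {
            System.out.println( label ); // Ion_trans, then PKD_channel
        }
    }
}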
+// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.nexus; + +public final class NexusConstants { + + public final static String NEXUS = "#NEXUS"; + public final static String BEGIN_TAXA = "Begin Taxa;"; + public final static String BEGIN_TREES = "Begin Trees;"; + public final static String TREE = "Tree"; + public final static String DIMENSIONS = "Dimensions"; + public final static String NTAX = "NTax"; + public final static String NCHAR = "NChar"; + public final static String TAXLABELS = "TaxLabels"; + public final static String CHARSTATELABELS = "CharStateLabels"; + public final static String END = "End;"; + public final static String MATRIX = "Matrix"; + public final static String BEGIN_CHARACTERS = "Begin Characters;"; + public final static String FORMAT = "Format"; + public final static String DATATYPE = "DataType"; + public final static String STANDARD = "Standard"; + public final static String SYMBOLS = "Symbols"; + public static final String TRANSLATE = "Translate"; + public static final String UTREE = "UTREE"; +} diff --git a/forester/java/src/org/forester/io/parsers/nexus/NexusFormatException.java b/forester/java/src/org/forester/io/parsers/nexus/NexusFormatException.java new file mode 100644 index 0000000..7e20bee --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/nexus/NexusFormatException.java @@ -0,0 +1,41 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.nexus; + +import java.io.IOException; + +public class NexusFormatException extends IOException { + + private static final long serialVersionUID = -8750474393398183410L; + + public NexusFormatException() { + super(); + } + + public NexusFormatException( final String message ) { + super( message ); + } +} \ No newline at end of file diff --git a/forester/java/src/org/forester/io/parsers/nexus/NexusPhylogeniesParser.java b/forester/java/src/org/forester/io/parsers/nexus/NexusPhylogeniesParser.java new file mode 100644 index 0000000..879a0e1 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/nexus/NexusPhylogeniesParser.java @@ -0,0 +1,338 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.nexus; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.forester.archaeopteryx.Constants; +import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.parsers.nhx.NHXFormatException; +import org.forester.io.parsers.nhx.NHXParser; +import org.forester.io.parsers.util.ParserUtils; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.ForesterUtil; + +public class NexusPhylogeniesParser implements PhylogenyParser { + + final private static String begin_trees = NexusConstants.BEGIN_TREES.toLowerCase(); + final private static String taxlabels = NexusConstants.TAXLABELS.toLowerCase(); + final private static String translate = NexusConstants.TRANSLATE.toLowerCase(); + final private static String tree = NexusConstants.TREE.toLowerCase(); + final private static String utree = NexusConstants.UTREE.toLowerCase(); + final private static String end = NexusConstants.END.toLowerCase(); + final private static String endblock = "endblock"; + final private static Pattern TREE_NAME_PATTERN = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+", + Pattern.CASE_INSENSITIVE ); + final private static Pattern ROOTEDNESS_PATTERN = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" ); + private Object _nexus_source; + private List _phylogenies; + private List _taxlabels; + private Map _translate_map; + private boolean _replace_underscores = NHXParser.REPLACE_UNDERSCORES_DEFAULT; + private boolean _ignore_quotes_in_nh_data = Constants.NH_PARSING_IGNORE_QUOTES_DEFAULT; + + private void createPhylogeny( final String name, + final StringBuffer nhx, + final boolean rooted_info_present, + final boolean is_rooted ) throws IOException { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final NHXParser pars = new NHXParser(); + pars.setTaxonomyExtraction( ForesterUtil.TAXONOMY_EXTRACTION.NO ); + pars.setReplaceUnderscores( isReplaceUnderscores() ); + pars.setIgnoreQuotes( isIgnoreQuotes() ); + if ( rooted_info_present ) { + pars.setGuessRootedness( false ); + } + final Phylogeny p = factory.create( nhx, pars )[ 0 ]; + p.setName( name ); + if ( rooted_info_present ) { + p.setRooted( is_rooted ); + } + if ( ( getTaxlabels().size() > 0 ) || ( getTranslateMap().size() > 0 ) ) { + final PhylogenyNodeIterator it = p.iteratorExternalForward(); + while ( it.hasNext() ) { + final PhylogenyNode node = it.next(); + if ( ( getTranslateMap().size() > 0 ) && getTranslateMap().containsKey( node.getName() ) ) { + node.setName( getTranslateMap().get( node.getName() ).replaceAll( "['\"]+", "" ) ); + } + else if ( getTaxlabels().size() > 0 ) { + int i = -1; + try { + i = Integer.parseInt( node.getName() ); + } + catch ( final NumberFormatException e ) { + // Ignore. 
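// Quick demonstration of the two regular expressions declared above: TREE_NAME_PATTERN
// captures the tree name between "tree"/"utree" and '=', and ROOTEDNESS_PATTERN picks up an
// optional [&R]/[&U] rootedness hint. The patterns are repeated locally because the fields
// are private; the tree line itself is illustrative.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

final class NexusTreeLineSketch {

    public static void main( final String[] args ) {
        final Pattern name_pattern = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+", Pattern.CASE_INSENSITIVE );
        final Pattern rooted_pattern = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" );
        final String line = "tree tol_156 = [&R] ((a:0.1,b:0.2):0.05,c:0.3);";
        final Matcher name_matcher = name_pattern.matcher( line );
        if ( name_matcher.matches() ) {
            System.out.println( "name: " + name_matcher.group( 1 ) ); // tol_156
        }
        final Matcher rooted_matcher = rooted_pattern.matcher( line );
        if ( rooted_matcher.matches() ) {
            System.out.println( "rooted hint: " + rooted_matcher.group( 1 ) ); // R
        }
    }
}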
+ } + if ( i > 0 ) { + node.setName( getTaxlabels().get( i - 1 ).replaceAll( "['\"]+", "" ) ); + } + } + } + } + getPhylogenies().add( p ); + } + + private Object getNexusSource() { + return _nexus_source; + } + + private List getPhylogenies() { + return _phylogenies; + } + + private Phylogeny[] getPhylogeniesAsArray() { + final Phylogeny[] p = new Phylogeny[ getPhylogenies().size() ]; + for( int i = 0; i < getPhylogenies().size(); ++i ) { + p[ i ] = getPhylogenies().get( i ); + } + return p; + } + + private List getTaxlabels() { + return _taxlabels; + } + + private Map getTranslateMap() { + return _translate_map; + } + + private boolean isIgnoreQuotes() { + return _ignore_quotes_in_nh_data; + } + + private boolean isReplaceUnderscores() { + return _replace_underscores; + } + + public Phylogeny[] parse() throws IOException, NHXFormatException { + reset(); + final BufferedReader reader = ParserUtils.createReader( getNexusSource() ); + String line; + String name = ""; + StringBuffer nhx = new StringBuffer(); + final StringBuffer translate_sb = new StringBuffer(); + boolean in_trees_block = false; + boolean in_taxalabels = false; + boolean in_translate = false; + final boolean in_comment = false; + boolean in_tree = false; + boolean rooted_info_present = false; + boolean is_rooted = false; + while ( ( line = reader.readLine() ) != null ) { + line = line.trim(); + if ( ( line.length() > 0 ) && !line.startsWith( "#" ) && !line.startsWith( ">" ) ) { + line = ForesterUtil.collapseWhiteSpace( line ); + line = removeWhiteSpaceBeforeSemicolon( line ); + final String line_lc = line.toLowerCase(); + if ( line_lc.startsWith( begin_trees ) ) { + in_trees_block = true; + in_taxalabels = false; + in_translate = false; + } + else if ( line_lc.startsWith( taxlabels ) ) { + in_trees_block = false; + in_taxalabels = true; + in_translate = false; + } + else if ( line_lc.startsWith( translate ) ) { + in_taxalabels = false; + in_translate = true; + } + else if ( in_trees_block ) { + //FIXME TODO need to work on this "title" and "link" + if ( line_lc.startsWith( "title" ) || line_lc.startsWith( "link" ) ) { + // Do nothing. 
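// Small standalone sketch of the tip-name resolution performed in createPhylogeny() above:
// a leaf label is replaced through the Translate map if one was read, otherwise a purely
// numeric label is treated as a 1-based index into the TaxLabels list. Names are illustrative.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

final class TipNameResolutionSketch {

    static String resolve( final String tip, final Map<String, String> translate, final List<String> taxlabels ) {
        if ( translate.containsKey( tip ) ) {
            return translate.get( tip );
        }
        try {
            final int i = Integer.parseInt( tip );
            if ( ( i > 0 ) && ( i <= taxlabels.size() ) ) {
                return taxlabels.get( i - 1 ); // TaxLabels indices are 1-based in the tree
            }
        }
        catch ( final NumberFormatException e ) {
            // not a number: keep the name as is
        }
        return tip;
    }

    public static void main( final String[] args ) {
        final Map<String, String> translate = new HashMap<String, String>();
        translate.put( "1", "Nematostella_vectensis" );
        final List<String> taxlabels = Arrays.asList( "Nematostella_vectensis", "Homo_sapiens" );
        System.out.println( resolve( "1", translate, taxlabels ) ); // Nematostella_vectensis
        System.out.println( resolve( "2", new HashMap<String, String>(), taxlabels ) ); // Homo_sapiens
    }
}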
+ } + else if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) { + in_trees_block = false; + in_tree = false; + in_translate = false; + if ( nhx.length() > 0 ) { + createPhylogeny( name, nhx, rooted_info_present, is_rooted ); + nhx = new StringBuffer(); + name = ""; + rooted_info_present = false; + is_rooted = false; + } + } + else if ( line_lc.startsWith( tree ) || ( line_lc.startsWith( utree ) ) ) { + if ( nhx.length() > 0 ) { + createPhylogeny( name, nhx, rooted_info_present, is_rooted ); + nhx = new StringBuffer(); + name = ""; + rooted_info_present = false; + is_rooted = false; + } + in_tree = true; + nhx.append( line.substring( line.indexOf( '=' ) ) ); + final Matcher name_matcher = TREE_NAME_PATTERN.matcher( line ); + if ( name_matcher.matches() ) { + name = name_matcher.group( 1 ); + name = name.replaceAll( "['\"]+", "" ); + } + final Matcher rootedness_matcher = ROOTEDNESS_PATTERN.matcher( line ); + if ( rootedness_matcher.matches() ) { + final String s = rootedness_matcher.group( 1 ); + line = line.replaceAll( "\\[\\&.\\]", "" ); + rooted_info_present = true; + if ( s.toUpperCase().equals( "R" ) ) { + is_rooted = true; + } + } + } + else if ( in_tree && !in_translate ) { + nhx.append( line ); + } + if ( !line_lc.startsWith( "title" ) && !line_lc.startsWith( "link" ) && !in_translate + && !line_lc.startsWith( end ) && !line_lc.startsWith( endblock ) && line_lc.endsWith( ";" ) ) { + in_tree = false; + in_translate = false; + createPhylogeny( name, nhx, rooted_info_present, is_rooted ); + nhx = new StringBuffer(); + name = ""; + rooted_info_present = false; + is_rooted = false; + } + } + if ( in_taxalabels ) { + if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) { + in_taxalabels = false; + } + else { + final String[] labels = line.split( "\\s+" ); + for( String label : labels ) { + if ( !label.toLowerCase().equals( taxlabels ) ) { + if ( label.endsWith( ";" ) ) { + in_taxalabels = false; + label = label.substring( 0, label.length() - 1 ); + } + if ( label.length() > 0 ) { + getTaxlabels().add( label ); + } + } + } + } + } + if ( in_translate ) { + if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) { + in_translate = false; + } + else { + translate_sb.append( " " ); + translate_sb.append( line.trim() ); + if ( line.endsWith( ";" ) ) { + in_translate = false; + setTranslateKeyValuePairs( translate_sb ); + } + } + } + } + } + if ( nhx.length() > 0 ) { + createPhylogeny( name, nhx, rooted_info_present, is_rooted ); + } + return getPhylogeniesAsArray(); + } + + private void reset() { + setPhylogenies( new ArrayList() ); + setTaxlabels( new ArrayList() ); + setTranslateMap( new HashMap() ); + } + + public void setIgnoreQuotes( final boolean ignore_quotes_in_nh_data ) { + _ignore_quotes_in_nh_data = ignore_quotes_in_nh_data; + } + + private void setPhylogenies( final ArrayList phylogenies ) { + _phylogenies = phylogenies; + } + + public void setReplaceUnderscores( final boolean replace_underscores ) { + _replace_underscores = replace_underscores; + } + + public void setSource( final Object nexus_source ) throws PhylogenyParserException, IOException { + if ( nexus_source == null ) { + throw new PhylogenyParserException( getClass() + ": attempt to parse null object." 
); + } + _nexus_source = nexus_source; + } + + private void setTaxlabels( final List taxlabels ) { + _taxlabels = taxlabels; + } + + private void setTranslateKeyValuePairs( final StringBuffer translate_sb ) throws IOException { + String s = translate_sb.toString().trim(); + if ( s.endsWith( ";" ) ) { + s = s.substring( 0, s.length() - 1 ).trim(); + } + for( final String pair : s.split( "," ) ) { + final String[] kv = pair.trim().split( "\\s+" ); + if ( ( kv.length < 2 ) || ( kv.length > 3 ) ) { + throw new IOException( "ill formatted translate values: " + translate_sb ); + } + if ( ( kv.length == 3 ) && !kv[ 0 ].toLowerCase().trim().equals( translate ) ) { + throw new IOException( "ill formatted translate values: " + translate_sb ); + } + String key = ""; + String value = ""; + if ( kv.length == 3 ) { + key = kv[ 1 ]; + value = kv[ 2 ]; + } + else { + key = kv[ 0 ]; + value = kv[ 1 ]; + } + if ( value.endsWith( ";" ) ) { + value = value.substring( 0, value.length() - 1 ); + } + getTranslateMap().put( key, value ); + } + } + + private void setTranslateMap( final Map translate_map ) { + _translate_map = translate_map; + } + + private static String removeWhiteSpaceBeforeSemicolon( final String s ) { + return s.replaceAll( "\\s+;", ";" ); + } +} diff --git a/forester/java/src/org/forester/io/parsers/nexus/PaupLogParser.java b/forester/java/src/org/forester/io/parsers/nexus/PaupLogParser.java new file mode 100644 index 0000000..c4243b4 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/nexus/PaupLogParser.java @@ -0,0 +1,128 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
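// Usage sketch for NexusPhylogeniesParser as completed above: point it at a NEXUS file with
// a Trees block and collect the resulting phylogenies. Passing a File as the source is an
// assumption; the file name and the content shown in the comment are illustrative.

import java.io.File;
import java.io.IOException;

import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
import org.forester.io.parsers.util.PhylogenyParserException;
import org.forester.phylogeny.Phylogeny;

final class NexusTreesParseSketch {

    public static void main( final String[] args ) throws IOException, PhylogenyParserException {
        // trees.nex, for example:
        //   Begin Trees;
        //    Translate
        //     1 Nematostella_vectensis,
        //     2 Homo_sapiens;
        //    Tree t1 = [&R] (1:0.1,2:0.2);
        //   End;
        final NexusPhylogeniesParser parser = new NexusPhylogeniesParser();
        parser.setSource( new File( "trees.nex" ) );
        parser.setReplaceUnderscores( true ); // option forwarded to the underlying NHXParser
        final Phylogeny[] phylogenies = parser.parse();
        System.out.println( phylogenies.length ); // 1
    }
}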
com +// WWW: www.phylosoft.org/ + +package org.forester.io.parsers.nexus; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix; +import org.forester.evoinference.matrix.character.CharacterStateMatrix; +import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates; +import org.forester.io.parsers.util.ParserUtils; +import org.forester.io.parsers.util.PhylogenyParserException; + +public class PaupLogParser { + + private static final String DATA_MATRIX_AND_RECONSTRUCTED_STATES_FOR_INTERNAL_NODES = "data matrix and reconstructed states for internal nodes"; + private Object _nexus_source; + + private Object getNexusSource() { + return _nexus_source; + } + + public CharacterStateMatrix parse() throws IOException { + final BufferedReader reader = ParserUtils.createReader( getNexusSource() ); + String line; + boolean saw_line = false; + int identifier_index = 0; + boolean first_block = true; + boolean saw_data_matrix_line = false; + final List identifiers = new ArrayList(); + final List> states = new ArrayList>(); + boolean done = false; + while ( ( ( line = reader.readLine() ) != null ) && !done ) { + line = line.trim(); + if ( ( line.length() > 0 ) && !line.startsWith( "#" ) && !line.startsWith( ">" ) ) { + if ( ( ( identifier_index > 0 ) && line.startsWith( "Tree " ) ) + || line.startsWith( "Character change list" ) ) { + done = true; + continue; + } + if ( line.toLowerCase().startsWith( DATA_MATRIX_AND_RECONSTRUCTED_STATES_FOR_INTERNAL_NODES ) ) { + saw_line = false; + saw_data_matrix_line = true; + identifier_index = 0; + if ( first_block && ( line.indexOf( "continued" ) > 0 ) ) { + first_block = false; + } + } + if ( saw_data_matrix_line && line.startsWith( "----------" ) ) { + saw_line = true; + } + else if ( saw_line && ( line.indexOf( ' ' ) > 0 ) ) { + final String[] s = line.split( "\\s+" ); + if ( s.length != 2 ) { + throw new NexusFormatException( "unexpected format at line: " + line ); + } + final String identifier = s[ 0 ]; + final String row = s[ 1 ]; + if ( first_block ) { + if ( identifiers.contains( identifier ) ) { + throw new NexusFormatException( "identifier [" + identifier + "] is not unique in line: " + + line ); + } + identifiers.add( identifier ); + states.add( new ArrayList() ); + } + else { + if ( !identifiers.contains( identifier ) ) { + throw new NexusFormatException( "new identifier [" + identifier + "] at line: " + line ); + } + } + for( int c = 0; c < row.length(); ++c ) { + final char ch = row.charAt( c ); + if ( ch == '0' ) { + states.get( identifier_index ).add( BinaryStates.ABSENT ); + } + else if ( ch == '1' ) { + states.get( identifier_index ).add( BinaryStates.PRESENT ); + } + else { + throw new NexusFormatException( "unknown character state [" + ch + "] at line: " + line ); + } + } + ++identifier_index; + } + } + } + final CharacterStateMatrix matrix = new BasicCharacterStateMatrix( states ); + int i = 0; + for( final String identifier : identifiers ) { + matrix.setIdentifier( i++, identifier ); + } + return matrix; + } + + public void setSource( final Object nexus_source ) throws PhylogenyParserException, IOException { + if ( nexus_source == null ) { + throw new PhylogenyParserException( getClass() + ": attempt to parse null object." 
); + } + _nexus_source = nexus_source; + } +} diff --git a/forester/java/src/org/forester/io/parsers/nhx/NHXFormatException.java b/forester/java/src/org/forester/io/parsers/nhx/NHXFormatException.java new file mode 100644 index 0000000..b4c18cf --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/nhx/NHXFormatException.java @@ -0,0 +1,41 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.nhx; + +import java.io.IOException; + +public class NHXFormatException extends IOException { + + private static final long serialVersionUID = 3756209394438250170L; + + public NHXFormatException() { + super(); + } + + public NHXFormatException( final String message ) { + super( message ); + } +} diff --git a/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java b/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java new file mode 100644 index 0000000..8fa5b29 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java @@ -0,0 +1,797 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
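// Usage sketch for the PAUP* log parser defined above: it recovers a binary character-state
// matrix from the "Data matrix and reconstructed states for internal nodes" section of a
// PAUP log. Passing a File as the source and the BinaryStates type argument are assumptions
// (the generics are not visible in this listing); the file name is illustrative.

import java.io.File;
import java.io.IOException;

import org.forester.evoinference.matrix.character.CharacterStateMatrix;
import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
import org.forester.io.parsers.nexus.PaupLogParser;
import org.forester.io.parsers.util.PhylogenyParserException;

final class PaupLogParseSketch {

    public static void main( final String[] args ) throws IOException, PhylogenyParserException {
        final PaupLogParser parser = new PaupLogParser();
        parser.setSource( new File( "paup.log" ) );
        final CharacterStateMatrix<BinaryStates> matrix = parser.parse();
        System.out.println( matrix );
    }
}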
com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.nhx; + +import java.awt.Color; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; +import java.util.StringTokenizer; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Accession; +import org.forester.phylogeny.data.Annotation; +import org.forester.phylogeny.data.DomainArchitecture; +import org.forester.phylogeny.data.Event; +import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.data.PropertiesMap; +import org.forester.phylogeny.data.Property; +import org.forester.phylogeny.data.Sequence; +import org.forester.phylogeny.data.Taxonomy; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.ForesterUtil; + +public final class NHXParser implements PhylogenyParser { + + public static final boolean LIMIT_SPECIES_NAMES_TO_FIVE_CHARS = true; + public static final ForesterUtil.TAXONOMY_EXTRACTION TAXONOMY_EXTRACTION_DEFAULT = ForesterUtil.TAXONOMY_EXTRACTION.NO; + final static private boolean GUESS_ROOTEDNESS_DEFAULT = true; + final static private boolean GUESS_IF_SUPPORT_VALUES = true; + final static private boolean IGNORE_QUOTES_DEFAULT = false; + final static public boolean REPLACE_UNDERSCORES_DEFAULT = false; + private boolean _saw_closing_paren; + final static private byte STRING = 0; + final static private byte STRING_BUFFER = 1; + final static private byte CHAR_ARRAY = 2; + final static private byte BUFFERED_READER = 3; + private boolean _guess_rootedness; + private boolean _has_next; + private boolean _ignore_quotes; + private byte _input_type; + private int _source_length; + private PhylogenyNode _current_node; + private StringBuilder _current_anotation; + private Object _nhx_source; + private int _clade_level; + private List _phylogenies; + private Phylogeny _current_phylogeny; + private ForesterUtil.TAXONOMY_EXTRACTION _taxonomy_extraction; + private boolean _replace_underscores; + public final static Pattern UC_LETTERS_NUMBERS_PATTERN = Pattern + .compile( "^[A-Z0-9]+$" ); + public final static Pattern NUMBERS_ONLY_PATTERN = Pattern + .compile( "^[0-9]+$" ); + + public NHXParser() { + init(); + } + + /** + * Decreases the clade level by one. + * + * @throws PhylogenyParserException + * if level goes below zero. + */ + private void decreaseCladeLevel() throws PhylogenyParserException { + if ( getCladeLevel() < 0 ) { + throw new PhylogenyParserException( "error in NH (Newick)/NHX formatted data: most likely cause: number of close parens is larger than number of open parens" ); + } + --_clade_level; + } + + /** + * Finishes the current Phylogeny and adds it to the list of Phylogenies + * created. 
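// The clade level is the parser's running parenthesis depth. Minimal standalone sketch of the
// invariant it enforces (quoted labels and bracketed comments, which the real parser skips,
// are ignored here):
//
//     static boolean parensBalanced( final String newick ) {
//         int depth = 0;
//         for( final char c : newick.toCharArray() ) {
//             if ( c == '(' ) {
//                 ++depth;
//             }
//             else if ( ( c == ')' ) && ( --depth < 0 ) ) {
//                 return false; // more ')' than '(' seen so far -> PhylogenyParserException
//             }
//         }
//         return depth == 0; // non-zero depth at the end also triggers a PhylogenyParserException
//     }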
+ * + * @throws PhylogenyParserException + * @throws NHXFormatException + */ + private void finishPhylogeny() throws PhylogenyParserException, NHXFormatException { + setCladeLevel( 0 ); + if ( getCurrentPhylogeny() != null ) { + parseNHX( getCurrentAnotation().toString(), + getCurrentPhylogeny().getRoot(), + getTaxonomyExtraction(), + isReplaceUnderscores() ); + if ( NHXParser.GUESS_IF_SUPPORT_VALUES ) { + if ( NHXParser.isBranchLengthsLikeBootstrapValues( getCurrentPhylogeny() ) ) { + NHXParser.moveBranchLengthsToBootstrapValues( getCurrentPhylogeny() ); + } + } + if ( isGuessRootedness() ) { + final PhylogenyNode root = getCurrentPhylogeny().getRoot(); + if ( ( root.getDistanceToParent() >= 0.0 ) || !ForesterUtil.isEmpty( root.getName() ) + || !ForesterUtil.isEmpty( PhylogenyMethods.getSpecies( root ) ) || root.isHasAssignedEvent() ) { + getCurrentPhylogeny().setRooted( true ); + } + } + getPhylogenies().add( getCurrentPhylogeny() ); + } + } + + private void finishSingleNodePhylogeny() throws PhylogenyParserException, NHXFormatException { + setCladeLevel( 0 ); + final PhylogenyNode new_node = new PhylogenyNode(); + parseNHX( getCurrentAnotation().toString(), new_node, getTaxonomyExtraction(), isReplaceUnderscores() ); + setCurrentPhylogeny( new Phylogeny() ); + getCurrentPhylogeny().setRoot( new_node ); + getPhylogenies().add( getCurrentPhylogeny() ); + } + + private int getCladeLevel() { + return _clade_level; + } + + private StringBuilder getCurrentAnotation() { + return _current_anotation; + } + + private PhylogenyNode getCurrentNode() { + return _current_node; + } + + private Phylogeny getCurrentPhylogeny() { + return _current_phylogeny; + } + + private byte getInputType() { + return _input_type; + } + + private Object getNhxSource() { + return _nhx_source; + } + + private List getPhylogenies() { + return _phylogenies; + } + + /** + * Returns the Phylogenies created as Array. + * + * @return the Phylogenies created as Array + */ + private Phylogeny[] getPhylogeniesAsArray() { + final Phylogeny[] p = new Phylogeny[ getPhylogenies().size() ]; + for( int i = 0; i < getPhylogenies().size(); ++i ) { + p[ i ] = getPhylogenies().get( i ); + } + return p; + } + + private int getSourceLength() { + return _source_length; + } + + public ForesterUtil.TAXONOMY_EXTRACTION getTaxonomyExtraction() { + return _taxonomy_extraction; + } + + public boolean hasNext() { + return _has_next; + } + + /** + * Increases the clade level by one. + */ + private void increaseCladeLevel() { + ++_clade_level; + } + + private void init() { + setTaxonomyExtraction( TAXONOMY_EXTRACTION_DEFAULT ); + setReplaceUnderscores( REPLACE_UNDERSCORES_DEFAULT ); + setGuessRootedness( GUESS_ROOTEDNESS_DEFAULT ); + setIgnoreQuotes( IGNORE_QUOTES_DEFAULT ); + setHasNext( false ); + } + + private boolean isGuessRootedness() { + return _guess_rootedness; + } + + private boolean isIgnoreQuotes() { + return _ignore_quotes; + } + + private boolean isReplaceUnderscores() { + return _replace_underscores; + } + + private boolean isSawClosingParen() { + return _saw_closing_paren; + } + + /** + * Replaces the current annotation with a new StringBuffer. + */ + private void newCurrentAnotation() { + setCurrentAnotation( new StringBuilder() ); + } + + /** + * Parses the source set with setSource( final Object nhx_source ). Returns + * the Phylogenies found in the source as Phylogeny[]. + * Everything between [ and ] is considered comment and ignored, + * unless: + * "[&&NHX... 
]" + * or + * ":digits and/or.[bootstrap]" + * + * @see #setSource( final Object nhx_source ) + * @see org.forester.io.parsers.PhylogenyParser#parse() + * @return Phylogeny[] + * @throws IOException + * @throws NHXFormatException + * @throws PhylogenyParserException + */ + public Phylogeny[] parse() throws IOException, NHXFormatException { + setHasNext( false ); + boolean in_comment = false; + boolean saw_colon = false; + boolean saw_open_bracket = false; + boolean in_double_quote = false; + boolean in_single_quote = false; + setPhylogenies( new ArrayList() ); + setCladeLevel( 0 ); + newCurrentAnotation(); + int i = 0; + while ( true ) { + char c = '\b'; + if ( getInputType() == NHXParser.BUFFERED_READER ) { + final int ci = ( ( BufferedReader ) getNhxSource() ).read(); + if ( ci >= 0 ) { + c = ( char ) ci; + } + else { + break; + } + } + else { + if ( i >= getSourceLength() ) { + break; + } + else { + switch ( getInputType() ) { + case STRING: + c = ( ( String ) getNhxSource() ).charAt( i ); + break; + case STRING_BUFFER: + c = ( ( StringBuffer ) getNhxSource() ).charAt( i ); + break; + case CHAR_ARRAY: + c = ( ( char[] ) getNhxSource() )[ i ]; + break; + } + } + } + if ( !in_single_quote && !in_double_quote ) { + if ( c == ':' ) { + saw_colon = true; + } + else if ( !( ( c < 33 ) || ( c > 126 ) ) && saw_colon + && ( ( c != '[' ) && ( c != '.' ) && ( ( c < 48 ) || ( c > 57 ) ) ) ) { + saw_colon = false; + } + } + // \n\t is always ignored, + // as is " (34) and ' (39) (space is 32): + if ( ( isIgnoreQuotes() && ( ( c < 33 ) || ( c > 126 ) || ( c == 34 ) || ( c == 39 ) || ( ( getCladeLevel() == 0 ) && ( c == ';' ) ) ) ) + || ( !isIgnoreQuotes() && ( ( c < 32 ) || ( c > 126 ) || ( ( getCladeLevel() == 0 ) && ( c == ';' ) ) ) ) ) { + // Do nothing. + } + else if ( ( c == 32 ) && ( !in_single_quote && !in_double_quote ) ) { + // Do nothing. + } + else if ( in_comment ) { + if ( c == ']' ) { + in_comment = false; + } + } + else if ( in_double_quote ) { + if ( c == '"' ) { + in_double_quote = false; + } + else { + getCurrentAnotation().append( c ); + } + } + else if ( c == '"' ) { + in_double_quote = true; + } + else if ( in_single_quote ) { + if ( c == 39 ) { + in_single_quote = false; + } + else { + getCurrentAnotation().append( c ); + } + } + else if ( c == 39 ) { + in_single_quote = true; + } + else if ( c == '[' ) { + saw_open_bracket = true; + } + else if ( saw_open_bracket ) { + if ( c != ']' ) { + // everything not starting with "[&" is considered a comment + // unless ":digits and/or . 
[bootstrap]": + if ( c == '&' ) { + getCurrentAnotation().append( "[&" ); + } + else if ( saw_colon ) { + getCurrentAnotation().append( "[" + c ); + } + else { + in_comment = true; + } + } + // comment consisting just of "[]": + saw_open_bracket = false; + } + else if ( c == '(' ) { + processOpenParen(); + } + else if ( c == ')' ) { + processCloseParen(); + } + else if ( c == ',' ) { + processComma(); + } + else { + getCurrentAnotation().append( c ); + } + ++i; + } + if ( getCladeLevel() != 0 ) { + setPhylogenies( null ); + throw new PhylogenyParserException( "error in NH (Newick)/NHX formatted data: most likely cause: number of open parens does not equal number of close parens" ); + } + if ( getCurrentPhylogeny() != null ) { + finishPhylogeny(); + } + else if ( getCurrentAnotation().length() > 0 ) { + finishSingleNodePhylogeny(); + } + else if ( getPhylogenies().size() < 1 ) { + getPhylogenies().add( new Phylogeny() ); + } + return getPhylogeniesAsArray(); + } // parse() + + public Phylogeny parseNext() throws IOException, NHXFormatException { + return null; + } + + /** + * Called if a closing paren is encountered. + * + * @throws PhylogenyParserException + * @throws NHXFormatException + */ + private void processCloseParen() throws PhylogenyParserException, NHXFormatException { + decreaseCladeLevel(); + if ( !isSawClosingParen() ) { + final PhylogenyNode new_node = new PhylogenyNode(); + parseNHX( getCurrentAnotation().toString(), new_node, getTaxonomyExtraction(), isReplaceUnderscores() ); + newCurrentAnotation(); + getCurrentNode().addAsChild( new_node ); + } + else { + parseNHX( getCurrentAnotation().toString(), + getCurrentNode().getLastChildNode(), + getTaxonomyExtraction(), + isReplaceUnderscores() ); + newCurrentAnotation(); + } + if ( !getCurrentNode().isRoot() ) { + setCurrentNode( getCurrentNode().getParent() ); + } + setSawClosingParen( true ); + } // processCloseParen() + + /** + * Called if a comma is encountered. + * + * @throws PhylogenyParserException + * @throws NHXFormatException + */ + private void processComma() throws PhylogenyParserException, NHXFormatException { + if ( !isSawClosingParen() ) { + final PhylogenyNode new_node = new PhylogenyNode(); + parseNHX( getCurrentAnotation().toString(), new_node, getTaxonomyExtraction(), isReplaceUnderscores() ); + if ( getCurrentNode() == null ) { + throw new NHXFormatException( "format might not be NH or NHX" ); + } + getCurrentNode().addAsChild( new_node ); + } + else { + parseNHX( getCurrentAnotation().toString(), + getCurrentNode().getLastChildNode(), + getTaxonomyExtraction(), + isReplaceUnderscores() ); + } + newCurrentAnotation(); + setSawClosingParen( false ); + } // processComma() + + /** + * Called if a opening paren is encountered. + * + * @throws PhylogenyParserException + * @throws NHXFormatException + */ + private void processOpenParen() throws PhylogenyParserException, NHXFormatException { + final PhylogenyNode new_node = new PhylogenyNode(); + if ( getCladeLevel() == 0 ) { + if ( getCurrentPhylogeny() != null ) { + finishPhylogeny(); + } + setCladeLevel( 1 ); + newCurrentAnotation(); + setCurrentPhylogeny( new Phylogeny() ); + getCurrentPhylogeny().setRoot( new_node ); + } + else { + increaseCladeLevel(); + getCurrentNode().addAsChild( new_node ); + } + setCurrentNode( new_node ); + setSawClosingParen( false ); + } + + private void setCladeLevel( final int clade_level ) { + if ( clade_level < 0 ) { + throw new IllegalArgumentException( "Attempt to set clade level to a number smaller than zero." 
); + } + _clade_level = clade_level; + } + + private void setCurrentAnotation( final StringBuilder current_anotation ) { + _current_anotation = current_anotation; + } + + private void setCurrentNode( final PhylogenyNode current_node ) { + _current_node = current_node; + } + + private void setCurrentPhylogeny( final Phylogeny current_phylogeny ) { + _current_phylogeny = current_phylogeny; + } + + public void setGuessRootedness( final boolean guess_rootedness ) { + _guess_rootedness = guess_rootedness; + } + + private void setHasNext( final boolean has_next ) { + _has_next = has_next; + } + + public void setIgnoreQuotes( final boolean ignore_quotes ) { + _ignore_quotes = ignore_quotes; + } + + private void setInputType( final byte input_type ) { + _input_type = input_type; + } + + private void setNhxSource( final Object nhx_source ) { + _nhx_source = nhx_source; + } + + private void setPhylogenies( final ArrayList phylogenies ) { + _phylogenies = phylogenies; + } + + public void setReplaceUnderscores( final boolean replace_underscores ) { + _replace_underscores = replace_underscores; + } + + private void setSawClosingParen( final boolean saw_closing_paren ) { + _saw_closing_paren = saw_closing_paren; + } + + /** + * This sets the source to be parsed. The source can be: String, + * StringBuffer, char[], File, or InputStream. The source can contain more + * than one phylogenies in either New Hamphshire (NH) or New Hamphshire + * Extended (NHX) format. There is no need to separate phylogenies with any + * special character. White space is always ignored, as are semicolons + * inbetween phylogenies. Example of a source describing two phylogenies + * (source is a String, in this example): "(A,(B,(C,(D,E)de)cde)bcde)abcde + * ((((A,B)ab,C)abc,D)abcd,E)abcde". Everything between a '[' followed by any + * character other than '&' and ']' is considered a comment and ignored + * (example: "[this is a comment]"). NHX tags are surrounded by '[&&NHX' and + * ']' (example: "[&&NHX:S=Varanus_storri]"). A sequence like "[& some + * info]" is ignored, too (at the PhylogenyNode level, though). + * Exception: numbers only between [ and ] (e.g. [90]) are interpreted as support values. + * + * @see #parse() + * @see org.forester.io.parsers.PhylogenyParser#setSource(java.lang.Object) + * @param nhx_source + * the source to be parsed (String, StringBuffer, char[], File, + * or InputStream) + * @throws IOException + * @throws PhylogenyParserException + */ + public void setSource( final Object nhx_source ) throws PhylogenyParserException, IOException { + if ( nhx_source == null ) { + throw new PhylogenyParserException( getClass() + ": attempt to parse null object." 
); + } + else if ( nhx_source instanceof String ) { + setInputType( NHXParser.STRING ); + setSourceLength( ( ( String ) nhx_source ).length() ); + setNhxSource( nhx_source ); + } + else if ( nhx_source instanceof StringBuffer ) { + setInputType( NHXParser.STRING_BUFFER ); + setSourceLength( ( ( StringBuffer ) nhx_source ).length() ); + setNhxSource( nhx_source ); + } + else if ( nhx_source instanceof char[] ) { + setInputType( NHXParser.CHAR_ARRAY ); + setSourceLength( ( ( char[] ) nhx_source ).length ); + setNhxSource( nhx_source ); + } + else if ( nhx_source instanceof File ) { + setInputType( NHXParser.BUFFERED_READER ); + setSourceLength( 0 ); + final File f = ( File ) nhx_source; + final String error = ForesterUtil.isReadableFile( f ); + if ( !ForesterUtil.isEmpty( error ) ) { + throw new PhylogenyParserException( error ); + } + setNhxSource( new BufferedReader( new FileReader( f ) ) ); + } + else if ( nhx_source instanceof InputStream ) { + setInputType( NHXParser.BUFFERED_READER ); + setSourceLength( 0 ); + final InputStreamReader isr = new InputStreamReader( ( InputStream ) nhx_source ); + setNhxSource( new BufferedReader( isr ) ); + } + else { + throw new IllegalArgumentException( getClass() + " can only parse objects of type String," + + " StringBuffer, char[], File," + " or InputStream " + " [attempt to parse object of " + + nhx_source.getClass() + "]." ); + } + setHasNext( true ); + } + + private void setSourceLength( final int source_length ) { + _source_length = source_length; + } + + public void setTaxonomyExtraction( final ForesterUtil.TAXONOMY_EXTRACTION taxonomy_extraction ) { + _taxonomy_extraction = taxonomy_extraction; + } + + private static double doubleValue( final String str ) throws NHXFormatException { + try { + return Double.valueOf( str ).doubleValue(); + } + catch ( final NumberFormatException ex ) { + throw new NHXFormatException( "error in NH/NHX formatted data: failed to parse number from :" + "\"" + str + + "\"" ); + } + } + + private static boolean isBranchLengthsLikeBootstrapValues( final Phylogeny p ) { + final PhylogenyNodeIterator it = p.iteratorExternalForward(); + final double d0 = it.next().getDistanceToParent(); + if ( ( d0 < 10 ) || !it.hasNext() ) { + return false; + } + while ( it.hasNext() ) { + final double d = it.next().getDistanceToParent(); + if ( ( d != d0 ) || ( d < 10 ) ) { + return false; + } + } + return true; + } + + private static void moveBranchLengthsToBootstrapValues( final Phylogeny p ) { + final PhylogenyNodeIterator it = p.iteratorPostorder(); + while ( it.hasNext() ) { + final PhylogenyNode n = it.next(); + PhylogenyMethods.setBootstrapConfidence( n, n.getDistanceToParent() ); + n.setDistanceToParent( PhylogenyNode.DISTANCE_DEFAULT ); + } + } + + public static void parseNHX( String s, + final PhylogenyNode node_to_annotate, + final ForesterUtil.TAXONOMY_EXTRACTION taxonomy_extraction, + final boolean replace_underscores ) throws NHXFormatException { + if ( ( taxonomy_extraction != ForesterUtil.TAXONOMY_EXTRACTION.NO ) && replace_underscores ) { + throw new IllegalArgumentException( "cannot extract taxonomies and replace under scores at the same time" ); + } + if ( ( s != null ) && ( s.length() > 0 ) ) { + if ( replace_underscores ) { + s = s.replaceAll( "_+", " " ); + } + int ob = 0; + int cb = 0; + String a = ""; + String b = ""; + StringTokenizer t = null; + boolean is_nhx = false; + ob = s.indexOf( "[" ); + cb = s.indexOf( "]" ); + if ( ob > -1 ) { + a = ""; + b = ""; + is_nhx = true; + if ( cb < 0 ) { + throw new 
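// Usage sketch (hypothetical file name): one parser instance can be re-pointed at different
// source types via setSource() before each parse().
//
//     final NHXParser nhx_parser = new NHXParser();
//     nhx_parser.setSource( "((A,B)ab,C);((D,E),F);" );    // String containing two trees
//     final Phylogeny[] from_string = nhx_parser.parse();
//     nhx_parser.setSource( new File( "trees.nhx" ) );     // File, read via a BufferedReader
//     final Phylogeny[] from_file = nhx_parser.parse();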
NHXFormatException( "error in NHX formatted data: no closing \"]\"" ); + } + if ( s.indexOf( "&&NHX" ) == ( ob + 1 ) ) { + b = s.substring( ob + 6, cb ); + } + else { + // No &&NHX and digits only: is likely to be a support value. + final String bracketed = s.substring( ob + 1, cb ); + final Matcher numbers_only = NUMBERS_ONLY_PATTERN.matcher( bracketed ); + if ( numbers_only.matches() ) { + b = ":" + NHXtags.SUPPORT + bracketed; + } + } + a = s.substring( 0, ob ); + s = a + b; + if ( ( s.indexOf( "[" ) > -1 ) || ( s.indexOf( "]" ) > -1 ) ) { + throw new NHXFormatException( "error in NHX formatted data: more than one \"]\" or \"[\"" ); + } + } + t = new StringTokenizer( s, ":" ); + if ( t.countTokens() >= 1 ) { + if ( !s.startsWith( ":" ) ) { + node_to_annotate.setName( t.nextToken() ); + if ( !replace_underscores + && ( !is_nhx && ( taxonomy_extraction != ForesterUtil.TAXONOMY_EXTRACTION.NO ) ) ) { + final String tax = ForesterUtil + .extractTaxonomyCodeFromNodeName( node_to_annotate.getName(), + LIMIT_SPECIES_NAMES_TO_FIVE_CHARS, + taxonomy_extraction ); + if ( !ForesterUtil.isEmpty( tax ) ) { + if ( !node_to_annotate.getNodeData().isHasTaxonomy() ) { + node_to_annotate.getNodeData().setTaxonomy( new Taxonomy() ); + } + node_to_annotate.getNodeData().getTaxonomy().setTaxonomyCode( tax ); + } + } + } + while ( t.hasMoreTokens() ) { + s = t.nextToken(); + if ( s.startsWith( org.forester.io.parsers.nhx.NHXtags.SPECIES_NAME ) ) { + if ( !node_to_annotate.getNodeData().isHasTaxonomy() ) { + node_to_annotate.getNodeData().setTaxonomy( new Taxonomy() ); + } + node_to_annotate.getNodeData().getTaxonomy().setScientificName( s.substring( 2 ) ); + } + else if ( s.startsWith( org.forester.io.parsers.nhx.NHXtags.ANNOTATION ) ) { + if ( !node_to_annotate.getNodeData().isHasSequence() ) { + node_to_annotate.getNodeData().setSequence( new Sequence() ); + } + final Annotation annotation = new Annotation( "_:_" ); + annotation.setDesc( s.substring( 3 ) ); + node_to_annotate.getNodeData().getSequence().addAnnotation( annotation ); + } + else if ( s.startsWith( org.forester.io.parsers.nhx.NHXtags.IS_DUPLICATION ) ) { + if ( ( s.charAt( 2 ) == 'Y' ) || ( s.charAt( 2 ) == 'T' ) ) { + node_to_annotate.getNodeData().setEvent( Event.createSingleDuplicationEvent() ); + } + else if ( ( s.charAt( 2 ) == 'N' ) || ( s.charAt( 2 ) == 'F' ) ) { + node_to_annotate.getNodeData().setEvent( Event.createSingleSpeciationEvent() ); + } + else if ( s.charAt( 2 ) == '?' ) { + node_to_annotate.getNodeData().setEvent( Event.createSingleSpeciationOrDuplicationEvent() ); + } + else { + throw new NHXFormatException( "error in NHX formatted data: :D=Y or :D=N or :D=?" 
); + } + } + else if ( s.startsWith( NHXtags.SUPPORT ) ) { + PhylogenyMethods.setConfidence( node_to_annotate, doubleValue( s.substring( 2 ) ) ); + } + else if ( s.startsWith( NHXtags.TAXONOMY_ID ) ) { + if ( !node_to_annotate.getNodeData().isHasTaxonomy() ) { + node_to_annotate.getNodeData().setTaxonomy( new Taxonomy() ); + } + node_to_annotate.getNodeData().getTaxonomy().setIdentifier( new Identifier( s.substring( 2 ) ) ); + } + else if ( s.startsWith( NHXtags.PARENT_BRANCH_WIDTH ) ) { + PhylogenyMethods.setBranchWidthValue( node_to_annotate, Integer.parseInt( s.substring( 2 ) ) ); + } + else if ( s.startsWith( NHXtags.COLOR ) ) { + final Color c = NHXParser.stringToColor( s.substring( 2 ) ); + if ( c != null ) { + PhylogenyMethods.setBranchColorValue( node_to_annotate, c ); + } + } + else if ( s.startsWith( NHXtags.CUSTOM_DATA_ON_NODE ) ) { + if ( !node_to_annotate.getNodeData().isHasProperties() ) { + node_to_annotate.getNodeData().setProperties( new PropertiesMap() ); + } + node_to_annotate.getNodeData().getProperties().addProperty( Property.createFromNhxString( s ) ); + } + else if ( s.startsWith( NHXtags.DOMAIN_STRUCTURE ) ) { + if ( !node_to_annotate.getNodeData().isHasSequence() ) { + node_to_annotate.getNodeData().setSequence( new Sequence() ); + } + node_to_annotate.getNodeData().getSequence().setDomainArchitecture( new DomainArchitecture( s + .substring( 3 ) ) ); + } + else if ( s.startsWith( NHXtags.NODE_IDENTIFIER ) ) { + node_to_annotate.getNodeData().setNodeIdentifier( new Identifier( s.substring( 3 ) ) ); + } + else if ( s.startsWith( NHXtags.SEQUENCE_ACCESSION ) ) { + if ( !node_to_annotate.getNodeData().isHasSequence() ) { + node_to_annotate.getNodeData().setSequence( new Sequence() ); + } + node_to_annotate.getNodeData().getSequence() + .setAccession( new Accession( s.substring( 3 ), "?" ) ); + } + else if ( s.startsWith( NHXtags.GENE_NAME ) ) { + if ( !node_to_annotate.getNodeData().isHasSequence() ) { + node_to_annotate.getNodeData().setSequence( new Sequence() ); + } + node_to_annotate.getNodeData().getSequence().setName( s.substring( 3 ) ); + } + else if ( s.startsWith( NHXtags.GENE_NAME_SYNONYM ) ) { + if ( !node_to_annotate.getNodeData().isHasSequence() ) { + node_to_annotate.getNodeData().setSequence( new Sequence() ); + } + node_to_annotate.getNodeData().getSequence().setName( s.substring( 2 ) ); + } + else if ( s.indexOf( '=' ) < 0 ) { + if ( node_to_annotate.getDistanceToParent() != PhylogenyNode.DISTANCE_DEFAULT ) { + throw new NHXFormatException( "error in NHX formatted data: more than one distance to parent:" + + "\"" + s + "\"" ); + } + node_to_annotate.setDistanceToParent( doubleValue( s ) ); + } + } // while ( t.hasMoreTokens() ) + } + } + } + + /** + * Parses String s in the format r.g.b (e.g. "12.34.234" ) into red, green, + * and blue and returns the corresponding Color. + */ + private static Color stringToColor( final String s ) { + final StringTokenizer st = new StringTokenizer( s, "." 
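// Sketch of a fully annotated NHX label and the node data populated above (values made up):
//
//     final PhylogenyNode node = new PhylogenyNode();
//     NHXParser.parseNHX( "ADH1:0.32[&&NHX:S=Saccharomyces cerevisiae:D=N:B=95:T=4932]",
//                         node,
//                         ForesterUtil.TAXONOMY_EXTRACTION.NO,
//                         false );
//
// Result: name "ADH1", distance to parent 0.32, a speciation event (D=N), confidence 95 (B=),
// and a Taxonomy carrying the scientific name (S=) and the identifier "4932" (T=).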
); + if ( st.countTokens() != 3 ) { + throw new IllegalArgumentException( "illegal format for color: " + s ); + } + final int red = ForesterUtil.limitRangeForColor( Integer.parseInt( st.nextToken() ) ); + final int green = ForesterUtil.limitRangeForColor( Integer.parseInt( st.nextToken() ) ); + final int blu = ForesterUtil.limitRangeForColor( Integer.parseInt( st.nextToken() ) ); + return new Color( red, green, blu ); + } +} diff --git a/forester/java/src/org/forester/io/parsers/nhx/NHXtags.java b/forester/java/src/org/forester/io/parsers/nhx/NHXtags.java new file mode 100644 index 0000000..ae884ab --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/nhx/NHXtags.java @@ -0,0 +1,55 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.nhx; + +public final class NHXtags { + + public static final String CUSTOM_DATA_ON_NODE = "XN="; + public static final String COLOR = "C="; + public static final String PARENT_BRANCH_WIDTH = "W="; + public static final String SUBTREE_NEIGHBORS = "SNn="; + public static final String SUPER_ORTHOLOGOUS = "SOn="; + public static final String ORTHOLOGOUS = "On="; + public static final String TAXONOMY_ID = "T="; + public static final String SUPPORT = "B="; + public static final String IS_DUPLICATION = "D="; + public static final String ANNOTATION = "AN="; //TODO fix on website NHXv2 + public static final String SPECIES_NAME = "S="; + public static final String DOMAIN_STRUCTURE = "DS="; + public static final String GENE_NAME = "GN="; + public static final String GENE_NAME_SYNONYM = "G="; + public static final String SEQUENCE_ACCESSION = "AC="; + public static final String NODE_IDENTIFIER = "ID="; //TODO fix on website NHXv2 + public static final Object BRANCH_WIDTH = "W="; + @Deprecated + public static final String BINARY_DOMAIN_COMBINATIONS = "GDC="; + @Deprecated + public static final String DOMAINS_SEPARATOR = "\\|"; + @Deprecated + public static final String DOMAINS = "GD="; + @Deprecated + public static final String EC_NUMBER = "E="; +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlDataFormatException.java b/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlDataFormatException.java new file mode 100644 index 0000000..bb80183 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlDataFormatException.java @@ -0,0 +1,40 @@ +// $Id: +// $ +// FORESTER -- software libraries and applications +// for evolutionary biology research and 
applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml; + +public class PhyloXmlDataFormatException extends PhyloXmlException { + + private static final long serialVersionUID = 3756209394438250170L; + + public PhyloXmlDataFormatException() { + super(); + } + + public PhyloXmlDataFormatException( final String message ) { + super( message ); + } +} \ No newline at end of file diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlException.java b/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlException.java new file mode 100644 index 0000000..6c82fa1 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlException.java @@ -0,0 +1,39 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml; + +public class PhyloXmlException extends RuntimeException { + + private static final long serialVersionUID = 3756209394438250170L; + + public PhyloXmlException() { + super(); + } + + public PhyloXmlException( final String message ) { + super( message ); + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlHandler.java b/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlHandler.java new file mode 100644 index 0000000..6c013f6 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlHandler.java @@ -0,0 +1,454 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. 
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.forester.io.parsers.phyloxml.data.BinaryCharactersParser; +import org.forester.io.parsers.phyloxml.data.BranchWidthParser; +import org.forester.io.parsers.phyloxml.data.ColorParser; +import org.forester.io.parsers.phyloxml.data.ConfidenceParser; +import org.forester.io.parsers.phyloxml.data.DateParser; +import org.forester.io.parsers.phyloxml.data.DistributionParser; +import org.forester.io.parsers.phyloxml.data.EventParser; +import org.forester.io.parsers.phyloxml.data.IdentifierParser; +import org.forester.io.parsers.phyloxml.data.PropertyParser; +import org.forester.io.parsers.phyloxml.data.ReferenceParser; +import org.forester.io.parsers.phyloxml.data.SequenceParser; +import org.forester.io.parsers.phyloxml.data.SequenceRelationParser; +import org.forester.io.parsers.phyloxml.data.TaxonomyParser; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.BinaryCharacters; +import org.forester.phylogeny.data.BranchColor; +import org.forester.phylogeny.data.BranchWidth; +import org.forester.phylogeny.data.Confidence; +import org.forester.phylogeny.data.Date; +import org.forester.phylogeny.data.Distribution; +import org.forester.phylogeny.data.Event; +import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.data.PropertiesMap; +import org.forester.phylogeny.data.Property; +import org.forester.phylogeny.data.Reference; +import org.forester.phylogeny.data.Sequence; +import org.forester.phylogeny.data.SequenceRelation; +import org.forester.phylogeny.data.Taxonomy; +import org.forester.phylogeny.data.SequenceRelation.SEQUENCE_RELATION_TYPE; +import org.forester.util.FailedConditionCheckException; +import org.forester.util.ForesterConstants; +import org.forester.util.ForesterUtil; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +public final class PhyloXmlHandler extends DefaultHandler { + + private static final String PHYLOXML = "phyloxml"; + private String _current_element_name; + private Phylogeny _current_phylogeny; + private List _phylogenies; + private XmlElement _current_xml_element; + private PhylogenyNode _current_node; + private static Map> phylogenySequencesById = new HashMap>(); + + PhyloXmlHandler() { + // Constructor. 
+ } + + private void addNode() { + final PhylogenyNode new_node = new PhylogenyNode(); + getCurrentNode().addAsChild( new_node ); + setCurrentNode( new_node ); + } + + @Override + public void characters( final char[] chars, final int start_index, final int end_index ) { + if ( ( ( getCurrentXmlElement() != null ) && ( getCurrentElementName() != null ) ) + && !getCurrentElementName().equals( PhyloXmlMapping.CLADE ) + && !getCurrentElementName().equals( PhyloXmlMapping.PHYLOGENY ) ) { + if ( !ForesterUtil.isEmpty( getCurrentXmlElement().getValueAsString() ) ) { + getCurrentXmlElement().appendValue( new String( chars, start_index, end_index ) ); + } + else { + getCurrentXmlElement().setValue( new String( chars, start_index, end_index ) ); + } + } + } + + @Override + public void endElement( final String namespace_uri, final String local_name, final String qualified_name ) + throws SAXException { + if ( ForesterUtil.isEmpty( namespace_uri ) || namespace_uri.startsWith( ForesterConstants.PHYLO_XML_LOCATION ) ) { + if ( local_name.equals( PhyloXmlMapping.CLADE ) ) { + try { + mapElementToPhylogenyNode( getCurrentXmlElement(), getCurrentNode() ); + if ( !getCurrentNode().isRoot() ) { + setCurrentNode( getCurrentNode().getParent() ); + } + getCurrentXmlElement().setValue( null ); + setCurrentXmlElement( getCurrentXmlElement().getParent() ); + } + catch ( final PhylogenyParserException ex ) { + throw new SAXException( ex.getMessage() ); + } + } + else if ( local_name.equals( PhyloXmlMapping.SEQUENCE_RELATION ) ) { + try { + if ( getCurrentPhylogeny() != null ) { + final SequenceRelation seqRelation = ( SequenceRelation ) SequenceRelationParser + .getInstance( getCurrentPhylogeny() ).parse( getCurrentXmlElement() ); + final Map sequencesById = getSequenceMapByIdForPhylogeny( getCurrentPhylogeny() ); + final Sequence ref0 = sequencesById.get( seqRelation.getRef0().getSourceId() ), ref1 = sequencesById + .get( seqRelation.getRef1().getSourceId() ); + if ( ref0 != null ) { + // check for reverse relation + boolean fFoundReverse = false; + for( final SequenceRelation sr : ref0.getSequenceRelations() ) { + if ( sr.getType().equals( seqRelation.getType() ) + && ( ( sr.getRef0().isEqual( ref1 ) && sr.getRef1().isEqual( ref0 ) ) || ( sr + .getRef0().isEqual( ref0 ) && sr.getRef1().isEqual( ref1 ) ) ) ) { + // in this case we don't need to re-add it, but we make sure we don't loose the confidence value + fFoundReverse = true; + if ( ( sr.getConfidence() == null ) && ( seqRelation.getConfidence() != null ) ) { + sr.setConfidence( seqRelation.getConfidence() ); + } + } + } + if ( !fFoundReverse ) { + ref0.addSequenceRelation( seqRelation ); + } + } + if ( ref1 != null ) { + // check for reverse relation + boolean fFoundReverse = false; + for( final SequenceRelation sr : ref1.getSequenceRelations() ) { + if ( sr.getType().equals( seqRelation.getType() ) + && ( ( sr.getRef0().isEqual( ref1 ) && sr.getRef1().isEqual( ref0 ) ) || ( sr + .getRef0().isEqual( ref0 ) && sr.getRef1().isEqual( ref1 ) ) ) ) { + // in this case we don't need to re-add it, but we make sure we don't loose the confidence value + fFoundReverse = true; + if ( ( sr.getConfidence() == null ) && ( seqRelation.getConfidence() != null ) ) { + sr.setConfidence( seqRelation.getConfidence() ); + } + } + } + if ( !fFoundReverse ) { + ref1.addSequenceRelation( seqRelation ); + } + } + // we add the type to the current phylogeny so we can know it needs to be displayed in the combo + final Collection relationTypesForCurrentPhylogeny = getCurrentPhylogeny() 
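// This handler can be driven by any SAX parser; PhyloXmlParser (below) does so with optional
// schema validation and zip support. Minimal sketch (hypothetical file name; the constructor
// and getPhylogenies() are package-private, so this only works from inside this package):
//
//     final SAXParserFactory factory = SAXParserFactory.newInstance();
//     factory.setNamespaceAware( true );
//     final PhyloXmlHandler handler = new PhyloXmlHandler();
//     factory.newSAXParser().parse( new File( "example.phyloxml.xml" ), handler );
//     final List phylogenies = handler.getPhylogenies();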
+ .getRelevantSequenceRelationTypes(); + if ( !relationTypesForCurrentPhylogeny.contains( seqRelation.getType() ) ) { + relationTypesForCurrentPhylogeny.add( seqRelation.getType() ); + } + } + } + catch ( final PhylogenyParserException ex ) { + throw new SAXException( ex.getMessage() ); + } + } + else if ( local_name.equals( PhyloXmlMapping.PHYLOGENY ) ) { + try { + PhyloXmlHandler.mapElementToPhylogeny( getCurrentXmlElement(), getCurrentPhylogeny() ); + } + catch ( final PhylogenyParserException ex ) { + throw new SAXException( ex.getMessage() ); + } + finishPhylogeny(); + reset(); + } + else if ( local_name.equals( PHYLOXML ) ) { + // Do nothing. + } + else if ( ( getCurrentPhylogeny() != null ) && ( getCurrentXmlElement().getParent() != null ) ) { + setCurrentXmlElement( getCurrentXmlElement().getParent() ); + } + setCurrentElementName( null ); + } + } + + private void finishPhylogeny() throws SAXException { + getCurrentPhylogeny().recalculateNumberOfExternalDescendants( false ); + getPhylogenies().add( getCurrentPhylogeny() ); + final HashMap phyloSequences = phylogenySequencesById.get( getCurrentPhylogeny() ); + if ( phyloSequences != null ) { + getCurrentPhylogeny().setSequenceRelationQueries( phyloSequences.values() ); + phylogenySequencesById.remove( getCurrentPhylogeny() ); + } + } + + private String getCurrentElementName() { + return _current_element_name; + } + + private PhylogenyNode getCurrentNode() { + return _current_node; + } + + private Phylogeny getCurrentPhylogeny() { + return _current_phylogeny; + } + + private XmlElement getCurrentXmlElement() { + return _current_xml_element; + } + + List getPhylogenies() { + return _phylogenies; + } + + private void init() { + reset(); + setPhylogenies( new ArrayList() ); + } + + private void initCurrentNode() { + if ( getCurrentNode() != null ) { + throw new FailedConditionCheckException( "attempt to create new current node when current node already exists" ); + } + if ( getCurrentPhylogeny() == null ) { + throw new FailedConditionCheckException( "attempt to create new current node for non-existing phylogeny" ); + } + final PhylogenyNode node = new PhylogenyNode(); + getCurrentPhylogeny().setRoot( node ); + setCurrentNode( getCurrentPhylogeny().getRoot() ); + } + + private void mapElementToPhylogenyNode( final XmlElement xml_element, final PhylogenyNode node ) + throws PhylogenyParserException { + if ( xml_element.isHasAttribute( PhyloXmlMapping.BRANCH_LENGTH ) ) { + double d = 0; + try { + d = Double.parseDouble( xml_element.getAttribute( PhyloXmlMapping.BRANCH_LENGTH ) ); + } + catch ( final NumberFormatException e ) { + throw new PhylogenyParserException( "ill formatted distance in clade attribute [" + + xml_element.getAttribute( PhyloXmlMapping.BRANCH_LENGTH ) + "]: " + e.getMessage() ); + } + node.setDistanceToParent( d ); + } + for( int i = 0; i < xml_element.getNumberOfChildElements(); ++i ) { + final XmlElement element = xml_element.getChildElement( i ); + final String qualified_name = element.getQualifiedName(); + if ( qualified_name.equals( PhyloXmlMapping.BRANCH_LENGTH ) ) { + if ( node.getDistanceToParent() != PhylogenyNode.DISTANCE_DEFAULT ) { + throw new PhylogenyParserException( "ill advised attempt to set distance twice for the same clade (probably via element and via attribute)" ); + } + node.setDistanceToParent( element.getValueAsDouble() ); + } + if ( qualified_name.equals( PhyloXmlMapping.NODE_NAME ) ) { + node.setName( element.getValueAsString() ); + } + // else if ( qualified_name.equals( 
PhyloXmlMapping.NODE_IDENTIFIER ) ) { + // node.getNodeData().setNodeIdentifier( ( Identifier ) IdentifierParser.getInstance().parse( element ) ); + // } + else if ( qualified_name.equals( PhyloXmlMapping.TAXONOMY ) ) { + node.getNodeData().addTaxonomy( ( Taxonomy ) TaxonomyParser.getInstance().parse( element ) ); + } + else if ( qualified_name.equals( PhyloXmlMapping.SEQUENCE ) ) { + final Sequence sequence = ( Sequence ) SequenceParser.getInstance().parse( element ); + node.getNodeData().addSequence( sequence ); + // we temporarily store all sequences that have a source ID so we can access them easily when we need to attach relations to them + final String sourceId = sequence.getSourceId(); + if ( ( getCurrentPhylogeny() != null ) && !ForesterUtil.isEmpty( sourceId ) ) { + getSequenceMapByIdForPhylogeny( getCurrentPhylogeny() ).put( sourceId, sequence ); + } + } + else if ( qualified_name.equals( PhyloXmlMapping.DISTRIBUTION ) ) { + node.getNodeData().addDistribution( ( Distribution ) DistributionParser.getInstance().parse( element ) ); + } + else if ( qualified_name.equals( PhyloXmlMapping.CLADE_DATE ) ) { + node.getNodeData().setDate( ( Date ) DateParser.getInstance().parse( element ) ); + } + else if ( qualified_name.equals( PhyloXmlMapping.REFERENCE ) ) { + node.getNodeData().addReference( ( Reference ) ReferenceParser.getInstance().parse( element ) ); + } + else if ( qualified_name.equals( PhyloXmlMapping.BINARY_CHARACTERS ) ) { + node.getNodeData().setBinaryCharacters( ( BinaryCharacters ) BinaryCharactersParser.getInstance() + .parse( element ) ); + } + else if ( qualified_name.equals( PhyloXmlMapping.COLOR ) ) { + node.getBranchData().setBranchColor( ( BranchColor ) ColorParser.getInstance().parse( element ) ); + } + else if ( qualified_name.equals( PhyloXmlMapping.CONFIDENCE ) ) { + node.getBranchData().addConfidence( ( Confidence ) ConfidenceParser.getInstance().parse( element ) ); + } + else if ( qualified_name.equals( PhyloXmlMapping.WIDTH ) ) { + node.getBranchData().setBranchWidth( ( BranchWidth ) BranchWidthParser.getInstance().parse( element ) ); + } + else if ( qualified_name.equals( PhyloXmlMapping.EVENTS ) ) { + node.getNodeData().setEvent( ( Event ) EventParser.getInstance().parse( element ) ); + } + else if ( qualified_name.equals( PhyloXmlMapping.PROPERTY ) ) { + if ( !node.getNodeData().isHasProperties() ) { + node.getNodeData().setProperties( new PropertiesMap() ); + } + node.getNodeData().getProperties().addProperty( ( Property ) PropertyParser.getInstance() + .parse( element ) ); + } + } + } + + private void newClade() { + if ( getCurrentNode() == null ) { + initCurrentNode(); + } + else { + addNode(); + } + } + + private void newPhylogeny() { + setCurrentPhylogeny( new Phylogeny() ); + } + + private void reset() { + setCurrentPhylogeny( null ); + setCurrentNode( null ); + setCurrentElementName( null ); + setCurrentXmlElement( null ); + } + + private void setCurrentElementName( final String element_name ) { + _current_element_name = element_name; + } + + private void setCurrentNode( final PhylogenyNode current_node ) { + _current_node = current_node; + } + + private void setCurrentPhylogeny( final Phylogeny phylogeny ) { + _current_phylogeny = phylogeny; + } + + private void setCurrentXmlElement( final XmlElement element ) { + _current_xml_element = element; + } + + private void setPhylogenies( final List phylogenies ) { + _phylogenies = phylogenies; + } + + @Override + public void startDocument() throws SAXException { + init(); + } + + @Override + public void 
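// Example clade fragment (values made up) and where the dispatch above places each element:
//
//     <clade branch_length="0.07">                      -> node.setDistanceToParent( 0.07 )
//       <name>ADH1</name>                               -> node.setName( "ADH1" )
//       <confidence type="bootstrap">92</confidence>    -> branch data confidence
//       <taxonomy>...</taxonomy>                        -> node data taxonomy
//       <sequence id_source="seq1">...</sequence>       -> node data sequence; "seq1" is cached
//                                                          for later sequence_relation elements
//     </clade>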
startElement( final String namespace_uri, + final String local_name, + final String qualified_name, + final Attributes attributes ) throws SAXException { + if ( ForesterUtil.isEmpty( namespace_uri ) || namespace_uri.startsWith( ForesterConstants.PHYLO_XML_LOCATION ) ) { + setCurrentElementName( local_name ); + if ( local_name.equals( PhyloXmlMapping.CLADE ) ) { + final XmlElement element = new XmlElement( namespace_uri, local_name, local_name, attributes ); + getCurrentXmlElement().addChildElement( element ); + setCurrentXmlElement( element ); + newClade(); + } + else if ( local_name.equals( PhyloXmlMapping.PHYLOGENY ) ) { + setCurrentXmlElement( new XmlElement( "", "", "", null ) ); + newPhylogeny(); + final XmlElement element = new XmlElement( namespace_uri, local_name, local_name, attributes ); + if ( element.isHasAttribute( PhyloXmlMapping.PHYLOGENY_IS_REROOTABLE_ATTR ) ) { + getCurrentPhylogeny().setRerootable( Boolean.parseBoolean( element + .getAttribute( PhyloXmlMapping.PHYLOGENY_IS_REROOTABLE_ATTR ) ) ); + } + if ( element.isHasAttribute( PhyloXmlMapping.PHYLOGENY_BRANCHLENGTH_UNIT_ATTR ) ) { + getCurrentPhylogeny().setDistanceUnit( element + .getAttribute( PhyloXmlMapping.PHYLOGENY_BRANCHLENGTH_UNIT_ATTR ) ); + } + if ( element.isHasAttribute( PhyloXmlMapping.PHYLOGENY_IS_ROOTED_ATTR ) ) { + getCurrentPhylogeny().setRooted( Boolean.parseBoolean( element + .getAttribute( PhyloXmlMapping.PHYLOGENY_IS_ROOTED_ATTR ) ) ); + } + if ( element.isHasAttribute( PhyloXmlMapping.PHYLOGENY_TYPE_ATTR ) ) { + getCurrentPhylogeny().setType( ( element.getAttribute( PhyloXmlMapping.PHYLOGENY_TYPE_ATTR ) ) ); + } + } + else if ( local_name.equals( PHYLOXML ) ) { + } + else if ( getCurrentPhylogeny() != null ) { + final XmlElement element = new XmlElement( namespace_uri, local_name, local_name, attributes ); + getCurrentXmlElement().addChildElement( element ); + setCurrentXmlElement( element ); + } + } + } + + public static boolean attributeEqualsValue( final XmlElement element, + final String attributeName, + final String attributeValue ) { + final String attr = element.getAttribute( attributeName ); + return ( ( attr != null ) && attr.equals( attributeValue ) ); + } + + public static String getAtttributeValue( final XmlElement element, final String attributeName ) { + final String attr = element.getAttribute( attributeName ); + if ( attr != null ) { + return attr; + } + else { + return ""; + } + } + + static public Map getSequenceMapByIdForPhylogeny( final Phylogeny ph ) { + HashMap seqMap = phylogenySequencesById.get( ph ); + if ( seqMap == null ) { + seqMap = new HashMap(); + phylogenySequencesById.put( ph, seqMap ); + } + return seqMap; + } + + private static void mapElementToPhylogeny( final XmlElement xml_element, final Phylogeny phylogeny ) + throws PhylogenyParserException { + for( int i = 0; i < xml_element.getNumberOfChildElements(); ++i ) { + final XmlElement element = xml_element.getChildElement( i ); + final String qualified_name = element.getQualifiedName(); + if ( qualified_name.equals( PhyloXmlMapping.PHYLOGENY_NAME ) ) { + phylogeny.setName( element.getValueAsString() ); + } + else if ( qualified_name.equals( PhyloXmlMapping.PHYLOGENY_DESCRIPTION ) ) { + phylogeny.setDescription( element.getValueAsString() ); + } + else if ( qualified_name.equals( PhyloXmlMapping.IDENTIFIER ) ) { + phylogeny.setIdentifier( ( Identifier ) IdentifierParser.getInstance().parse( element ) ); + } + else if ( qualified_name.equals( PhyloXmlMapping.CONFIDENCE ) ) { + phylogeny.setConfidence( ( Confidence 
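// Example phylogeny attributes (values made up) and the setters reached above:
//
//     <phylogeny rooted="true" rerootable="false"
//                branch_length_unit="substitutions/site" type="gene tree">
//
// -> setRooted( true ), setRerootable( false ),
//    setDistanceUnit( "substitutions/site" ), setType( "gene tree" )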
) ConfidenceParser.getInstance().parse( element ) ); + } + } + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlMapping.java b/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlMapping.java new file mode 100644 index 0000000..04a8cb6 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlMapping.java @@ -0,0 +1,134 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml; + +/* + * @author Christian Zmasek TODO To change the template for this generated type + * comment go to Window - Preferences - Java - Code Style - Code Templates + */ +public final class PhyloXmlMapping { + + public static final String PHYLOGENY = "phylogeny"; + public static final String PHYLOGENY_NAME = "name"; + public static final String PHYLOGENY_DESCRIPTION = "description"; + public static final String PHYLOGENY_IS_REROOTABLE_ATTR = "rerootable"; + public static final String PHYLOGENY_BRANCHLENGTH_UNIT_ATTR = "branch_length_unit"; + public static final String PHYLOGENY_IS_ROOTED_ATTR = "rooted"; + public static final String PHYLOGENY_TYPE_ATTR = "type"; + public static final String CLADE = "clade"; + public static final String NODE_NAME = "name"; + public static final String SEQUENCE = "sequence"; + public static final String SEQUENCE_NAME = "name"; + public static final String SEQUENCE_SYMBOL = "symbol"; + public static final String ACCESSION = "accession"; + public static final String ACCESSION_SOURCE_ATTR = "source"; + public static final String SEQUENCE_LOCATION = "location"; + public static final String SEQUENCE_MOL_SEQ = "mol_seq"; + public static final String SEQUENCE_MOL_SEQ_ALIGNED_ATTR = "is_aligned"; + public static final String ANNOTATION = "annotation"; + public static final String ANNOTATION_DESC = "desc"; + public static final String ANNOTATION_REF_ATTR = "ref"; + public static final String ANNOTATION_EVIDENCE_ATTR = "evidence"; + public static final String ANNOTATION_TYPE_ATTR = "type"; + public static final String TAXONOMY = "taxonomy"; + public static final String TAXONOMY_SCIENTIFIC_NAME = "scientific_name"; + public static final String TAXONOMY_COMMON_NAME = "common_name"; + public static final String TAXONOMY_CODE = "code"; + public static final String TAXONOMY_RANK = "rank"; + public static final String TAXONOMY_SYNONYM = "synonym"; + public static final String 
TAXONOMY_AUTHORITY = "authority"; + public static final String DISTRIBUTION = "distribution"; + public static final String BINARY_CHARACTERS = "binary_characters"; + public static final String BINARY_CHARACTERS_PRESENT = "present"; + public static final String BINARY_CHARACTERS_GAINED = "gained"; + public static final String BINARY_CHARACTERS_LOST = "lost"; + public static final String BINARY_CHARACTERS_TYPE_ATTR = "type"; + public static final String BINARY_CHARACTERS_PRESENT_COUNT_ATTR = "present_count"; + public static final String BINARY_CHARACTERS_GAINED_COUNT_ATTR = "gained_count"; + public static final String BINARY_CHARACTERS_LOST_COUNT_ATTR = "lost_count"; + public static final String BRANCH_LENGTH = "branch_length"; + public static final String CONFIDENCE = "confidence"; + public static final String CONFIDENCE_TYPE_ATTR = "type"; + public static final String COLOR = "color"; + public static final String COLOR_RED = "red"; + public static final String COLOR_GREEN = "green"; + public static final String COLOR_BLUE = "blue"; + public final static String SEQUENCE_DOMAIN_ARCHITECTURE_DOMAIN = "domain"; + public final static String SEQUENCE_DOMAIN_ARCHITECTURE_PROT_DOMAIN_FROM = "from"; + public final static String SEQUENCE_DOMAIN_ARCHITECTURE_PROT_DOMAIN_TO = "to"; + public final static String SEQUENCE_DOMAIN_ARCHITECTURE_PROT_DOMAIN_CONFIDENCE = "confidence"; + // public final static String NODE_IDENTIFIER = "node_id"; + public final static String IDENTIFIER = "id"; + public final static String IDENTIFIER_PROVIDER_ATTR = "provider"; + public static final String URI = "uri"; + public static final String WIDTH = "width"; + public final static String EVENTS = "events"; + public final static String EVENT_TYPE = "type"; + public final static String EVENT_DUPLICATIONS = "duplications"; + public final static String EVENT_SPECIATIONS = "speciations"; + public final static String EVENT_LOSSES = "losses"; + public final static String SEQUENCE_DOMAIN_ARCHITECURE = "domain_architecture"; + public final static String SEQUENCE_DOMAIN_ARCHITECTURE_LENGTH = "length"; + public final static String SEQUENCE_TYPE = "type"; + public static final String BINARY_CHARACTER = "bc"; + public static final String URI_DESC_ATTR = "desc"; + public static final String TYPE_ATTR = "type"; + public static final String REFERENCE = "reference"; + public static final String REFERENCE_DOI_ATTR = "doi"; + public static final String REFERENCE_DESC = "desc"; + public static final String PROPERTY = "property"; + public static final String PROPERTY_REF = "ref"; + public static final String PROPERTY_UNIT = "unit"; + public static final String PROPERTY_DATATYPE = "datatype"; + public static final String PROPERTY_APPLIES_TO = "applies_to"; + public static final String ID_REF = "id_ref"; + public static final String ANNOTATION_SOURCE_ATTR = "source"; + public static final String DISTRIBUTION_DESC = "desc"; + public static final String POINT = "point"; + public static final String POINT_LONGITUDE = "long"; + public static final String POINT_LATITUDE = "lat"; + public static final String POINT_ALTITUDE = "alt"; + public static final String POINT_ALTITUDE_UNIT_ATTR = "alt_unit"; + public static final String POINT_GEODETIC_DATUM = "geodetic_datum"; + public static final String CLADE_DATE = "date"; + public static final String CLADE_DATE_UNIT = "unit"; + public static final String CLADE_DATE_DESC = "desc"; + public static final String CLADE_DATE_MIN = "minimum"; + public static final String CLADE_DATE_MAX = "maximum"; + public static 
final String CLADE_DATE_VALUE = "value"; + public final static String SEQUENCE_RELATION = "sequence_relation"; + public final static String SEQUENCE_RELATION_TYPE = "type"; + public final static String SEQUENCE_RELATION_ID_REF0 = "id_ref_0"; + public final static String SEQUENCE_RELATION_ID_REF1 = "id_ref_1"; + public final static String SEQUENCE_RELATION_DISTANCE = "distance"; + public final static String SEQUENCE_SOURCE_ID = "id_source"; + public final static String POLYGON = "polygon"; + + private PhyloXmlMapping() { + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlParser.java new file mode 100644 index 0000000..e5fa2d0 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlParser.java @@ -0,0 +1,313 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.StringReader; +import java.net.URL; +import java.util.Date; +import java.util.Enumeration; +import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; +import java.util.zip.ZipInputStream; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.Phylogeny; +import org.forester.util.ForesterConstants; +import org.forester.util.ForesterUtil; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.SAXNotRecognizedException; +import org.xml.sax.SAXNotSupportedException; +import org.xml.sax.SAXParseException; +import org.xml.sax.XMLReader; +import org.xml.sax.helpers.DefaultHandler; + +public class PhyloXmlParser implements PhylogenyParser { + + final public static String JAXP_SCHEMA_LANGUAGE = "http://java.sun.com/xml/jaxp/properties/schemaLanguage"; + final public static String W3C_XML_SCHEMA = "http://www.w3.org/2001/XMLSchema"; + final public static String JAXP_SCHEMA_SOURCE = "http://java.sun.com/xml/jaxp/properties/schemaSource"; + final public static String SAX_FEATURES_VALIDATION = "http://xml.org/sax/features/validation"; + final public static String APACHE_FEATURES_VALIDATION_SCHEMA = "http://apache.org/xml/features/validation/schema"; + final public static String APACHE_FEATURES_VALIDATION_SCHEMA_FULL = "http://apache.org/xml/features/validation/schema-full-checking"; + final public static String APACHE_PROPERTIES_SCHEMA_EXTERNAL_LOCATION = "http://apache.org/xml/properties/schema/external-schemaLocation"; + final static private boolean TIME = false; + private Object _source; + private boolean _valid; + private boolean _zipped_inputstream; + private int _error_count; + private int _warning_count; + private String _schema_location; + private StringBuffer _error_messages; + private StringBuffer _warning_messages; + + public PhyloXmlParser() { + init(); + reset(); + } + + public int getErrorCount() { + return _error_count; + } + + public StringBuffer getErrorMessages() { + return _error_messages; + } + + private Reader getReaderFromZipFile() throws IOException { + Reader reader = null; + final ZipFile zip_file = new ZipFile( getSource().toString() ); + final Enumeration zip_file_entries = zip_file.entries(); + while ( zip_file_entries.hasMoreElements() ) { + final ZipEntry zip_file_entry = ( ZipEntry ) zip_file_entries.nextElement(); + if ( !zip_file_entry.isDirectory() && ( zip_file_entry.getSize() > 0 ) ) { + final InputStream is = zip_file.getInputStream( zip_file_entry ); + reader = new InputStreamReader( is ); + break; + } + } + return reader; + } + + private String getSchemaLocation() { + return _schema_location; + } + + private Object getSource() { + return _source; + } + + public int getWarningCount() { + return _warning_count; + } + + public StringBuffer getWarningMessages() { + return _warning_messages; + } + + private void init() { + setZippedInputstream( false ); + } + + public boolean isValid() { + return _valid; + } + + private boolean isZippedInputstream() { + return _zipped_inputstream; + } + + public Phylogeny[] parse() throws IOException, 
PhylogenyParserException { + reset(); + final PhyloXmlHandler handler = new PhyloXmlHandler(); + final SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setNamespaceAware( true ); + try { + if ( !ForesterUtil.isEmpty( getSchemaLocation() ) ) { + factory.setFeature( SAX_FEATURES_VALIDATION, true ); + factory.setFeature( APACHE_FEATURES_VALIDATION_SCHEMA, true ); + factory.setFeature( APACHE_FEATURES_VALIDATION_SCHEMA_FULL, true ); + } + } + catch ( final SAXNotRecognizedException e ) { + e.printStackTrace(); + throw new PhylogenyParserException( "sax not recognized exception: " + e.getLocalizedMessage() ); + } + catch ( final SAXNotSupportedException e ) { + e.printStackTrace(); + throw new PhylogenyParserException( "sax not supported exception: " + e.getLocalizedMessage() ); + } + catch ( final ParserConfigurationException e ) { + e.printStackTrace(); + throw new PhylogenyParserException( "parser configuration exception: " + e.getLocalizedMessage() ); + } + catch ( final Exception e ) { + e.printStackTrace(); + throw new PhylogenyParserException( "error while configuring sax parser: " + e.getLocalizedMessage() ); + } + try { + final SAXParser parser = factory.newSAXParser(); + if ( !ForesterUtil.isEmpty( getSchemaLocation() ) ) { + parser.setProperty( JAXP_SCHEMA_LANGUAGE, W3C_XML_SCHEMA ); + parser.setProperty( JAXP_SCHEMA_SOURCE, getSchemaLocation() ); + parser.setProperty( APACHE_PROPERTIES_SCHEMA_EXTERNAL_LOCATION, getSchemaLocation() ); + } + final XMLReader xml_reader = parser.getXMLReader(); + xml_reader.setContentHandler( handler ); + xml_reader.setErrorHandler( new PhyloXmlParserErrorHandler() ); + long start_time = 0; + if ( TIME ) { + start_time = new Date().getTime(); + } + if ( getSource() instanceof File ) { + if ( !getSource().toString().toLowerCase().endsWith( ".zip" ) ) { + xml_reader.parse( new InputSource( new FileReader( ( File ) getSource() ) ) ); + } + else { + final Reader reader = getReaderFromZipFile(); + if ( reader == null ) { + throw new PhylogenyParserException( "zip file \"" + getSource() + + "\" appears not to contain any entries" ); + } + xml_reader.parse( new InputSource( reader ) ); + } + } + else if ( getSource() instanceof InputSource ) { + xml_reader.parse( ( InputSource ) getSource() ); + } + else if ( getSource() instanceof InputStream ) { + if ( !isZippedInputstream() ) { + final InputStream is = ( InputStream ) getSource(); + final Reader reader = new InputStreamReader( is ); + xml_reader.parse( new InputSource( reader ) ); + } + else { + final ZipInputStream zip_is = new ZipInputStream( ( InputStream ) getSource() ); + zip_is.getNextEntry(); + final Reader reader = new InputStreamReader( zip_is ); + if ( reader == null ) { + throw new PhylogenyParserException( "zip input stream \"" + getSource() + + "\" appears not to contain any (phyloXML) data" ); + } + xml_reader.parse( new InputSource( reader ) ); + } + } + else if ( getSource() instanceof String ) { + final File file = new File( getSource().toString() ); + final Reader reader = new FileReader( file ); + xml_reader.parse( new InputSource( reader ) ); + } + else if ( getSource() instanceof StringBuffer ) { + final StringReader string_reader = new StringReader( getSource().toString() ); + xml_reader.parse( new InputSource( string_reader ) ); + } + else { + throw new PhylogenyParserException( "phyloXML parser: attempt to parse object of unsupported type: \"" + + getSource().getClass() + "\"" ); + } + if ( TIME ) { + System.out.println( "[TIME] phyloXML parsing: " + ( new 
Date().getTime() - start_time ) + "ms." ); + } + } + catch ( final SAXException sax_exception ) { + throw new PhylogenyParserException( "failed to parse [" + getSource() + "]: " + + sax_exception.getLocalizedMessage() ); + } + catch ( final ParserConfigurationException parser_config_exception ) { + throw new PhylogenyParserException( "failed to parse [" + getSource() + + "]. Problem with XML parser configuration: " + parser_config_exception.getLocalizedMessage() ); + } + catch ( final IOException e ) { + throw new PhylogenyParserException( "problem with input source: " + e.getLocalizedMessage() ); + } + catch ( final Exception e ) { + throw new PhylogenyParserException( e.getLocalizedMessage() ); + } + catch ( final Error err ) { + err.printStackTrace(); + throw new PhylogenyParserException( "severe error: " + err.getLocalizedMessage() ); + } + final Phylogeny[] ps = new Phylogeny[ handler.getPhylogenies().size() ]; + int i = 0; + for( final Phylogeny phylogeny : handler.getPhylogenies() ) { + ps[ i++ ] = phylogeny; + } + return ps; + } + + private void reset() { + _valid = true; + _error_count = 0; + _warning_count = 0; + _error_messages = new StringBuffer(); + _warning_messages = new StringBuffer(); + } + + public void setSource( final Object source ) { + _source = source; + } + + public void setValidateAgainstSchema( final String schema_location ) { + _schema_location = schema_location; + } + + public void setZippedInputstream( final boolean zipped_inputstream ) { + _zipped_inputstream = zipped_inputstream; + } + + public static PhyloXmlParser createPhyloXmlParserXsdValidating() { + final PhyloXmlParser xml_parser = new PhyloXmlParser(); + final ClassLoader cl = PhyloXmlParser.class.getClassLoader(); + final URL xsd_url = cl.getResource( ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE ); + if ( xsd_url != null ) { + xml_parser.setValidateAgainstSchema( xsd_url.toString() ); + } + else { + throw new RuntimeException( "failed to get URL for phyloXML XSD from jar file from [" + + ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE + "]" ); + } + return xml_parser; + } + + private class PhyloXmlParserErrorHandler extends DefaultHandler { + + @Override + public void error( final SAXParseException e ) { + ++_error_count; + _valid = false; + throw new PhyloXmlException( "phyloXML error at line " + e.getLineNumber() + ": \n" + + e.getLocalizedMessage() ); + } + + @Override + public void fatalError( final SAXParseException e ) { + ++_error_count; + _valid = false; + throw new PhyloXmlException( "fatal XML error at line " + e.getLineNumber() + ": \n" + + e.getLocalizedMessage() ); + } + + @Override + public void warning( final SAXParseException e ) { + ++_warning_count; + if ( _warning_messages.length() > 0 ) { + _warning_messages.append( ForesterUtil.LINE_SEPARATOR ); + } + _warning_messages.append( "[line: " + e.getLineNumber() + "] " + e.getMessage() ); + } + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlUtil.java b/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlUtil.java new file mode 100644 index 0000000..277dfa1 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlUtil.java @@ -0,0 +1,98 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M.
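The parser defined above accepts a File (optionally a .zip archive), an InputStream, an InputSource, a String file name, or a StringBuffer as its source. A minimal usage sketch, assuming the forester classes above are on the classpath; the class name and the input path "example_tree.xml" are illustrative only:

import java.io.File;
import java.io.IOException;

import org.forester.io.parsers.phyloxml.PhyloXmlParser;
import org.forester.io.parsers.util.PhylogenyParserException;
import org.forester.phylogeny.Phylogeny;

public class PhyloXmlParserSketch {

    public static void main( final String[] args ) throws IOException, PhylogenyParserException {
        // Parser configured to validate against the phyloXML XSD bundled with forester
        // (createPhyloXmlParserXsdValidating() throws if the XSD resource cannot be found).
        final PhyloXmlParser parser = PhyloXmlParser.createPhyloXmlParserXsdValidating();
        parser.setSource( new File( "example_tree.xml" ) ); // hypothetical input file
        final Phylogeny[] phylogenies = parser.parse();
        System.out.println( "parsed " + phylogenies.length + " phylogenies; valid = " + parser.isValid()
                + ", warnings = " + parser.getWarningCount() );
    }
}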
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml; + +import java.util.HashSet; +import java.util.Set; +import java.util.regex.Pattern; + +public final class PhyloXmlUtil { + + public final static Pattern SEQUENCE_SYMBOL_PATTERN = Pattern.compile( "\\S{1,20}" ); + public final static Pattern TAXOMONY_CODE_PATTERN = Pattern.compile( "[a-zA-Z0-9_]{1,10}" ); + public final static Pattern LIT_REF_DOI_PATTERN = Pattern + .compile( "[a-zA-Z0-9_\\.]+\\S+" ); + public final static Set SEQUENCE_TYPES = new HashSet(); + public final static Set TAXONOMY_RANKS = new HashSet(); + public static final int ROUNDING_DIGITS_FOR_PHYLOXML_DOUBLE_OUTPUT = 9; + public static final String VECTOR_PROPERTY_REF = "vector:index="; + public static final String VECTOR_PROPERTY_TYPE = "xsd:decimal"; + static { + SEQUENCE_TYPES.add( "rna" ); + SEQUENCE_TYPES.add( "protein" ); + SEQUENCE_TYPES.add( "dna" ); + TAXONOMY_RANKS.add( "domain" ); + TAXONOMY_RANKS.add( "superkingdom" ); + TAXONOMY_RANKS.add( "kingdom" ); + TAXONOMY_RANKS.add( "subkingdom" ); + TAXONOMY_RANKS.add( "branch" ); + TAXONOMY_RANKS.add( "infrakingdom" ); + TAXONOMY_RANKS.add( "superphylum" ); + TAXONOMY_RANKS.add( "phylum" ); + TAXONOMY_RANKS.add( "subphylum" ); + TAXONOMY_RANKS.add( "infraphylum" ); + TAXONOMY_RANKS.add( "microphylum" ); + TAXONOMY_RANKS.add( "superdivision" ); + TAXONOMY_RANKS.add( "division" ); + TAXONOMY_RANKS.add( "subdivision" ); + TAXONOMY_RANKS.add( "infradivision" ); + TAXONOMY_RANKS.add( "superclass" ); + TAXONOMY_RANKS.add( "class" ); + TAXONOMY_RANKS.add( "subclass" ); + TAXONOMY_RANKS.add( "infraclass" ); + TAXONOMY_RANKS.add( "superlegion" ); + TAXONOMY_RANKS.add( "legion" ); + TAXONOMY_RANKS.add( "sublegion" ); + TAXONOMY_RANKS.add( "infralegion" ); + TAXONOMY_RANKS.add( "supercohort" ); + TAXONOMY_RANKS.add( "cohort" ); + TAXONOMY_RANKS.add( "subcohort" ); + TAXONOMY_RANKS.add( "infracohort" ); + TAXONOMY_RANKS.add( "superorder" ); + TAXONOMY_RANKS.add( "order" ); + TAXONOMY_RANKS.add( "suborder" ); + TAXONOMY_RANKS.add( "superfamily" ); + TAXONOMY_RANKS.add( "family" ); + TAXONOMY_RANKS.add( "subfamily" ); + TAXONOMY_RANKS.add( "supertribe" ); + TAXONOMY_RANKS.add( "tribe" ); + TAXONOMY_RANKS.add( "subtribe" ); + TAXONOMY_RANKS.add( "infratribe" ); + TAXONOMY_RANKS.add( "genus" ); + TAXONOMY_RANKS.add( "subgenus" ); + TAXONOMY_RANKS.add( "superspecies" ); + TAXONOMY_RANKS.add( "species" ); + TAXONOMY_RANKS.add( "subspecies" ); + TAXONOMY_RANKS.add( "variety" ); + TAXONOMY_RANKS.add( "subvariety" ); + TAXONOMY_RANKS.add( "form" ); + TAXONOMY_RANKS.add( "subform" ); + TAXONOMY_RANKS.add( "cultivar" ); + 
TAXONOMY_RANKS.add( "strain" ); + TAXONOMY_RANKS.add( "unknown" ); + TAXONOMY_RANKS.add( "other" ); + }; +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/XmlElement.java b/forester/java/src/org/forester/io/parsers/phyloxml/XmlElement.java new file mode 100644 index 0000000..442c937 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/XmlElement.java @@ -0,0 +1,213 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml; + +import java.util.ArrayList; +import java.util.HashMap; + +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.util.ForesterUtil; +import org.xml.sax.Attributes; + +public class XmlElement { + + public final static boolean DEBUG = false; + private final String _namespaceUri; + private final String _localName; + private final String _qualifiedName; + private String _value; + private final HashMap _attributes; + private final ArrayList _childElements; + private XmlElement _parent; + + public XmlElement( final String namespaceUri, + final String localName, + final String qualifiedName, + final Attributes attributes ) { + _namespaceUri = namespaceUri; + _localName = localName; + _qualifiedName = qualifiedName; + if ( attributes != null ) { + _attributes = new HashMap( attributes.getLength() ); + for( int i = 0; i < attributes.getLength(); ++i ) { + getAttributes().put( new String( attributes.getQName( i ) ), new String( attributes.getValue( i ) ) ); + } + } + else { + _attributes = new HashMap(); + } + _childElements = new ArrayList(); + _parent = null; + } + + public void addChildElement( final XmlElement element ) { + element.setParent( this ); + getChildElements().add( element ); + } + + public void appendValue( final String value ) { + _value = _value + value; + } + + public String getAttribute( final String attribute_name ) { + if ( !isHasAttribute( attribute_name ) ) { + throw new IllegalArgumentException( "no attribute named [" + attribute_name + "] present in element [" + + getQualifiedName() + "]" ); + } + return getAttributes().get( attribute_name ); + } + + public HashMap getAttributes() { + return _attributes; + } + + public XmlElement getChildElement( final int i ) { + if ( ( i < 0 ) || ( i >= getNumberOfChildElements() ) ) { + throw new IllegalArgumentException( "attempt to get child element with index " + i + " for element with " + + getNumberOfChildElements() + " child elements" ); + } + return getChildElements().get( i ); + } + + 
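PhyloXmlUtil, completed above, only holds validation constants. A short sketch of how they can be used to check values before writing them into a phyloXML document (field names used exactly as declared in the source, including TAXOMONY_CODE_PATTERN); the class name is illustrative:

import org.forester.io.parsers.phyloxml.PhyloXmlUtil;

public class PhyloXmlUtilSketch {

    public static void main( final String[] args ) {
        // Taxonomy codes: 1-10 alphanumeric or underscore characters.
        System.out.println( PhyloXmlUtil.TAXOMONY_CODE_PATTERN.matcher( "ARATH" ).matches() );      // true
        System.out.println( PhyloXmlUtil.TAXOMONY_CODE_PATTERN.matcher( "not a code" ).matches() ); // false
        // Controlled vocabularies for sequence types and taxonomy ranks.
        System.out.println( PhyloXmlUtil.SEQUENCE_TYPES.contains( "protein" ) ); // true
        System.out.println( PhyloXmlUtil.TAXONOMY_RANKS.contains( "genus" ) );   // true
    }
}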
ArrayList getChildElements() { + return _childElements; + } + + String getLocalName() { + return _localName; + } + + String getNamespaceUri() { + return _namespaceUri; + } + + public int getNumberOfChildElements() { + return getChildElements().size(); + } + + public XmlElement getParent() { + return _parent; + } + + public String getQualifiedName() { + return _qualifiedName; + } + + XmlElement getRoot() { + XmlElement e = this; + while ( e.getParent() != null ) { + e = e.getParent(); + } + return e; + } + + public boolean getValueAsBoolean() throws PhylogenyParserException { + boolean b = false; + try { + b = ( new Boolean( getValueAsString() ) ).booleanValue(); + } + catch ( final NumberFormatException ex ) { + throw new PhylogenyParserException( "attempt to parse [" + getValueAsString() + "] into boolean, in " + + toString() ); + } + return b; + } + + public double getValueAsDouble() throws PhylogenyParserException { + double d = 0.0; + try { + d = Double.parseDouble( getValueAsString() ); + } + catch ( final NumberFormatException ex ) { + throw new PhylogenyParserException( "attempt to parse [" + getValueAsString() + "] into double, in " + + toString() ); + } + return d; + } + + public int getValueAsInt() throws PhylogenyParserException { + int i = 0; + try { + i = Integer.parseInt( getValueAsString() ); + } + catch ( final NumberFormatException ex ) { + throw new PhylogenyParserException( "attempt to parse [" + getValueAsString() + "] into integer, in " + + toString() ); + } + return i; + } + + public String getValueAsString() { + if ( _value == null ) { + return ""; + } + return _value.replaceAll( "\\s+", " " ).trim(); + } + + public boolean isHasAttribute( final String attribute_name ) { + return getAttributes().containsKey( attribute_name ); + } + + public boolean isHasValue() { + return !ForesterUtil.isEmpty( _value ); + } + + void setParent( final XmlElement parent ) { + _parent = parent; + } + + /** + * [Careful, this does not call "new String(...)"] + * + * @param value + */ + public void setValue( final String value ) { + _value = value; + if ( XmlElement.DEBUG ) { + System.out.println(); + System.out.println( "Value is \"" + value + "\" for" ); + System.out.println( "Local name = " + getLocalName() ); + System.out.println( "Qualified name = " + getQualifiedName() ); + System.out.println( "Namespace URI = " + getNamespaceUri() ); + System.out.print( "Attributes : " ); + for( final String string : getAttributes().keySet() ) { + final String key = string; + System.out.print( key + " = \"" + getAttributes().get( key ) + "\" " ); + } + System.out.println(); + System.out.println(); + } + } + + @Override + public String toString() { + if ( getParent() != null ) { + return "\"" + getQualifiedName() + "\" [value: " + getValueAsString() + ", parent element: \"" + + getParent().getQualifiedName() + "\"]"; + } + return "\"" + getQualifiedName() + "\" [value: " + getValueAsString() + "]"; + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/AccessionParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/AccessionParser.java new file mode 100644 index 0000000..01d3c54 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/AccessionParser.java @@ -0,0 +1,63 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. 
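XmlElement is the lightweight DOM-like node that the SAX handler assembles and that all the data parsers below consume. A standalone sketch (hand-built elements, null SAX attributes, illustrative class name) showing how values are stored and read back through the typed accessors:

import org.forester.io.parsers.phyloxml.XmlElement;
import org.forester.io.parsers.util.PhylogenyParserException;

public class XmlElementSketch {

    public static void main( final String[] args ) throws PhylogenyParserException {
        // A <clade> element with one <confidence> child; attributes are omitted (null) for brevity.
        final XmlElement clade = new XmlElement( "", "clade", "clade", null );
        final XmlElement confidence = new XmlElement( "", "confidence", "confidence", null );
        confidence.setValue( " 0.95 " );
        clade.addChildElement( confidence );
        // getValueAsString() collapses whitespace and trims; getValueAsDouble() parses the result.
        System.out.println( clade.getNumberOfChildElements() );                // 1
        System.out.println( clade.getChildElement( 0 ).getValueAsDouble() );   // 0.95
        System.out.println( clade.getChildElement( 0 ).getParent() == clade ); // true
    }
}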
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/ + +package org.forester.io.parsers.phyloxml.data; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.Accession; +import org.forester.phylogeny.data.PhylogenyData; + +public class AccessionParser implements PhylogenyDataPhyloXmlParser { + + private final static PhylogenyDataPhyloXmlParser _instance; + static { + try { + _instance = new AccessionParser(); + } + catch ( final Throwable e ) { + throw new RuntimeException( e.getMessage() ); + } + } + + private AccessionParser() { + } + + @Override + public PhylogenyData parse( final XmlElement element ) throws PhylogenyParserException { + if ( element.isHasAttribute( PhyloXmlMapping.ACCESSION_SOURCE_ATTR ) ) { + return new Accession( element.getValueAsString(), element + .getAttribute( PhyloXmlMapping.ACCESSION_SOURCE_ATTR ) ); + } + else { + return new Accession( element.getValueAsString(), "?" ); + } + } + + public static PhylogenyDataPhyloXmlParser getInstance() { + return _instance; + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/AnnotationParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/AnnotationParser.java new file mode 100644 index 0000000..8f25921 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/AnnotationParser.java @@ -0,0 +1,97 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml.data; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.Annotation; +import org.forester.phylogeny.data.Confidence; +import org.forester.phylogeny.data.PhylogenyData; +import org.forester.phylogeny.data.PropertiesMap; +import org.forester.phylogeny.data.Property; +import org.forester.phylogeny.data.Uri; + +public class AnnotationParser implements PhylogenyDataPhyloXmlParser { + + private static final PhylogenyDataPhyloXmlParser _instance; + static { + try { + _instance = new AnnotationParser(); + } + catch ( final Throwable e ) { + throw new RuntimeException( e.getMessage() ); + } + } + + private AnnotationParser() { + } + + @Override + public PhylogenyData parse( final XmlElement element ) throws PhylogenyParserException { + String ref; + if ( element.isHasAttribute( PhyloXmlMapping.ANNOTATION_REF_ATTR ) ) { + ref = element.getAttribute( PhyloXmlMapping.ANNOTATION_REF_ATTR ); + } + else { + ref = "_:_"; + } + final Annotation annotation = new Annotation( ref ); + if ( element.isHasAttribute( PhyloXmlMapping.ANNOTATION_TYPE_ATTR ) ) { + annotation.setType( element.getAttribute( PhyloXmlMapping.ANNOTATION_TYPE_ATTR ) ); + } + if ( element.isHasAttribute( PhyloXmlMapping.ANNOTATION_EVIDENCE_ATTR ) ) { + annotation.setEvidence( element.getAttribute( PhyloXmlMapping.ANNOTATION_EVIDENCE_ATTR ) ); + } + if ( element.isHasAttribute( PhyloXmlMapping.ANNOTATION_SOURCE_ATTR ) ) { + annotation.setSource( element.getAttribute( PhyloXmlMapping.ANNOTATION_SOURCE_ATTR ) ); + } + for( int i = 0; i < element.getNumberOfChildElements(); ++i ) { + final XmlElement child_element = element.getChildElement( i ); + if ( child_element.getQualifiedName().equals( PhyloXmlMapping.ANNOTATION_DESC ) ) { + annotation.setDesc( child_element.getValueAsString() ); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.CONFIDENCE ) ) { + annotation.setConfidence( ( Confidence ) ConfidenceParser.getInstance().parse( child_element ) ); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.URI ) ) { + annotation.addUri( ( Uri ) UriParser.getInstance().parse( child_element ) ); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.PROPERTY ) ) { + if ( annotation.getProperties() == null ) { + annotation.setProperties( new PropertiesMap() ); + } + annotation.getProperties() + .addProperty( ( Property ) PropertyParser.getInstance().parse( child_element ) ); + } + } + return annotation; + } + + public static PhylogenyDataPhyloXmlParser getInstance() { + return _instance; + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/BinaryCharactersParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/BinaryCharactersParser.java new file mode 100644 index 0000000..11469c4 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/BinaryCharactersParser.java @@ -0,0 +1,115 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. 
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml.data; + +import java.util.SortedSet; +import java.util.TreeSet; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.BinaryCharacters; +import org.forester.phylogeny.data.PhylogenyData; + +public class BinaryCharactersParser implements PhylogenyDataPhyloXmlParser { + + private static final BinaryCharactersParser _instance; + static { + try { + _instance = new BinaryCharactersParser(); + } + catch ( final Throwable e ) { + throw new RuntimeException( e.getMessage() ); + } + } + + private BinaryCharactersParser() { + } + + @Override + public PhylogenyData parse( final XmlElement element ) throws PhylogenyParserException { + final SortedSet present = new TreeSet(); + final SortedSet gained = new TreeSet(); + final SortedSet lost = new TreeSet(); + String type = ""; + int present_count = BinaryCharacters.COUNT_DEFAULT; + int gained_count = BinaryCharacters.COUNT_DEFAULT; + int lost_count = BinaryCharacters.COUNT_DEFAULT; + if ( element.isHasAttribute( PhyloXmlMapping.BINARY_CHARACTERS_TYPE_ATTR ) ) { + type = element.getAttribute( PhyloXmlMapping.BINARY_CHARACTERS_TYPE_ATTR ); + } + try { + if ( element.isHasAttribute( PhyloXmlMapping.BINARY_CHARACTERS_PRESENT_COUNT_ATTR ) ) { + present_count = Integer.parseInt( element + .getAttribute( PhyloXmlMapping.BINARY_CHARACTERS_PRESENT_COUNT_ATTR ) ); + } + if ( element.isHasAttribute( PhyloXmlMapping.BINARY_CHARACTERS_GAINED_COUNT_ATTR ) ) { + gained_count = Integer.parseInt( element + .getAttribute( PhyloXmlMapping.BINARY_CHARACTERS_GAINED_COUNT_ATTR ) ); + } + if ( element.isHasAttribute( PhyloXmlMapping.BINARY_CHARACTERS_LOST_COUNT_ATTR ) ) { + lost_count = Integer + .parseInt( element.getAttribute( PhyloXmlMapping.BINARY_CHARACTERS_LOST_COUNT_ATTR ) ); + } + } + catch ( final NumberFormatException e ) { + throw new PhylogenyParserException( "failed to parse integer from element " + element.getQualifiedName() ); + } + for( int i = 0; i < element.getNumberOfChildElements(); ++i ) { + final XmlElement child_element = element.getChildElement( i ); + if ( child_element.getQualifiedName().equals( PhyloXmlMapping.BINARY_CHARACTERS_PRESENT ) ) { + parseCharacters( present, child_element ); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.BINARY_CHARACTERS_GAINED ) ) { + parseCharacters( gained, child_element ); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.BINARY_CHARACTERS_LOST ) ) { + 
parseCharacters( lost, child_element ); + } + } + BinaryCharacters bc = null; + if ( present_count != BinaryCharacters.COUNT_DEFAULT ) { + bc = new BinaryCharacters( present, gained, lost, type, present_count, gained_count, lost_count ); + } + else { + bc = new BinaryCharacters( present, gained, lost, type ); + } + return bc; + } + + private void parseCharacters( final SortedSet present, final XmlElement child_element ) { + for( int j = 0; j < child_element.getNumberOfChildElements(); ++j ) { + final XmlElement child_child_element = child_element.getChildElement( j ); + if ( child_child_element.getQualifiedName().equals( PhyloXmlMapping.BINARY_CHARACTER ) ) { + present.add( child_child_element.getValueAsString() ); + } + } + } + + public static PhylogenyDataPhyloXmlParser getInstance() { + return _instance; + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/BranchWidthParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/BranchWidthParser.java new file mode 100644 index 0000000..b65513b --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/BranchWidthParser.java @@ -0,0 +1,56 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml.data; + +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.BranchWidth; +import org.forester.phylogeny.data.PhylogenyData; + +public class BranchWidthParser implements PhylogenyDataPhyloXmlParser { + + private static final BranchWidthParser _instance; + static { + try { + _instance = new BranchWidthParser(); + } + catch ( final Throwable e ) { + throw new RuntimeException( e.getMessage() ); + } + } + + private BranchWidthParser() { + } + + @Override + public PhylogenyData parse( final XmlElement element ) throws PhylogenyParserException { + return new BranchWidth( element.getValueAsDouble() ); + } + + public static PhylogenyDataPhyloXmlParser getInstance() { + return _instance; + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/ColorParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/ColorParser.java new file mode 100644 index 0000000..02255ef --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/ColorParser.java @@ -0,0 +1,76 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. 
+// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml.data; + +import java.awt.Color; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.BranchColor; +import org.forester.phylogeny.data.PhylogenyData; + +public class ColorParser implements PhylogenyDataPhyloXmlParser { + + private static final PhylogenyDataPhyloXmlParser _instance; + static { + try { + _instance = new ColorParser(); + } + catch ( final Throwable e ) { + throw new RuntimeException( e.getMessage() ); + } + } + + private ColorParser() { + } + + @Override + public PhylogenyData parse( final XmlElement element ) throws PhylogenyParserException { + int red = 0; + int green = 0; + int blue = 0; + for( int j = 0; j < element.getNumberOfChildElements(); ++j ) { + final XmlElement c = element.getChildElement( j ); + if ( c.getQualifiedName().equals( PhyloXmlMapping.COLOR_RED ) ) { + red = c.getValueAsInt(); + } + else if ( c.getQualifiedName().equals( PhyloXmlMapping.COLOR_GREEN ) ) { + green = c.getValueAsInt(); + } + else if ( c.getQualifiedName().equals( PhyloXmlMapping.COLOR_BLUE ) ) { + blue = c.getValueAsInt(); + } + } + final BranchColor color = new BranchColor(); + color.setValue( new Color( red, green, blue ) ); + return color; + } + + public static PhylogenyDataPhyloXmlParser getInstance() { + return _instance; + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/ConfidenceParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/ConfidenceParser.java new file mode 100644 index 0000000..92ce6c7 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/ConfidenceParser.java @@ -0,0 +1,62 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
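Each of these data parsers follows the same pattern: a private constructor, an eagerly created singleton, and getInstance().parse( element ) returning a PhylogenyData object. A sketch for ColorParser, with the XmlElement tree built by hand rather than by PhyloXmlHandler; the class name is illustrative:

import org.forester.io.parsers.phyloxml.PhyloXmlMapping;
import org.forester.io.parsers.phyloxml.XmlElement;
import org.forester.io.parsers.phyloxml.data.ColorParser;
import org.forester.io.parsers.util.PhylogenyParserException;
import org.forester.phylogeny.data.PhylogenyData;

public class ColorParserSketch {

    public static void main( final String[] args ) throws PhylogenyParserException {
        // <color><red>255</red><green>0</green><blue>0</blue></color>
        final XmlElement color = new XmlElement( "", PhyloXmlMapping.COLOR, PhyloXmlMapping.COLOR, null );
        final String[][] channels = { { PhyloXmlMapping.COLOR_RED, "255" },
                                      { PhyloXmlMapping.COLOR_GREEN, "0" },
                                      { PhyloXmlMapping.COLOR_BLUE, "0" } };
        for( final String[] channel : channels ) {
            final XmlElement c = new XmlElement( "", channel[ 0 ], channel[ 0 ], null );
            c.setValue( channel[ 1 ] );
            color.addChildElement( c );
        }
        // ColorParser returns a BranchColor wrapping java.awt.Color( 255, 0, 0 ).
        final PhylogenyData data = ColorParser.getInstance().parse( color );
        System.out.println( data );
    }
}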
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml.data; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.Confidence; +import org.forester.phylogeny.data.PhylogenyData; + +public class ConfidenceParser implements PhylogenyDataPhyloXmlParser { + + private static final PhylogenyDataPhyloXmlParser _instance; + static { + try { + _instance = new ConfidenceParser(); + } + catch ( final Throwable e ) { + throw new RuntimeException( e.getMessage() ); + } + } + + private ConfidenceParser() { + } + + @Override + public PhylogenyData parse( final XmlElement element ) throws PhylogenyParserException { + final Confidence confidence = new Confidence(); + confidence.setValue( element.getValueAsDouble() ); + if ( element.isHasAttribute( PhyloXmlMapping.CONFIDENCE_TYPE_ATTR ) ) { + confidence.setType( element.getAttribute( PhyloXmlMapping.CONFIDENCE_TYPE_ATTR ) ); + } + return confidence; + } + + public static PhylogenyDataPhyloXmlParser getInstance() { + return _instance; + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/DateParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/DateParser.java new file mode 100644 index 0000000..3f5add7 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/DateParser.java @@ -0,0 +1,95 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml.data; + +import java.math.BigDecimal; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.Date; +import org.forester.phylogeny.data.PhylogenyData; +import org.forester.util.ForesterUtil; + +public class DateParser implements PhylogenyDataPhyloXmlParser { + + private static final PhylogenyDataPhyloXmlParser _instance; + static { + try { + _instance = new DateParser(); + } + catch ( final Throwable e ) { + throw new RuntimeException( e.getMessage() ); + } + } + + private DateParser() { + } + + @Override + public PhylogenyData parse( final XmlElement element ) throws PhylogenyParserException { + String unit = ""; + if ( element.isHasAttribute( PhyloXmlMapping.CLADE_DATE_UNIT ) ) { + unit = element.getAttribute( PhyloXmlMapping.CLADE_DATE_UNIT ); + } + String val = null; + String min = null; + String max = null; + String desc = ""; + for( int j = 0; j < element.getNumberOfChildElements(); ++j ) { + final XmlElement e = element.getChildElement( j ); + if ( e.getQualifiedName().equals( PhyloXmlMapping.CLADE_DATE_VALUE ) ) { + val = e.getValueAsString(); + } + else if ( e.getQualifiedName().equals( PhyloXmlMapping.CLADE_DATE_MIN ) ) { + min = e.getValueAsString(); + } + else if ( e.getQualifiedName().equals( PhyloXmlMapping.CLADE_DATE_MAX ) ) { + max = e.getValueAsString(); + } + else if ( e.getQualifiedName().equals( PhyloXmlMapping.CLADE_DATE_DESC ) ) { + desc = e.getValueAsString(); + } + } + BigDecimal val_bd = null; + BigDecimal min_bd = null; + BigDecimal max_bd = null; + if ( !ForesterUtil.isEmpty( val ) ) { + val_bd = new BigDecimal( val ); + } + if ( !ForesterUtil.isEmpty( min ) ) { + min_bd = new BigDecimal( min ); + } + if ( !ForesterUtil.isEmpty( max ) ) { + max_bd = new BigDecimal( max ); + } + return new Date( desc, val_bd, min_bd, max_bd, unit ); + } + + public static PhylogenyDataPhyloXmlParser getInstance() { + return _instance; + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/DistributionParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/DistributionParser.java new file mode 100644 index 0000000..c211106 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/DistributionParser.java @@ -0,0 +1,83 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
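DateParser, completed above, shows how attributes and child elements are combined: the unit comes from the element's "unit" attribute, while value, minimum, maximum and desc are children converted to BigDecimal where present. A hand-built sketch (illustrative class name), using SAX's AttributesImpl to supply the attribute:

import org.forester.io.parsers.phyloxml.PhyloXmlMapping;
import org.forester.io.parsers.phyloxml.XmlElement;
import org.forester.io.parsers.phyloxml.data.DateParser;
import org.forester.io.parsers.util.PhylogenyParserException;
import org.forester.phylogeny.data.PhylogenyData;
import org.xml.sax.helpers.AttributesImpl;

public class DateParserSketch {

    public static void main( final String[] args ) throws PhylogenyParserException {
        // <date unit="mya"><value>90</value><minimum>80</minimum><maximum>100</maximum></date>
        final AttributesImpl attrs = new AttributesImpl();
        attrs.addAttribute( "", PhyloXmlMapping.CLADE_DATE_UNIT, PhyloXmlMapping.CLADE_DATE_UNIT, "CDATA", "mya" );
        final XmlElement date = new XmlElement( "", PhyloXmlMapping.CLADE_DATE, PhyloXmlMapping.CLADE_DATE, attrs );
        final String[][] children = { { PhyloXmlMapping.CLADE_DATE_VALUE, "90" },
                                      { PhyloXmlMapping.CLADE_DATE_MIN, "80" },
                                      { PhyloXmlMapping.CLADE_DATE_MAX, "100" } };
        for( final String[] child : children ) {
            final XmlElement c = new XmlElement( "", child[ 0 ], child[ 0 ], null );
            c.setValue( child[ 1 ] );
            date.addChildElement( c );
        }
        // Produces an org.forester.phylogeny.data.Date with value 90, range [80, 100], unit "mya".
        final PhylogenyData parsed = DateParser.getInstance().parse( date );
        System.out.println( parsed );
    }
}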
com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml.data; + +import java.util.ArrayList; +import java.util.List; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.Distribution; +import org.forester.phylogeny.data.PhylogenyData; +import org.forester.phylogeny.data.Point; +import org.forester.phylogeny.data.Polygon; + +public class DistributionParser implements PhylogenyDataPhyloXmlParser { + + private static final PhylogenyDataPhyloXmlParser _instance; + static { + try { + _instance = new DistributionParser(); + } + catch ( final Throwable e ) { + throw new RuntimeException( e.getMessage() ); + } + } + + private DistributionParser() { + } + + @Override + public PhylogenyData parse( final XmlElement element ) throws PhylogenyParserException { + String desc = ""; + List points = null; + List polygons = null; + for( int i = 0; i < element.getNumberOfChildElements(); ++i ) { + final XmlElement child_element = element.getChildElement( i ); + if ( child_element.getQualifiedName().equals( PhyloXmlMapping.DISTRIBUTION_DESC ) ) { + desc = child_element.getValueAsString(); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.POINT ) ) { + if ( points == null ) { + points = new ArrayList(); + } + points.add( ( Point ) PointParser.getInstance().parse( child_element ) ); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.POLYGON ) ) { + if ( polygons == null ) { + polygons = new ArrayList(); + } + polygons.add( ( Polygon ) PolygonParser.getInstance().parse( child_element ) ); + } + } + return new Distribution( desc, points, polygons ); + } + + public static PhylogenyDataPhyloXmlParser getInstance() { + return _instance; + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/DomainArchitectureParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/DomainArchitectureParser.java new file mode 100644 index 0000000..34bba14 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/DomainArchitectureParser.java @@ -0,0 +1,76 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml.data; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.DomainArchitecture; +import org.forester.phylogeny.data.ProteinDomain; + +public class DomainArchitectureParser implements PhylogenyDataPhyloXmlParser { + + private static final PhylogenyDataPhyloXmlParser _instance; + static { + try { + _instance = new DomainArchitectureParser(); + } + catch ( final Throwable e ) { + throw new RuntimeException( e.getMessage() ); + } + } + + private DomainArchitectureParser() { + } + + @Override + public DomainArchitecture parse( final XmlElement element ) throws PhylogenyParserException { + final DomainArchitecture architecure = new DomainArchitecture(); + if ( !element.isHasAttribute( PhyloXmlMapping.SEQUENCE_DOMAIN_ARCHITECTURE_LENGTH ) ) { + throw new PhylogenyParserException( PhyloXmlMapping.SEQUENCE_DOMAIN_ARCHITECTURE_LENGTH + + " attribute is required for domain architecture" ); + } + final String lenght_str = element.getAttribute( PhyloXmlMapping.SEQUENCE_DOMAIN_ARCHITECTURE_LENGTH ); + try { + architecure.setTotalLength( Integer.parseInt( lenght_str ) ); + } + catch ( final NumberFormatException e ) { + throw new PhylogenyParserException( "could not extract domain architecture length from [" + lenght_str + + "]: " + e.getMessage() ); + } + for( int i = 0; i < element.getNumberOfChildElements(); ++i ) { + final XmlElement child_element = element.getChildElement( i ); + if ( child_element.getQualifiedName().equals( PhyloXmlMapping.SEQUENCE_DOMAIN_ARCHITECTURE_DOMAIN ) ) { + architecure.addDomain( ( ProteinDomain ) ProteinDomainParser.getInstance().parse( child_element ) ); + } + } + return architecure; + } + + public static PhylogenyDataPhyloXmlParser getInstance() { + return _instance; + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/EventParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/EventParser.java new file mode 100644 index 0000000..f58448e --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/EventParser.java @@ -0,0 +1,97 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml.data; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.Confidence; +import org.forester.phylogeny.data.Event; +import org.forester.phylogeny.data.PhylogenyData; +import org.forester.util.ForesterUtil; + +public class EventParser implements PhylogenyDataPhyloXmlParser { + + private static final PhylogenyDataPhyloXmlParser _instance; + static { + try { + _instance = new EventParser(); + } + catch ( final Throwable e ) { + throw new RuntimeException( e.getMessage() ); + } + } + + private EventParser() { + } + + @Override + public PhylogenyData parse( final XmlElement element ) throws PhylogenyParserException { + String type = ""; + Confidence conf = null; + int duplications = Event.DEFAULT_VALUE; + int speciations = Event.DEFAULT_VALUE; + int losses = Event.DEFAULT_VALUE; + for( int i = 0; i < element.getNumberOfChildElements(); ++i ) { + final XmlElement child_element = element.getChildElement( i ); + if ( child_element.getQualifiedName().equals( PhyloXmlMapping.EVENT_TYPE ) ) { + type = child_element.getValueAsString(); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.CONFIDENCE ) ) { + conf = ( ( Confidence ) ConfidenceParser.getInstance().parse( child_element ) ); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.EVENT_DUPLICATIONS ) ) { + duplications = child_element.getValueAsInt(); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.EVENT_SPECIATIONS ) ) { + speciations = child_element.getValueAsInt(); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.EVENT_LOSSES ) ) { + losses = child_element.getValueAsInt(); + } + } + Event event = null; + if ( ForesterUtil.isEmpty( type ) ) { + event = new Event( duplications, speciations, losses ); + } + else { + try { + event = new Event( duplications, speciations, losses, type ); + } + catch ( final Exception e ) { + throw new PhylogenyParserException( "problem with " + element.toString() + ": " + e.getMessage() ); + } + } + if ( conf != null ) { + event.setConfidence( conf ); + } + return event; + } + + public static PhylogenyDataPhyloXmlParser getInstance() { + return _instance; + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/IdentifierParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/IdentifierParser.java new file mode 100644 index 0000000..6d68234 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/IdentifierParser.java @@ -0,0 +1,67 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/ + +package org.forester.io.parsers.phyloxml.data; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.data.PhylogenyData; + +public class IdentifierParser implements PhylogenyDataPhyloXmlParser { + + final private static String TYPE = "type"; //TODO deprecated, remove, to ensure comp. with phyloxml 1.00 + private static final PhylogenyDataPhyloXmlParser _instance; + static { + try { + _instance = new IdentifierParser(); + } + catch ( final Throwable e ) { + throw new RuntimeException( e.getMessage() ); + } + } + + private IdentifierParser() { + } + + @Override + public PhylogenyData parse( final XmlElement element ) throws PhylogenyParserException { + if ( element.isHasAttribute( PhyloXmlMapping.IDENTIFIER_PROVIDER_ATTR ) ) { + return new Identifier( element.getValueAsString(), element + .getAttribute( PhyloXmlMapping.IDENTIFIER_PROVIDER_ATTR ) ); + } + else if ( element.isHasAttribute( TYPE ) ) { + return new Identifier( element.getValueAsString(), element.getAttribute( TYPE ) ); + } + else { + return new Identifier( element.getValueAsString() ); + } + } + + public static PhylogenyDataPhyloXmlParser getInstance() { + return _instance; + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/PhylogenyDataPhyloXmlParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/PhylogenyDataPhyloXmlParser.java new file mode 100644 index 0000000..d8db199 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/PhylogenyDataPhyloXmlParser.java @@ -0,0 +1,36 @@ +// $Id: +// $ +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml.data; + +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.PhylogenyData; + +public interface PhylogenyDataPhyloXmlParser { + + public PhylogenyData parse( final XmlElement element ) throws PhylogenyParserException; +} \ No newline at end of file diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/PointParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/PointParser.java new file mode 100644 index 0000000..8ac0ee5 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/PointParser.java @@ -0,0 +1,95 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml.data; + +import java.math.BigDecimal; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.PhylogenyData; +import org.forester.phylogeny.data.Point; +import org.forester.util.ForesterUtil; + +public class PointParser implements PhylogenyDataPhyloXmlParser { + + private static final PhylogenyDataPhyloXmlParser _instance; + static { + try { + _instance = new PointParser(); + } + catch ( final Throwable e ) { + throw new RuntimeException( e.getMessage() ); + } + } + + private PointParser() { + } + + @Override + public PhylogenyData parse( final XmlElement element ) throws PhylogenyParserException { + String alt_unit = ""; + String geo_datum = ""; + if ( element.isHasAttribute( PhyloXmlMapping.POINT_ALTITUDE_UNIT_ATTR ) ) { + alt_unit = element.getAttribute( PhyloXmlMapping.POINT_ALTITUDE_UNIT_ATTR ); + } + if ( element.isHasAttribute( PhyloXmlMapping.POINT_GEODETIC_DATUM ) ) { + geo_datum = element.getAttribute( PhyloXmlMapping.POINT_GEODETIC_DATUM ); + } + String lat_str = null; + String lon_str = null; + String alt_str = null; + for( int j = 0; j < element.getNumberOfChildElements(); ++j ) { + final XmlElement e = element.getChildElement( j ); + if ( e.getQualifiedName().equals( PhyloXmlMapping.POINT_LATITUDE ) ) { + lat_str = e.getValueAsString(); + } + else if ( e.getQualifiedName().equals( PhyloXmlMapping.POINT_LONGITUDE ) ) { + lon_str = e.getValueAsString(); + } + else if ( e.getQualifiedName().equals( PhyloXmlMapping.POINT_ALTITUDE ) ) { + alt_str = e.getValueAsString(); + } + } + BigDecimal lat = null; + BigDecimal lon = null; + BigDecimal alt = null; + if ( !ForesterUtil.isEmpty( lat_str ) ) { + lat = new BigDecimal( lat_str ); + } + if ( !ForesterUtil.isEmpty( lon_str ) ) { + lon = new BigDecimal( lon_str ); + } + if ( !ForesterUtil.isEmpty( alt_str ) ) { + alt = new BigDecimal( alt_str ); + } + return new Point( geo_datum, lat, lon, alt, alt_unit ); + } + + public static PhylogenyDataPhyloXmlParser getInstance() { + return _instance; + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/PolygonParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/PolygonParser.java new file mode 100644 index 0000000..6d5ff38 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/PolygonParser.java @@ -0,0 +1,68 @@ +// $Id: +// forester -- software libraries and applications +// for genomics and evolutionary biology research. +// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+//
+// Contact: phylosoft @ gmail . com
+// WWW: www.phylosoft.org/forester
+
+package org.forester.io.parsers.phyloxml.data;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.forester.io.parsers.phyloxml.PhyloXmlMapping;
+import org.forester.io.parsers.phyloxml.XmlElement;
+import org.forester.io.parsers.util.PhylogenyParserException;
+import org.forester.phylogeny.data.PhylogenyData;
+import org.forester.phylogeny.data.Point;
+import org.forester.phylogeny.data.Polygon;
+
+public class PolygonParser implements PhylogenyDataPhyloXmlParser {
+
+    private static final PhylogenyDataPhyloXmlParser _instance;
+    static {
+        try {
+            _instance = new PolygonParser();
+        }
+        catch ( final Throwable e ) {
+            throw new RuntimeException( e.getMessage() );
+        }
+    }
+
+    private PolygonParser() {
+    }
+
+    @Override
+    public PhylogenyData parse( final XmlElement element ) throws PhylogenyParserException {
+        final List<Point> points = new ArrayList<Point>();
+        for( int j = 0; j < element.getNumberOfChildElements(); ++j ) {
+            final XmlElement e = element.getChildElement( j );
+            if ( e.getQualifiedName().equals( PhyloXmlMapping.POINT ) ) {
+                points.add( ( Point ) PointParser.getInstance().parse( e ) );
+            }
+        }
+        return new Polygon( points );
+    }
+
+    public static PhylogenyDataPhyloXmlParser getInstance() {
+        return _instance;
+    }
+}
diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/PropertyParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/PropertyParser.java
new file mode 100644
index 0000000..38f4dfb
--- /dev/null
+++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/PropertyParser.java
@@ -0,0 +1,99 @@
+// $Id:
+// FORESTER -- software libraries and applications
+// for evolutionary biology research and applications.
+//
+// Copyright (C) 2008-2009 Christian M. Zmasek
+// Copyright (C) 2008-2009 Burnham Institute for Medical Research
+// All rights reserved
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+//
+// Contact: phylosoft @ gmail .
com +// WWW: www.phylosoft.org/ + +package org.forester.io.parsers.phyloxml.data; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.PhylogenyData; +import org.forester.phylogeny.data.Property; +import org.forester.phylogeny.data.Property.AppliesTo; +import org.forester.util.ForesterUtil; + +public class PropertyParser implements PhylogenyDataPhyloXmlParser { + + private static final PhylogenyDataPhyloXmlParser _instance; + static { + try { + _instance = new PropertyParser(); + } + catch ( final Throwable e ) { + throw new RuntimeException( e.getMessage() ); + } + } + + private PropertyParser() { + } + + @Override + public PhylogenyData parse( final XmlElement element ) throws PhylogenyParserException { + String ref = ""; + String value = ""; + String unit = ""; + String datatype = ""; + String applies_to_str = ""; + String id_ref = ""; + if ( element.isHasAttribute( PhyloXmlMapping.PROPERTY_REF ) ) { + ref = element.getAttribute( PhyloXmlMapping.PROPERTY_REF ); + } + if ( element.isHasAttribute( PhyloXmlMapping.PROPERTY_UNIT ) ) { + unit = element.getAttribute( PhyloXmlMapping.PROPERTY_UNIT ); + } + if ( element.isHasAttribute( PhyloXmlMapping.PROPERTY_DATATYPE ) ) { + datatype = element.getAttribute( PhyloXmlMapping.PROPERTY_DATATYPE ); + } + if ( element.isHasAttribute( PhyloXmlMapping.PROPERTY_APPLIES_TO ) ) { + applies_to_str = element.getAttribute( PhyloXmlMapping.PROPERTY_APPLIES_TO ); + } + if ( element.isHasAttribute( PhyloXmlMapping.ID_REF ) ) { + id_ref = element.getAttribute( PhyloXmlMapping.ID_REF ); + } + if ( !ForesterUtil.isEmpty( element.getValueAsString() ) ) { + value = element.getValueAsString(); + } + AppliesTo applies_to = AppliesTo.OTHER; + if ( applies_to_str.equals( AppliesTo.NODE.toString() ) ) { + applies_to = AppliesTo.NODE; + } + else if ( applies_to_str.equals( AppliesTo.PARENT_BRANCH.toString() ) ) { + applies_to = AppliesTo.PARENT_BRANCH; + } + else if ( applies_to_str.equals( AppliesTo.CLADE.toString() ) ) { + applies_to = AppliesTo.CLADE; + } + else if ( applies_to_str.equals( AppliesTo.ANNOTATION.toString() ) ) { + applies_to = AppliesTo.ANNOTATION; + } + else if ( applies_to_str.equals( AppliesTo.PHYLOGENY.toString() ) ) { + applies_to = AppliesTo.PHYLOGENY; + } + return new Property( ref, value, unit, datatype, applies_to, id_ref ); + } + + public static PhylogenyDataPhyloXmlParser getInstance() { + return _instance; + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/ProteinDomainParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/ProteinDomainParser.java new file mode 100644 index 0000000..e32096e --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/ProteinDomainParser.java @@ -0,0 +1,78 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. 
+// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml.data; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.ProteinDomain; + +public class ProteinDomainParser implements PhylogenyDataPhyloXmlParser { + + private static final PhylogenyDataPhyloXmlParser _instance; + static { + try { + _instance = new ProteinDomainParser(); + } + catch ( final Throwable e ) { + throw new RuntimeException( e.getMessage() ); + } + } + + private ProteinDomainParser() { + } + + @Override + public ProteinDomain parse( final XmlElement element ) throws PhylogenyParserException { + String name = ""; + int f = -1; + int t = -1; + double conf = ProteinDomain.CONFIDENCE_DEFAULT; + String id = ProteinDomain.IDENTIFIER_DEFAULT; + try { + f = Integer + .parseInt( element.getAttribute( PhyloXmlMapping.SEQUENCE_DOMAIN_ARCHITECTURE_PROT_DOMAIN_FROM ) ); + t = Integer.parseInt( element.getAttribute( PhyloXmlMapping.SEQUENCE_DOMAIN_ARCHITECTURE_PROT_DOMAIN_TO ) ); + conf = Double.parseDouble( element + .getAttribute( PhyloXmlMapping.SEQUENCE_DOMAIN_ARCHITECTURE_PROT_DOMAIN_CONFIDENCE ) ); + if ( element.isHasAttribute( PhyloXmlMapping.IDENTIFIER ) ) { + id = element.getAttribute( PhyloXmlMapping.IDENTIFIER ); + } + } + catch ( final Exception e ) { + throw new PhylogenyParserException( "failed to parse element [" + element + "]: " + e.getMessage() ); + } + name = element.getValueAsString(); + if ( ( f == -1 ) || ( t == -1 ) || ( conf == ProteinDomain.CONFIDENCE_DEFAULT ) ) { + throw new PhylogenyParserException( "from, to, or confidence attribute not set in: " + element ); + } + return new ProteinDomain( name, f, t, id, conf ); + } + + public static PhylogenyDataPhyloXmlParser getInstance() { + return _instance; + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/ReferenceParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/ReferenceParser.java new file mode 100644 index 0000000..cad2b26 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/ReferenceParser.java @@ -0,0 +1,75 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/ + +package org.forester.io.parsers.phyloxml.data; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.PhylogenyData; +import org.forester.phylogeny.data.Reference; +import org.forester.util.ForesterUtil; + +public class ReferenceParser implements PhylogenyDataPhyloXmlParser { + + private static final PhylogenyDataPhyloXmlParser _instance; + static { + try { + _instance = new ReferenceParser(); + } + catch ( final Throwable e ) { + throw new RuntimeException( e.getMessage() ); + } + } + + private ReferenceParser() { + } + + @Override + public PhylogenyData parse( final XmlElement element ) throws PhylogenyParserException { + String desc = ""; + String doi = ""; + if ( element.isHasAttribute( PhyloXmlMapping.REFERENCE_DOI_ATTR ) ) { + doi = element.getAttribute( PhyloXmlMapping.REFERENCE_DOI_ATTR ); + } + for( int i = 0; i < element.getNumberOfChildElements(); ++i ) { + final XmlElement child_element = element.getChildElement( i ); + if ( child_element.getQualifiedName().equals( PhyloXmlMapping.REFERENCE_DESC ) ) { + desc = child_element.getValueAsString(); + break; + } + } + if ( !ForesterUtil.isEmpty( doi ) ) { + return new Reference( desc, doi ); + } + else { + return new Reference( desc ); + } + } + + public static PhylogenyDataPhyloXmlParser getInstance() { + return _instance; + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/SequenceParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/SequenceParser.java new file mode 100644 index 0000000..f0abd4b --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/SequenceParser.java @@ -0,0 +1,99 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml.data; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.Accession; +import org.forester.phylogeny.data.Annotation; +import org.forester.phylogeny.data.DomainArchitecture; +import org.forester.phylogeny.data.Sequence; +import org.forester.phylogeny.data.Uri; + +public class SequenceParser implements PhylogenyDataPhyloXmlParser { + + private static final PhylogenyDataPhyloXmlParser _instance; + static { + try { + _instance = new SequenceParser(); + } + catch ( final Throwable e ) { + throw new RuntimeException( e.getMessage() ); + } + } + + private SequenceParser() { + } + + @Override + public Sequence parse( final XmlElement element ) throws PhylogenyParserException { + final Sequence sequence = new Sequence(); + if ( element.isHasAttribute( PhyloXmlMapping.SEQUENCE_TYPE ) ) { + sequence.setType( element.getAttribute( PhyloXmlMapping.SEQUENCE_TYPE ) ); + } + if ( element.isHasAttribute( PhyloXmlMapping.SEQUENCE_SOURCE_ID ) ) { + sequence.setSourceId( element.getAttribute( PhyloXmlMapping.SEQUENCE_SOURCE_ID ) ); + } + for( int i = 0; i < element.getNumberOfChildElements(); ++i ) { + final XmlElement child_element = element.getChildElement( i ); + if ( child_element.getQualifiedName().equals( PhyloXmlMapping.SEQUENCE_LOCATION ) ) { + sequence.setLocation( child_element.getValueAsString() ); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.SEQUENCE_NAME ) ) { + sequence.setName( child_element.getValueAsString() ); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.SEQUENCE_MOL_SEQ ) ) { + if ( child_element.isHasAttribute( PhyloXmlMapping.SEQUENCE_MOL_SEQ_ALIGNED_ATTR ) ) { + sequence.setMolecularSequenceAligned( Boolean.parseBoolean( child_element + .getAttribute( PhyloXmlMapping.SEQUENCE_MOL_SEQ_ALIGNED_ATTR ) ) ); + } + sequence.setMolecularSequence( child_element.getValueAsString() ); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.ACCESSION ) ) { + sequence.setAccession( ( Accession ) AccessionParser.getInstance().parse( child_element ) ); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.SEQUENCE_SYMBOL ) ) { + sequence.setSymbol( child_element.getValueAsString() ); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.ANNOTATION ) ) { + sequence.addAnnotation( ( Annotation ) AnnotationParser.getInstance().parse( child_element ) ); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.SEQUENCE_DOMAIN_ARCHITECURE ) ) { + sequence.setDomainArchitecture( ( DomainArchitecture ) DomainArchitectureParser.getInstance() + .parse( child_element ) ); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.URI ) ) { + sequence.addUri( ( Uri ) UriParser.getInstance().parse( child_element ) ); + } + } + return sequence; + } + + public static PhylogenyDataPhyloXmlParser getInstance() { + return _instance; + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/SequenceRelationParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/SequenceRelationParser.java new file mode 100644 index 0000000..6c0a849 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/SequenceRelationParser.java @@ -0,0 +1,91 @@ +// $Id: +// FORESTER -- software libraries and applications 
+// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml.data; + +import java.util.HashMap; +import java.util.Map; + +import org.forester.io.parsers.phyloxml.PhyloXmlHandler; +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.data.Confidence; +import org.forester.phylogeny.data.Sequence; +import org.forester.phylogeny.data.SequenceRelation; + +public class SequenceRelationParser implements PhylogenyDataPhyloXmlParser { + + private static final Map _instances = new HashMap(); + private Phylogeny _phylogeny; + + private SequenceRelationParser() { + } + + @Override + public SequenceRelation parse( final XmlElement element ) throws PhylogenyParserException { + final SequenceRelation seqRelation = new SequenceRelation(); + if ( element.isHasAttribute( PhyloXmlMapping.SEQUENCE_RELATION_TYPE ) ) { + final String sType = element.getAttribute( PhyloXmlMapping.SEQUENCE_RELATION_TYPE ); + seqRelation.setType( SequenceRelation.SEQUENCE_RELATION_TYPE.valueOf( sType ) ); + } + if ( element.isHasAttribute( PhyloXmlMapping.SEQUENCE_RELATION_ID_REF0 ) && ( _phylogeny != null ) ) { + final Sequence ref = PhyloXmlHandler.getSequenceMapByIdForPhylogeny( _phylogeny ).get( element + .getAttribute( PhyloXmlMapping.SEQUENCE_RELATION_ID_REF0 ) ); + if ( ref != null ) { + seqRelation.setRef0( ref ); + } + } + if ( element.isHasAttribute( PhyloXmlMapping.SEQUENCE_RELATION_ID_REF1 ) && ( _phylogeny != null ) ) { + final Sequence ref = PhyloXmlHandler.getSequenceMapByIdForPhylogeny( _phylogeny ).get( element + .getAttribute( PhyloXmlMapping.SEQUENCE_RELATION_ID_REF1 ) ); + if ( ref != null ) { + seqRelation.setRef1( ref ); + } + } + if ( element.isHasAttribute( PhyloXmlMapping.SEQUENCE_RELATION_DISTANCE ) ) { + seqRelation.setDistance( Double + .valueOf( element.getAttribute( PhyloXmlMapping.SEQUENCE_RELATION_DISTANCE ) ) ); + } + for( int i = 0; i < element.getNumberOfChildElements(); ++i ) { + final XmlElement child_element = element.getChildElement( i ); + if ( child_element.getQualifiedName().equals( PhyloXmlMapping.CONFIDENCE ) ) { + seqRelation.setConfidence( ( Confidence ) ConfidenceParser.getInstance().parse( child_element ) ); + } + } + return seqRelation; + } + + public static PhylogenyDataPhyloXmlParser getInstance( final Phylogeny phylogeny ) { + SequenceRelationParser instance = _instances.get( phylogeny ); + if ( 
instance == null ) { + instance = new SequenceRelationParser(); + instance._phylogeny = phylogeny; + _instances.put( phylogeny, instance ); + } + return instance; + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/TaxonomyParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/TaxonomyParser.java new file mode 100644 index 0000000..d124ad0 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/TaxonomyParser.java @@ -0,0 +1,87 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml.data; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.data.Taxonomy; +import org.forester.phylogeny.data.Uri; + +public class TaxonomyParser implements PhylogenyDataPhyloXmlParser { + + private static final PhylogenyDataPhyloXmlParser _instance; + static { + try { + _instance = new TaxonomyParser(); + } + catch ( final Throwable e ) { + throw new RuntimeException( e.getMessage() ); + } + } + + private TaxonomyParser() { + } + + public Taxonomy parse( final XmlElement element ) throws PhylogenyParserException { + final Taxonomy taxonomy = new Taxonomy(); + for( int i = 0; i < element.getNumberOfChildElements(); ++i ) { + final XmlElement child_element = element.getChildElement( i ); + if ( child_element.isHasValue() ) { + if ( child_element.getQualifiedName().equals( PhyloXmlMapping.TAXONOMY_CODE ) ) { + taxonomy.setTaxonomyCode( child_element.getValueAsString() ); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.TAXONOMY_COMMON_NAME ) ) { + taxonomy.setCommonName( child_element.getValueAsString() ); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.TAXONOMY_AUTHORITY ) ) { + taxonomy.setAuthority( child_element.getValueAsString() ); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.TAXONOMY_SYNONYM ) ) { + taxonomy.getSynonyms().add( ( child_element.getValueAsString() ) ); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.IDENTIFIER ) ) { + taxonomy.setIdentifier( ( Identifier ) IdentifierParser.getInstance().parse( child_element ) ); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.TAXONOMY_RANK ) ) { + taxonomy.setRank( child_element.getValueAsString() ); + } + else if ( 
child_element.getQualifiedName().equals( PhyloXmlMapping.TAXONOMY_SCIENTIFIC_NAME ) ) { + taxonomy.setScientificName( child_element.getValueAsString() ); + } + else if ( child_element.getQualifiedName().equals( PhyloXmlMapping.URI ) ) { + taxonomy.addUri( ( Uri ) UriParser.getInstance().parse( child_element ) ); + } + } + } + return taxonomy; + } + + public static PhylogenyDataPhyloXmlParser getInstance() { + return _instance; + } +} diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/data/UriParser.java b/forester/java/src/org/forester/io/parsers/phyloxml/data/UriParser.java new file mode 100644 index 0000000..57c106e --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/phyloxml/data/UriParser.java @@ -0,0 +1,75 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.phyloxml.data; + +import java.net.URI; +import java.net.URISyntaxException; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.PhylogenyData; +import org.forester.phylogeny.data.Uri; + +public class UriParser implements PhylogenyDataPhyloXmlParser { + + private static final PhylogenyDataPhyloXmlParser _instance; + static { + try { + _instance = new UriParser(); + } + catch ( final Throwable e ) { + throw new RuntimeException( e.getMessage() ); + } + } + + private UriParser() { + } + + @Override + public PhylogenyData parse( final XmlElement element ) throws PhylogenyParserException { + String type = ""; + String desc = ""; + URI uri = null; + try { + uri = new URI( element.getValueAsString() ); + } + catch ( final URISyntaxException e ) { + throw new PhylogenyParserException( "ill formatted Uri: " + element.getValueAsString() ); + } + if ( element.isHasAttribute( PhyloXmlMapping.URI_DESC_ATTR ) ) { + desc = element.getAttribute( PhyloXmlMapping.URI_DESC_ATTR ); + } + if ( element.isHasAttribute( PhyloXmlMapping.TYPE_ATTR ) ) { + type = element.getAttribute( PhyloXmlMapping.TYPE_ATTR ); + } + return new Uri( uri, desc, type ); + } + + public static PhylogenyDataPhyloXmlParser getInstance() { + return _instance; + } +} diff --git a/forester/java/src/org/forester/io/parsers/tol/TolParser.java b/forester/java/src/org/forester/io/parsers/tol/TolParser.java new file mode 100644 index 0000000..8968885 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/tol/TolParser.java @@ -0,0 +1,286 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.tol; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.StringReader; +import java.util.Enumeration; +import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; +import java.util.zip.ZipInputStream; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.Phylogeny; +import org.forester.util.ForesterUtil; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.SAXNotRecognizedException; +import org.xml.sax.SAXNotSupportedException; +import org.xml.sax.SAXParseException; +import org.xml.sax.XMLReader; +import org.xml.sax.helpers.DefaultHandler; + +public class TolParser implements PhylogenyParser { + + final public static String JAXP_SCHEMA_LANGUAGE = "http://java.sun.com/xml/jaxp/properties/schemaLanguage"; + final public static String W3C_XML_SCHEMA = "http://www.w3.org/2001/XMLSchema"; + final public static String JAXP_SCHEMA_SOURCE = "http://java.sun.com/xml/jaxp/properties/schemaSource"; + final public static String SAX_FEATURES_VALIDATION = "http://xml.org/sax/features/validation"; + final public static String APACHE_FEATURES_VALIDATION_SCHEMA = "http://apache.org/xml/features/validation/schema"; + final public static String APACHE_FEATURES_VALIDATION_SCHEMA_FULL = "http://apache.org/xml/features/validation/schema-full-checking"; + final public static String APACHE_PROPERTIES_SCHEMA_EXTERNAL_LOCATION = "http://apache.org/xml/properties/schema/external-schemaLocation"; + private Object _source; + private boolean _valid; + private boolean _zipped_inputstream; + private int _error_count; + private int _warning_count; + private String _schema_location; + private StringBuffer _error_messages; + private StringBuffer _warning_messages; + + public TolParser() { + init(); + reset(); + } + + public int getErrorCount() { + return _error_count; + } + + public StringBuffer getErrorMessages() { + return _error_messages; + } + + private Reader getReaderFromZipFile() throws IOException { + Reader reader = null; + final ZipFile zip_file = new ZipFile( getSource().toString() ); + final Enumeration zip_file_entries = zip_file.entries(); + while ( zip_file_entries.hasMoreElements() ) { + final ZipEntry zip_file_entry = ( ZipEntry ) zip_file_entries.nextElement(); + if ( !zip_file_entry.isDirectory() && ( zip_file_entry.getSize() > 0 ) ) { + final InputStream is = zip_file.getInputStream( zip_file_entry ); + reader = new InputStreamReader( is ); + break; + } + } + return reader; + } + + private String getSchemaLocation() { + return _schema_location; + } + + private Object getSource() { + return _source; + } + + public int getWarningCount() { + return _warning_count; + } + + public StringBuffer getWarningMessages() { + return _warning_messages; + } + + private void init() { + setZippedInputstream( false ); + } + + public boolean isValid() { + return _valid; + } + + private boolean isZippedInputstream() { + return _zipped_inputstream; + } + + public Phylogeny[] parse() throws IOException, PhylogenyParserException { + reset(); + final TolXmlHandler handler = new TolXmlHandler(); + final SAXParserFactory factory = SAXParserFactory.newInstance(); + 
factory.setNamespaceAware( true ); + try { + if ( !ForesterUtil.isEmpty( getSchemaLocation() ) ) { + factory.setFeature( SAX_FEATURES_VALIDATION, true ); + factory.setFeature( APACHE_FEATURES_VALIDATION_SCHEMA, true ); + factory.setFeature( APACHE_FEATURES_VALIDATION_SCHEMA_FULL, true ); + } + } + catch ( final SAXNotRecognizedException e ) { + e.printStackTrace(); + throw new PhylogenyParserException( "sax not recognized exception: " + e.getMessage() ); + } + catch ( final SAXNotSupportedException e ) { + e.printStackTrace(); + throw new PhylogenyParserException( "sax not supported exception: " + e.getMessage() ); + } + catch ( final ParserConfigurationException e ) { + e.printStackTrace(); + throw new PhylogenyParserException( "parser _configuration exception: " + e.getMessage() ); + } + catch ( final Exception e ) { + e.printStackTrace(); + throw new PhylogenyParserException( "error while configuring sax parser: " + e.getMessage() ); + } + try { + final SAXParser parser = factory.newSAXParser(); + if ( !ForesterUtil.isEmpty( getSchemaLocation() ) ) { + parser.setProperty( JAXP_SCHEMA_LANGUAGE, W3C_XML_SCHEMA ); + parser.setProperty( JAXP_SCHEMA_SOURCE, getSchemaLocation() ); + parser.setProperty( APACHE_PROPERTIES_SCHEMA_EXTERNAL_LOCATION, getSchemaLocation() ); + } + final XMLReader xml_reader = parser.getXMLReader(); + xml_reader.setContentHandler( handler ); + xml_reader.setErrorHandler( new TolParserErrorHandler() ); + if ( getSource() instanceof File ) { + if ( !getSource().toString().toLowerCase().endsWith( ".zip" ) ) { + xml_reader.parse( new InputSource( new FileReader( ( File ) getSource() ) ) ); + } + else { + final Reader reader = getReaderFromZipFile(); + if ( reader == null ) { + throw new PhylogenyParserException( "Zip file \"" + getSource() + + "\" appears not to contain any entries" ); + } + xml_reader.parse( new InputSource( reader ) ); + } + } + else if ( getSource() instanceof InputSource ) { + xml_reader.parse( ( InputSource ) getSource() ); + } + else if ( getSource() instanceof InputStream ) { + if ( !isZippedInputstream() ) { + final InputStream is = ( InputStream ) getSource(); + final Reader reader = new InputStreamReader( is ); + xml_reader.parse( new InputSource( reader ) ); + } + else { + final ZipInputStream zip_is = new ZipInputStream( ( InputStream ) getSource() ); + zip_is.getNextEntry(); + final Reader reader = new InputStreamReader( zip_is ); + if ( reader == null ) { + throw new PhylogenyParserException( "Zip input stream \"" + getSource() + + "\" appears not to contain any data" ); + } + xml_reader.parse( new InputSource( reader ) ); + } + } + else if ( getSource() instanceof String ) { + final File file = new File( getSource().toString() ); + final Reader reader = new FileReader( file ); + xml_reader.parse( new InputSource( reader ) ); + } + else if ( getSource() instanceof StringBuffer ) { + final StringReader string_reader = new StringReader( getSource().toString() ); + xml_reader.parse( new InputSource( string_reader ) ); + } + else { + throw new PhylogenyParserException( "attempt to parse object of unsupported type: \"" + + getSource().getClass() + "\"" ); + } + } + catch ( final SAXException sax_exception ) { + throw new PhylogenyParserException( "Failed to parse [" + getSource() + "]: " + sax_exception.getMessage() ); + } + catch ( final ParserConfigurationException parser_config_exception ) { + throw new PhylogenyParserException( "Failed to parse [" + getSource() + + "] Problem with xml parser _configuration: " + 
                    parser_config_exception.getMessage() );
+        }
+        catch ( final IOException e ) {
+            throw new PhylogenyParserException( "Problem with input source [" + getSource() + "]: \n" + e.getMessage() );
+        }
+        catch ( final Exception e ) {
+            e.printStackTrace();
+            throw new PhylogenyParserException( "Failed to parse [" + getSource() + "]: " + e.getMessage() );
+        }
+        catch ( final Error err ) {
+            err.printStackTrace();
+            throw new PhylogenyParserException( "Severe error: " + err.getMessage() );
+        }
+        final Phylogeny[] ps = new Phylogeny[ handler.getPhylogenies().size() ];
+        int i = 0;
+        for( final Phylogeny phylogeny : handler.getPhylogenies() ) {
+            ps[ i++ ] = phylogeny;
+        }
+        return ps;
+    }
+
+    private void reset() {
+        _valid = true;
+        _error_count = 0;
+        _warning_count = 0;
+        _error_messages = new StringBuffer();
+        _warning_messages = new StringBuffer();
+    }
+
+    public void setSource( final Object source ) {
+        _source = source;
+    }
+
+    public void setValidateAgainstSchema( final String schema_location ) {
+        _schema_location = schema_location;
+    }
+
+    public void setZippedInputstream( final boolean zipped_inputstream ) {
+        _zipped_inputstream = zipped_inputstream;
+    }
+
+    private class TolParserErrorHandler extends DefaultHandler {
+
+        @Override
+        public void error( final SAXParseException e ) {
+            ++_error_count;
+            _valid = false;
+            throw new RuntimeException( "XML error at line " + e.getLineNumber() + ": \n" + e.getMessage() );
+        }
+
+        @Override
+        public void fatalError( final SAXParseException e ) {
+            ++_error_count;
+            _valid = false;
+            throw new RuntimeException( "Fatal XML error at line " + e.getLineNumber() + ": \n" + e.getMessage() );
+        }
+
+        @Override
+        public void warning( final SAXParseException e ) {
+            ++_warning_count;
+            if ( _warning_messages.length() > 1 ) {
+                _warning_messages.append( ForesterUtil.LINE_SEPARATOR );
+            }
+            _warning_messages.append( "[line: " + e.getLineNumber() + "] " + e.getMessage() );
+        }
+    }
+}
\ No newline at end of file
diff --git a/forester/java/src/org/forester/io/parsers/tol/TolXmlHandler.java b/forester/java/src/org/forester/io/parsers/tol/TolXmlHandler.java
new file mode 100644
index 0000000..c3b452d
--- /dev/null
+++ b/forester/java/src/org/forester/io/parsers/tol/TolXmlHandler.java
@@ -0,0 +1,318 @@
+// $Id:
+// FORESTER -- software libraries and applications
+// for evolutionary biology research and applications.
+//
+// Copyright (C) 2008-2009 Christian M. Zmasek
+// Copyright (C) 2008-2009 Burnham Institute for Medical Research
+// All rights reserved
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+//
+// Contact: phylosoft @ gmail .
com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.tol; + +import java.util.ArrayList; +import java.util.List; + +import org.forester.io.parsers.phyloxml.XmlElement; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.data.Taxonomy; +import org.forester.util.FailedConditionCheckException; +import org.forester.util.ForesterConstants; +import org.forester.util.ForesterUtil; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +public final class TolXmlHandler extends DefaultHandler { + + private String _current_element_name; + private Phylogeny _current_phylogeny; + private List _phylogenies; + private XmlElement _current_xml_element; + private PhylogenyNode _current_node; + private final static StringBuffer _buffer = new StringBuffer(); + + TolXmlHandler() { + // Constructor. + } + + private void addNode() { + final PhylogenyNode new_node = new PhylogenyNode(); + getCurrentNode().addAsChild( new_node ); + setCurrentNode( new_node ); + } + + @Override + public void characters( final char[] chars, final int start_index, final int end_index ) { + if ( ( ( getCurrentXmlElement() != null ) && ( getCurrentElementName() != null ) ) + && !getCurrentElementName().equals( TolXmlMapping.CLADE ) + && !getCurrentElementName().equals( TolXmlMapping.PHYLOGENY ) ) { + getCurrentXmlElement().setValue( new String( chars, start_index, end_index ).trim() ); + } + } + + @Override + public void endElement( final String namespace_uri, final String local_name, final String qualified_name ) + throws SAXException { + if ( ForesterUtil.isEmpty( namespace_uri ) || namespace_uri.startsWith( ForesterConstants.PHYLO_XML_LOCATION ) ) { + if ( local_name.equals( TolXmlMapping.CLADE ) ) { + try { + TolXmlHandler.mapElementToPhylogenyNode( getCurrentXmlElement(), getCurrentNode() ); + if ( !getCurrentNode().isRoot() ) { + setCurrentNode( getCurrentNode().getParent() ); + } + setCurrentXmlElement( getCurrentXmlElement().getParent() ); + } + catch ( final PhylogenyParserException ex ) { + throw new SAXException( ex.getMessage() ); + } + } + else if ( local_name.equals( TolXmlMapping.PHYLOGENY ) ) { + try { + TolXmlHandler.mapElementToPhylogeny( getCurrentXmlElement(), getCurrentPhylogeny() ); + } + catch ( final PhylogenyParserException ex ) { + throw new SAXException( ex.getMessage() ); + } + finishPhylogeny(); + reset(); + } + else if ( ( getCurrentPhylogeny() != null ) && ( getCurrentXmlElement().getParent() != null ) ) { + setCurrentXmlElement( getCurrentXmlElement().getParent() ); + } + setCurrentElementName( null ); + } + } + + private void finishPhylogeny() throws SAXException { + getCurrentPhylogeny().setRooted( true ); + getCurrentPhylogeny().recalculateNumberOfExternalDescendants( false ); + getPhylogenies().add( getCurrentPhylogeny() ); + } + + private String getCurrentElementName() { + return _current_element_name; + } + + private PhylogenyNode getCurrentNode() { + return _current_node; + } + + private Phylogeny getCurrentPhylogeny() { + return _current_phylogeny; + } + + private XmlElement getCurrentXmlElement() { + return _current_xml_element; + } + + List getPhylogenies() { + return _phylogenies; + } + + private void init() { + reset(); + setPhylogenies( new ArrayList() ); + } + + private void initCurrentNode() { + if ( getCurrentNode() != null ) { + throw 
new FailedConditionCheckException( "attempt to create new current node when current node already exists" ); + } + if ( getCurrentPhylogeny() == null ) { + throw new FailedConditionCheckException( "attempt to create new current node for non-existing phylogeny" ); + } + final PhylogenyNode node = new PhylogenyNode(); + getCurrentPhylogeny().setRoot( node ); + setCurrentNode( getCurrentPhylogeny().getRoot() ); + } + + private void newClade() { + if ( getCurrentNode() == null ) { + initCurrentNode(); + } + else { + addNode(); + } + } + + private void newPhylogeny() { + setCurrentPhylogeny( new Phylogeny() ); + } + + private void reset() { + setCurrentPhylogeny( null ); + setCurrentNode( null ); + setCurrentElementName( null ); + setCurrentXmlElement( null ); + } + + private void setCurrentElementName( final String element_name ) { + _current_element_name = element_name; + } + + private void setCurrentNode( final PhylogenyNode current_node ) { + _current_node = current_node; + } + + private void setCurrentPhylogeny( final Phylogeny phylogeny ) { + _current_phylogeny = phylogeny; + } + + private void setCurrentXmlElement( final XmlElement element ) { + _current_xml_element = element; + } + + private void setPhylogenies( final List phylogenies ) { + _phylogenies = phylogenies; + } + + @Override + public void startDocument() throws SAXException { + init(); + } + + @Override + public void startElement( final String namespace_uri, + final String local_name, + final String qualified_name, + final Attributes attributes ) throws SAXException { + setCurrentElementName( local_name ); + if ( local_name.equals( TolXmlMapping.CLADE ) ) { + final XmlElement element = new XmlElement( namespace_uri, local_name, local_name, attributes ); + getCurrentXmlElement().addChildElement( element ); + setCurrentXmlElement( element ); + newClade(); + } + else if ( local_name.equals( TolXmlMapping.PHYLOGENY ) ) { + setCurrentXmlElement( new XmlElement( "", "", "", null ) ); + newPhylogeny(); + } + else if ( getCurrentPhylogeny() != null ) { + final XmlElement element = new XmlElement( namespace_uri, local_name, local_name, attributes ); + getCurrentXmlElement().addChildElement( element ); + setCurrentXmlElement( element ); + } + } + + public static boolean attributeEqualsValue( final XmlElement element, + final String attributeName, + final String attributeValue ) { + final String attr = element.getAttribute( attributeName ); + return ( ( attr != null ) && attr.equals( attributeValue ) ); + } + + public static String getAtttributeValue( final XmlElement element, final String attributeName ) { + final String attr = element.getAttribute( attributeName ); + if ( attr != null ) { + return attr; + } + else { + return ""; + } + } + + private static void mapElementToPhylogeny( final XmlElement xml_element, final Phylogeny phylogeny ) + throws PhylogenyParserException { + // Not needed for now. + } + + private static void mapElementToPhylogenyNode( final XmlElement xml_element, final PhylogenyNode node ) + throws PhylogenyParserException { + if ( xml_element.isHasAttribute( TolXmlMapping.NODE_ID_ATTR ) ) { + final String id = xml_element.getAttribute( TolXmlMapping.NODE_ID_ATTR ); + if ( !ForesterUtil.isEmpty( id ) ) { + if ( !node.getNodeData().isHasTaxonomy() ) { + node.getNodeData().setTaxonomy( new Taxonomy() ); + } + node.getNodeData().getTaxonomy() + .setIdentifier( new Identifier( id, TolXmlMapping.TOL_TAXONOMY_ID_TYPE ) ); + } + } + final boolean put_into_scientific_name = true; // Allways put into scientific name. 
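+        // The child elements NAME, AUTHORITY, AUTHDATE and OTHERNAMES are mapped onto the
+        // node's Taxonomy in the loop that follows; the Taxonomy object is created lazily.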
+        //        if ( xml_element.isHasAttribute( TolXmlMapping.NODE_ITALICIZENAME_ATTR ) ) {
+        //            final String ital = xml_element.getAttribute( TolXmlMapping.NODE_ITALICIZENAME_ATTR );
+        //            if ( !ForesterUtil.isEmpty( ital ) && ital.equals( "1" ) ) {
+        //                put_into_scientific_name = true;
+        //            }
+        //        }
+        for( int i = 0; i < xml_element.getNumberOfChildElements(); ++i ) {
+            final XmlElement element = xml_element.getChildElement( i );
+            final String qualified_name = element.getQualifiedName();
+            if ( qualified_name.equals( TolXmlMapping.TAXONOMY_NAME ) ) {
+                final String name = element.getValueAsString();
+                if ( !ForesterUtil.isEmpty( name ) ) {
+                    if ( !node.getNodeData().isHasTaxonomy() ) {
+                        node.getNodeData().setTaxonomy( new Taxonomy() );
+                    }
+                    if ( put_into_scientific_name ) {
+                        node.getNodeData().getTaxonomy().setScientificName( name );
+                    }
+                    else {
+                        node.getNodeData().getTaxonomy().setCommonName( name );
+                    }
+                }
+            }
+            else if ( qualified_name.equals( TolXmlMapping.AUTHORITY ) ) {
+                String auth = element.getValueAsString();
+                if ( !ForesterUtil.isEmpty( auth ) && !auth.equalsIgnoreCase( "null" ) ) {
+                    if ( !node.getNodeData().isHasTaxonomy() ) {
+                        node.getNodeData().setTaxonomy( new Taxonomy() );
+                    }
+                    auth = auth.replaceAll( "&amp;", "&" );
+                    node.getNodeData().getTaxonomy().setAuthority( auth );
+                }
+            }
+            else if ( qualified_name.equals( TolXmlMapping.AUTHDATE ) ) {
+                final String authdate = element.getValueAsString();
+                if ( !ForesterUtil.isEmpty( authdate ) && !authdate.equalsIgnoreCase( "null" ) ) {
+                    if ( node.getNodeData().isHasTaxonomy()
+                            && !ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getAuthority() ) ) {
+                        _buffer.setLength( 0 );
+                        _buffer.append( node.getNodeData().getTaxonomy().getAuthority() );
+                        _buffer.append( " " );
+                        _buffer.append( authdate );
+                        node.getNodeData().getTaxonomy().setAuthority( _buffer.toString() );
+                    }
+                }
+            }
+            else if ( qualified_name.equals( TolXmlMapping.OTHERNAMES ) ) {
+                for( int j = 0; j < element.getNumberOfChildElements(); ++j ) {
+                    final XmlElement element_j = element.getChildElement( j );
+                    if ( element_j.getQualifiedName().equals( TolXmlMapping.OTHERNAME ) ) {
+                        for( int z = 0; z < element_j.getNumberOfChildElements(); ++z ) {
+                            final XmlElement element_z = element_j.getChildElement( z );
+                            if ( element_z.getQualifiedName().equals( TolXmlMapping.OTHERNAME_NAME ) ) {
+                                final String syn = element_z.getValueAsString();
+                                if ( !ForesterUtil.isEmpty( syn ) && !syn.equalsIgnoreCase( "null" ) ) {
+                                    if ( !node.getNodeData().isHasTaxonomy() ) {
+                                        node.getNodeData().setTaxonomy( new Taxonomy() );
+                                    }
+                                    node.getNodeData().getTaxonomy().getSynonyms().add( syn );
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/forester/java/src/org/forester/io/parsers/tol/TolXmlMapping.java b/forester/java/src/org/forester/io/parsers/tol/TolXmlMapping.java
new file mode 100644
index 0000000..bb7732d
--- /dev/null
+++ b/forester/java/src/org/forester/io/parsers/tol/TolXmlMapping.java
@@ -0,0 +1,47 @@
+// $Id:
+// FORESTER -- software libraries and applications
+// for evolutionary biology research and applications.
+//
+// Copyright (C) 2008-2009 Christian M.
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.tol; + +public final class TolXmlMapping { + + public static final String PHYLOGENY = "TREE"; + public static final String CLADE = "NODE"; + public static final String AUTHDATE = "AUTHDATE"; + public static final String AUTHORITY = "AUTHORITY"; + public static final String TAXONOMY_NAME = "NAME"; + public static final String OTHERNAMES = "OTHERNAMES"; + public static final String OTHERNAME = "OTHERNAME"; + public static final String OTHERNAME_NAME = "NAME"; + public static final String NODE_ID_ATTR = "ID"; + public static final String NODE_ITALICIZENAME_ATTR = "ITALICIZENAME"; + public static final String TOL_TAXONOMY_ID_TYPE = "tol"; + + private TolXmlMapping() { + // Hidden. + } +} \ No newline at end of file diff --git a/forester/java/src/org/forester/io/parsers/util/ParserUtils.java b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java new file mode 100644 index 0000000..09d5b24 --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/util/ParserUtils.java @@ -0,0 +1,73 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/ + +package org.forester.io.parsers.util; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.StringReader; + +public final class ParserUtils { + + public static BufferedReader createReader( final Object source ) throws IOException, FileNotFoundException { + BufferedReader reader = null; + if ( ( source instanceof File ) || ( source instanceof String ) ) { + File f = null; + if ( source instanceof File ) { + f = ( File ) source; + } + else { + f = new File( ( String ) source ); + } + if ( !f.exists() ) { + throw new IOException( "[" + f.getAbsolutePath() + "] does not exist" ); + } + else if ( !f.isFile() ) { + throw new IOException( "[" + f.getAbsolutePath() + "] is not a file" ); + } + else if ( !f.canRead() ) { + throw new IOException( "[" + f.getAbsolutePath() + "] is not readable" ); + } + reader = new BufferedReader( new FileReader( f ) ); + } + else if ( source instanceof InputStream ) { + reader = new BufferedReader( new InputStreamReader( ( InputStream ) source ) ); + } + else if ( ( source instanceof StringBuffer ) || ( source instanceof StringBuilder ) ) { + reader = new BufferedReader( new StringReader( source.toString() ) ); + } + else { + throw new IllegalArgumentException( "attempt to parse object of type [" + source.getClass() + + "] (can only parse objects of type File/String, InputStream, StringBuffer, or StringBuilder)" ); + } + return reader; + } +} diff --git a/forester/java/src/org/forester/io/parsers/util/PhylogenyParserException.java b/forester/java/src/org/forester/io/parsers/util/PhylogenyParserException.java new file mode 100644 index 0000000..e15472f --- /dev/null +++ b/forester/java/src/org/forester/io/parsers/util/PhylogenyParserException.java @@ -0,0 +1,53 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail .
com +// WWW: www.phylosoft.org/forester + +package org.forester.io.parsers.util; + +import java.io.IOException; + +/* + * @author Christian Zmasek + */ +public class PhylogenyParserException extends IOException { + + /** + * + */ + private static final long serialVersionUID = -4810333295377881086L; + + /** + * + */ + public PhylogenyParserException() { + super(); + } + + /** + * @param message + */ + public PhylogenyParserException( final String message ) { + super( message ); + } +} diff --git a/forester/java/src/org/forester/io/writers/PhyloXmlNodeWriter.java b/forester/java/src/org/forester/io/writers/PhyloXmlNodeWriter.java new file mode 100644 index 0000000..586a709 --- /dev/null +++ b/forester/java/src/org/forester/io/writers/PhyloXmlNodeWriter.java @@ -0,0 +1,59 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2000-2009 Christian M. Zmasek +// Copyright (C) 2007-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail .
com +// WWW: www.phylosoft.org/forester + +package org.forester.io.writers; + +import java.io.IOException; +import java.io.Writer; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.PhyloXmlUtil; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.PhylogenyDataUtil; +import org.forester.util.ForesterUtil; + +public class PhyloXmlNodeWriter { + + public static void toPhyloXml( final Writer w, final PhylogenyNode node, final int level, final String indentation ) + throws IOException { + String ind = ""; + if ( indentation.length() > 0 ) { + ind = indentation + PhylogenyWriter.PHYLO_XML_INTENDATION_BASE; + } + if ( !ForesterUtil.isEmpty( node.getName() ) ) { + PhylogenyDataUtil.appendElement( w, PhyloXmlMapping.NODE_NAME, node.getName(), indentation ); + } + if ( node.getDistanceToParent() != PhylogenyNode.DISTANCE_DEFAULT ) { + PhylogenyDataUtil.appendElement( w, PhyloXmlMapping.BRANCH_LENGTH, String.valueOf( ForesterUtil.round( node + .getDistanceToParent(), PhyloXmlUtil.ROUNDING_DIGITS_FOR_PHYLOXML_DOUBLE_OUTPUT ) ), indentation ); + } + if ( node.getBranchData() != null ) { + node.getBranchData().toPhyloXML( w, level, ind ); + } + if ( node.getNodeData() != null ) { + node.getNodeData().toPhyloXML( w, level, ind ); + } + } +} diff --git a/forester/java/src/org/forester/io/writers/PhylogenyWriter.java b/forester/java/src/org/forester/io/writers/PhylogenyWriter.java new file mode 100644 index 0000000..630f057 --- /dev/null +++ b/forester/java/src/org/forester/io/writers/PhylogenyWriter.java @@ -0,0 +1,761 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.io.writers; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.io.Writer; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Stack; + +import org.forester.io.parsers.nexus.NexusConstants; +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.PhylogenyDataUtil; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.phylogeny.iterators.PostOrderStackObject; +import org.forester.util.ForesterConstants; +import org.forester.util.ForesterUtil; + +public final class PhylogenyWriter { + + public final static boolean INDENT_PHYLOXML_DEAFULT = true; + public final static String PHYLO_XML_INTENDATION_BASE = " "; + public final static String PHYLO_XML_VERSION_ENCODING_LINE = ""; + public final static String PHYLO_XML_NAMESPACE_LINE = ""; + public final static String PHYLO_XML_END = ""; + private boolean _saw_comma; + private StringBuffer _buffer; + private Writer _writer; + private PhylogenyNode _root; + private boolean _has_next; + private Stack _stack; + private boolean _simple_nh; + private boolean _nh_write_distance_to_parent; + private boolean _indent_phyloxml; + private int _node_level; + private int _phyloxml_level; + private FORMAT _format; + + public PhylogenyWriter() { + setIndentPhyloxml( INDENT_PHYLOXML_DEAFULT ); + } + + private void appendPhylogenyLevelPhyloXml( final Writer writer, final Phylogeny tree ) throws IOException { + final String indentation = new String(); + if ( !ForesterUtil.isEmpty( tree.getName() ) ) { + PhylogenyDataUtil.appendElement( writer, PhyloXmlMapping.PHYLOGENY_NAME, tree.getName(), indentation ); + } + if ( tree.getIdentifier() != null ) { + if ( ForesterUtil.isEmpty( tree.getIdentifier().getProvider() ) ) { + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.IDENTIFIER, + tree.getIdentifier().getValue(), + indentation ); + } + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.IDENTIFIER, + tree.getIdentifier().getValue(), + PhyloXmlMapping.IDENTIFIER_PROVIDER_ATTR, + tree.getIdentifier().getProvider(), + indentation ); + } + if ( !ForesterUtil.isEmpty( tree.getDescription() ) ) { + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.PHYLOGENY_DESCRIPTION, + tree.getDescription(), + indentation ); + } + if ( tree.getConfidence() != null ) { + if ( ForesterUtil.isEmpty( tree.getConfidence().getType() ) ) { + PhylogenyDataUtil.appendElement( writer, PhyloXmlMapping.CONFIDENCE, tree.getConfidence().getValue() + + "", indentation ); + } + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.CONFIDENCE, + tree.getConfidence().getValue() + "", + PhyloXmlMapping.CONFIDENCE_TYPE_ATTR, + tree.getConfidence().getType(), + indentation ); + } + } + + private StringBuffer createIndentation() { + if ( !isIndentPhyloxml() ) { + return null; + } + final StringBuffer sb = new StringBuffer( getNodeLevel() * 2 ); + for( int i = 0; i < getNodeLevel(); ++i ) { + sb.append( PhylogenyWriter.PHYLO_XML_INTENDATION_BASE ); + } + return sb; + } + + private void decreaseNodeLevel() { + --_node_level; + } + + private StringBuffer getBuffer() { + return _buffer; + } + + private int getNodeLevel() { + return _node_level; + } + + private StringBuffer getOutput( 
final Phylogeny tree ) throws IOException { + if ( getOutputFormt() == FORMAT.PHYLO_XML ) { + throw new RuntimeException( "method inappropriately called" ); + } + if ( tree != null ) { + reset( tree ); + while ( isHasNext() ) { + next(); + } + if ( getOutputFormt() == FORMAT.NH ) { + getBuffer().append( ';' ); + } + return getBuffer(); + } + else { + return new StringBuffer( 0 ); + } + } + + private FORMAT getOutputFormt() { + return _format; + } + + private int getPhyloXmlLevel() { + return _phyloxml_level; + } + + private PhylogenyNode getRoot() { + return _root; + } + + private Stack getStack() { + return _stack; + } + + private Writer getWriter() { + return _writer; + } + + private void increaseNodeLevel() { + ++_node_level; + } + + private boolean isHasNext() { + return _has_next; + } + + private boolean isIndentPhyloxml() { + return _indent_phyloxml; + } + + private boolean isSawComma() { + return _saw_comma; + } + + private boolean isSimpleNH() { + return _simple_nh; + } + + private boolean isWriteDistanceToParentInNH() { + return _nh_write_distance_to_parent; + } + + private void next() throws IOException { + while ( true ) { + final PostOrderStackObject si = getStack().pop(); + final PhylogenyNode node = si.getNode(); + final int phase = si.getPhase(); + if ( phase > node.getNumberOfDescendants() ) { + setHasNext( node != getRoot() ); + if ( ( getOutputFormt() != FORMAT.PHYLO_XML ) || node.isExternal() ) { + if ( !node.isRoot() && node.isFirstChildNode() ) { + increaseNodeLevel(); + } + if ( getOutputFormt() == FORMAT.PHYLO_XML ) { + writeNode( node, createIndentation() ); + } + else { + writeNode( node, null ); + } + } + if ( !node.isRoot() ) { + if ( !node.isLastChildNode() ) { + writeCladeSeparator(); + } + else { + writeCloseClade(); + } + } + return; + } + else { + getStack().push( new PostOrderStackObject( node, ( phase + 1 ) ) ); + if ( node.isInternal() ) { + getStack().push( new PostOrderStackObject( node.getChildNode( phase - 1 ), 1 ) ); + writeOpenClade( node ); + if ( getOutputFormt() == FORMAT.PHYLO_XML ) { + if ( phase == 1 ) { + writeNode( node, createIndentation() ); + } + } + } + } + } + } + + private void reset( final Phylogeny tree ) { + setBuffer( new StringBuffer() ); + setWriter( null ); + setSawComma( false ); + setHasNext( true ); + setRoot( tree.getRoot() ); + setStack( new Stack() ); + getStack().push( new PostOrderStackObject( tree.getRoot(), 1 ) ); + setNodeLevel( 1 ); + } + + private void reset( final Writer writer, final Phylogeny tree ) { + setBuffer( null ); + setWriter( writer ); + setSawComma( false ); + setHasNext( true ); + setRoot( tree.getRoot() ); + setStack( new Stack() ); + getStack().push( new PostOrderStackObject( tree.getRoot(), 1 ) ); + setNodeLevel( 1 ); + } + + private void setBuffer( final StringBuffer buffer ) { + _buffer = buffer; + } + + private void setHasNext( final boolean has_next ) { + _has_next = has_next; + } + + public void setIndentPhyloxml( final boolean indent_phyloxml ) { + _indent_phyloxml = indent_phyloxml; + } + + private void setNodeLevel( final int level ) { + _node_level = level; + } + + private void setOutputFormt( final FORMAT format ) { + _format = format; + } + + private void setPhyloXmlLevel( final int phyloxml_level ) { + _phyloxml_level = phyloxml_level; + } + + private void setRoot( final PhylogenyNode root ) { + _root = root; + } + + private void setSawComma( final boolean saw_comma ) { + _saw_comma = saw_comma; + } + + private void setSimpleNH( final boolean simple_nh ) { + _simple_nh = simple_nh; + } + 
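+ // Illustrative usage sketch of this writer's public API (the variable and
+ // file names below are hypothetical and not part of this patch; 0 is merely
+ // an example phyloxml_level):
+ //
+ //   final PhylogenyWriter writer = PhylogenyWriter.createPhylogenyWriter();
+ //   writer.toPhyloXML( new File( "tree.xml" ), phylogeny, 0 );
+ //   writer.toNewHampshireX( phylogeny, new File( "tree.nhx" ) );
+ //
+ // The New Hampshire (X) file variants go through writeToFile(), which refuses
+ // to overwrite an existing file; the phyloXML File variant writes directly.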
+ private void setStack( final Stack stack ) { + _stack = stack; + } + + private void setWriteDistanceToParentInNH( final boolean nh_write_distance_to_parent ) { + _nh_write_distance_to_parent = nh_write_distance_to_parent; + } + + private void setWriter( final Writer writer ) { + _writer = writer; + } + + public void toNewHampshire( final List trees, + final boolean simple_nh, + final boolean write_distance_to_parent, + final File out_file, + final String separator ) throws IOException { + final Iterator it = trees.iterator(); + final StringBuffer sb = new StringBuffer(); + while ( it.hasNext() ) { + sb.append( toNewHampshire( it.next(), simple_nh, write_distance_to_parent ) ); + sb.append( separator ); + } + writeToFile( sb, out_file ); + } + + public StringBuffer toNewHampshire( final Phylogeny tree, + final boolean simple_nh, + final boolean nh_write_distance_to_parent ) throws IOException { + setOutputFormt( FORMAT.NH ); + setSimpleNH( simple_nh ); + setWriteDistanceToParentInNH( nh_write_distance_to_parent ); + return getOutput( tree ); + } + + public void toNewHampshire( final Phylogeny tree, + final boolean simple_nh, + final boolean write_distance_to_parent, + final File out_file ) throws IOException { + writeToFile( toNewHampshire( tree, simple_nh, write_distance_to_parent ), out_file ); + } + + public void toNewHampshire( final Phylogeny[] trees, + final boolean simple_nh, + final boolean write_distance_to_parent, + final File out_file, + final String separator ) throws IOException { + final StringBuffer sb = new StringBuffer(); + for( final Phylogeny element : trees ) { + sb.append( toNewHampshire( element, simple_nh, write_distance_to_parent ) ); + sb.append( separator ); + } + writeToFile( sb, out_file ); + } + + public void toNewHampshireX( final List trees, final File out_file, final String separator ) + throws IOException { + final Iterator it = trees.iterator(); + final StringBuffer sb = new StringBuffer(); + while ( it.hasNext() ) { + sb.append( toNewHampshireX( it.next() ) ); + sb.append( separator ); + } + writeToFile( sb, out_file ); + } + + public StringBuffer toNewHampshireX( final Phylogeny tree ) throws IOException { + setOutputFormt( FORMAT.NHX ); + return getOutput( tree ); + } + + public void toNewHampshireX( final Phylogeny tree, final File out_file ) throws IOException { + writeToFile( toNewHampshireX( tree ), out_file ); + } + + public void toNewHampshireX( final Phylogeny[] trees, final File out_file, final String separator ) + throws IOException { + final StringBuffer sb = new StringBuffer(); + for( final Phylogeny element : trees ) { + sb.append( toNewHampshireX( element ) ); + sb.append( separator ); + } + writeToFile( sb, out_file ); + } + + public void toNexus( final File out_file, final List trees ) throws IOException { + final Writer writer = new BufferedWriter( new PrintWriter( out_file ) ); + writeNexusStart( writer ); + writeNexusTaxaBlock( writer, trees.get( 0 ) ); + writeNexusTreesBlock( writer, trees ); + writer.flush(); + writer.close(); + } + + public void toNexus( final File out_file, final Phylogeny tree ) throws IOException { + final Writer writer = new BufferedWriter( new PrintWriter( out_file ) ); + final List trees = new ArrayList( 1 ); + trees.add( tree ); + writeNexusStart( writer ); + writeNexusTaxaBlock( writer, tree ); + writeNexusTreesBlock( writer, trees ); + writer.flush(); + writer.close(); + } + + public StringBuffer toNexus( final Phylogeny tree ) throws IOException { + final StringWriter string_writer = new StringWriter(); + 
final Writer writer = new BufferedWriter( string_writer ); + final List trees = new ArrayList( 1 ); + trees.add( tree ); + writeNexusStart( writer ); + writeNexusTaxaBlock( writer, tree ); + writeNexusTreesBlock( writer, trees ); + writer.flush(); + writer.close(); + return string_writer.getBuffer(); + } + + public void toPhyloXML( final File out_file, + final List trees, + final int phyloxml_level, + final String separator ) throws IOException { + final Writer writer = new BufferedWriter( new PrintWriter( out_file ) ); + toPhyloXML( writer, trees, phyloxml_level, separator ); + writer.flush(); + writer.close(); + } + + public void toPhyloXML( final File out_file, final Phylogeny tree, final int phyloxml_level ) throws IOException { + final Writer writer = new BufferedWriter( new PrintWriter( out_file ) ); + writePhyloXmlStart( writer ); + toPhyloXMLNoPhyloXmlSource( writer, tree, phyloxml_level ); + writePhyloXmlEnd( writer ); + writer.flush(); + writer.close(); + } + + public StringBuffer toPhyloXML( final Phylogeny tree, final int phyloxml_level ) throws IOException { + final StringWriter string_writer = new StringWriter(); + final Writer writer = new BufferedWriter( string_writer ); + setPhyloXmlLevel( phyloxml_level ); + setOutputFormt( FORMAT.PHYLO_XML ); + writePhyloXmlStart( writer ); + writeOutput( writer, tree ); + writePhyloXmlEnd( writer ); + writer.flush(); + writer.close(); + return string_writer.getBuffer(); + } + + public void toPhyloXML( final Phylogeny[] trees, + final int phyloxml_level, + final File out_file, + final String separator ) throws IOException { + final Writer writer = new BufferedWriter( new PrintWriter( out_file ) ); + toPhyloXML( writer, trees, phyloxml_level, separator ); + writer.flush(); + writer.close(); + } + + public void toPhyloXML( final Writer writer, + final List trees, + final int phyloxml_level, + final String separator ) throws IOException { + writePhyloXmlStart( writer ); + final Iterator it = trees.iterator(); + while ( it.hasNext() ) { + toPhyloXMLNoPhyloXmlSource( writer, it.next(), phyloxml_level ); + writer.write( separator ); + } + writePhyloXmlEnd( writer ); + } + + public void toPhyloXML( final Writer writer, final Phylogeny tree, final int phyloxml_level ) throws IOException { + setPhyloXmlLevel( phyloxml_level ); + setOutputFormt( FORMAT.PHYLO_XML ); + writePhyloXmlStart( writer ); + writeOutput( writer, tree ); + writePhyloXmlEnd( writer ); + } + + public void toPhyloXML( final Writer writer, + final Phylogeny[] trees, + final int phyloxml_level, + final String separator ) throws IOException { + writePhyloXmlStart( writer ); + for( final Phylogeny phylogeny : trees ) { + toPhyloXMLNoPhyloXmlSource( writer, phylogeny, phyloxml_level ); + writer.write( separator ); + } + writePhyloXmlEnd( writer ); + } + + private void toPhyloXMLNoPhyloXmlSource( final Writer writer, final Phylogeny tree, final int phyloxml_level ) + throws IOException { + setPhyloXmlLevel( phyloxml_level ); + setOutputFormt( FORMAT.PHYLO_XML ); + writeOutput( writer, tree ); + } + + private void writeCladeSeparator() { + setSawComma( true ); + if ( ( getOutputFormt() == FORMAT.NHX ) || ( getOutputFormt() == FORMAT.NH ) ) { + getBuffer().append( "," ); + } + } + + private void writeCloseClade() throws IOException { + decreaseNodeLevel(); + if ( getOutputFormt() == FORMAT.PHYLO_XML ) { + getWriter().write( ForesterUtil.LINE_SEPARATOR ); + if ( isIndentPhyloxml() ) { + getWriter().write( createIndentation().toString() ); + } + PhylogenyDataUtil.appendClose( 
getWriter(), PhyloXmlMapping.CLADE ); + } + else if ( ( getOutputFormt() == FORMAT.NHX ) || ( getOutputFormt() == FORMAT.NH ) ) { + getBuffer().append( ")" ); + } + } + + private void writeNode( final PhylogenyNode node, final StringBuffer indentation ) throws IOException { + if ( getOutputFormt() == FORMAT.PHYLO_XML ) { + if ( node.isExternal() ) { + getWriter().write( ForesterUtil.LINE_SEPARATOR ); + if ( indentation != null ) { + getWriter().write( indentation.toString() ); + } + PhylogenyDataUtil.appendOpen( getWriter(), PhyloXmlMapping.CLADE ); + } + if ( indentation != null ) { + PhyloXmlNodeWriter.toPhyloXml( getWriter(), node, getPhyloXmlLevel(), indentation.toString() ); + } + else { + PhyloXmlNodeWriter.toPhyloXml( getWriter(), node, getPhyloXmlLevel(), "" ); + } + if ( node.isExternal() ) { + getWriter().write( ForesterUtil.LINE_SEPARATOR ); + if ( indentation != null ) { + getWriter().write( indentation.toString() ); + } + PhylogenyDataUtil.appendClose( getWriter(), PhyloXmlMapping.CLADE ); + } + } + else if ( getOutputFormt() == FORMAT.NHX ) { + getBuffer().append( node.toNewHampshireX() ); + } + else if ( getOutputFormt() == FORMAT.NH ) { + getBuffer().append( node.toNewHampshire( isSimpleNH(), isWriteDistanceToParentInNH() ) ); + } + } + + private void writeOpenClade( final PhylogenyNode node ) throws IOException { + if ( !isSawComma() ) { + if ( !node.isRoot() && node.isFirstChildNode() ) { + increaseNodeLevel(); + } + if ( getOutputFormt() == FORMAT.PHYLO_XML ) { + getWriter().write( ForesterUtil.LINE_SEPARATOR ); + if ( isIndentPhyloxml() ) { + getWriter().write( createIndentation().toString() ); + } + PhylogenyDataUtil.appendOpen( getWriter(), PhyloXmlMapping.CLADE ); + } + else if ( ( getOutputFormt() == FORMAT.NHX ) || ( getOutputFormt() == FORMAT.NH ) ) { + getBuffer().append( "(" ); + } + } + setSawComma( false ); + } + + private void writeOutput( final Writer writer, final Phylogeny tree ) throws IOException { + if ( getOutputFormt() != FORMAT.PHYLO_XML ) { + throw new RuntimeException( "method inappropriately called" ); + } + if ( tree != null ) { + reset( writer, tree ); + boolean rerootable = true; + String unit = ""; + String type = ""; + String rooted = "false"; + if ( tree.isRooted() ) { + rooted = "true"; + } + if ( !tree.isRerootable() ) { + rerootable = false; + } + if ( !ForesterUtil.isEmpty( tree.getDistanceUnit() ) ) { + unit = tree.getDistanceUnit(); + } + if ( !ForesterUtil.isEmpty( tree.getType() ) ) { + type = tree.getType(); + } + if ( rerootable ) { + PhylogenyDataUtil.appendOpen( writer, + PhyloXmlMapping.PHYLOGENY, + PhyloXmlMapping.PHYLOGENY_IS_ROOTED_ATTR, + rooted, + PhyloXmlMapping.PHYLOGENY_BRANCHLENGTH_UNIT_ATTR, + unit, + PhyloXmlMapping.PHYLOGENY_TYPE_ATTR, + type ); + } + else { + PhylogenyDataUtil.appendOpen( writer, + PhyloXmlMapping.PHYLOGENY, + PhyloXmlMapping.PHYLOGENY_IS_ROOTED_ATTR, + rooted, + PhyloXmlMapping.PHYLOGENY_BRANCHLENGTH_UNIT_ATTR, + unit, + PhyloXmlMapping.PHYLOGENY_TYPE_ATTR, + type, + PhyloXmlMapping.PHYLOGENY_IS_REROOTABLE_ATTR, + "false" ); + } + appendPhylogenyLevelPhyloXml( writer, tree ); + while ( isHasNext() ) { + next(); + } + writer.write( ForesterUtil.LINE_SEPARATOR ); + PhylogenyDataUtil.appendClose( writer, PhyloXmlMapping.PHYLOGENY ); + } + } + + private void writeToFile( final StringBuffer sb, final File out_file ) throws IOException { + if ( out_file.exists() ) { + throw new IOException( "attempt to overwrite existing file \"" + out_file.getAbsolutePath() + "\"" ); + } + final PrintWriter out = new 
PrintWriter( new FileWriter( out_file ), true ); + if ( getOutputFormt() == FORMAT.PHYLO_XML ) { + out.print( PHYLO_XML_VERSION_ENCODING_LINE ); + out.print( ForesterUtil.LINE_SEPARATOR ); + out.print( PHYLO_XML_NAMESPACE_LINE ); + out.print( ForesterUtil.LINE_SEPARATOR ); + } + out.print( sb ); + if ( getOutputFormt() == FORMAT.PHYLO_XML ) { + out.print( ForesterUtil.LINE_SEPARATOR ); + out.print( PHYLO_XML_END ); + } + out.flush(); + out.close(); + } + + public static PhylogenyWriter createPhylogenyWriter() { + return new PhylogenyWriter(); + } + + private static void writeNexusStart( final Writer writer ) throws IOException { + writer.write( NexusConstants.NEXUS ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + } + + public static void writeNexusTaxaBlock( final Writer writer, final Phylogeny tree ) throws IOException { + writer.write( NexusConstants.BEGIN_TAXA ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( " " ); + writer.write( NexusConstants.DIMENSIONS ); + writer.write( " " ); + writer.write( NexusConstants.NTAX ); + writer.write( "=" ); + writer.write( String.valueOf( tree.getNumberOfExternalNodes() ) ); + writer.write( ";" ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( " " ); + writer.write( NexusConstants.TAXLABELS ); + for( final PhylogenyNodeIterator it = tree.iteratorExternalForward(); it.hasNext(); ) { + final PhylogenyNode node = it.next(); + writer.write( " " ); + String data = ""; + if ( !ForesterUtil.isEmpty( node.getName() ) ) { + data = node.getName(); + } + else if ( node.getNodeData().isHasTaxonomy() ) { + if ( !ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getTaxonomyCode() ) ) { + data = node.getNodeData().getTaxonomy().getTaxonomyCode(); + } + else if ( !ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getScientificName() ) ) { + data = node.getNodeData().getTaxonomy().getScientificName(); + } + else if ( !ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getCommonName() ) ) { + data = node.getNodeData().getTaxonomy().getCommonName(); + } + else if ( node.getNodeData().getTaxonomy().getTaxonomyCode() != null ) { + data = node.getNodeData().getTaxonomy().getTaxonomyCode(); + } + } + else if ( node.getNodeData().isHasSequence() ) { + if ( !ForesterUtil.isEmpty( node.getNodeData().getSequence().getName() ) ) { + data = node.getNodeData().getSequence().getName(); + } + } + if ( data.length() > 0 ) { + data = data.replaceAll( " ", "_" ); + } + writer.write( data ); + } + writer.write( ";" ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( NexusConstants.END ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + } + + public static void writeNexusTreesBlock( final Writer writer, final List trees ) throws IOException { + writer.write( NexusConstants.BEGIN_TREES ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + int i = 1; + for( final Phylogeny phylogeny : trees ) { + writer.write( " " ); + writer.write( NexusConstants.TREE ); + writer.write( " " ); + if ( !ForesterUtil.isEmpty( phylogeny.getName() ) ) { + writer.write( "\'" ); + writer.write( phylogeny.getName() ); + writer.write( "\'" ); + } + else { + writer.write( "tree" ); + writer.write( String.valueOf( i ) ); + } + writer.write( "=" ); + if ( phylogeny.isRooted() ) { + writer.write( "[&R]" ); + } + else { + writer.write( "[&U]" ); + } + writer.write( phylogeny.toNewHampshire( false ) ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + i++; + } + writer.write( NexusConstants.END ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + } + + private 
static void writePhyloXmlEnd( final Writer writer ) throws IOException { + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( PhylogenyWriter.PHYLO_XML_END ); + } + + private static void writePhyloXmlStart( final Writer writer ) throws IOException { + writer.write( PhylogenyWriter.PHYLO_XML_VERSION_ENCODING_LINE ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( PhylogenyWriter.PHYLO_XML_NAMESPACE_LINE ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + } + + public static enum FORMAT { + NH, NHX, PHYLO_XML, NEXUS; + } +} diff --git a/forester/java/src/org/forester/io/writers/SequenceWriter.java b/forester/java/src/org/forester/io/writers/SequenceWriter.java new file mode 100644 index 0000000..b4135d1 --- /dev/null +++ b/forester/java/src/org/forester/io/writers/SequenceWriter.java @@ -0,0 +1,96 @@ + +package org.forester.io.writers; + +import java.io.IOException; +import java.io.Writer; +import java.util.List; + +import org.forester.sequence.BasicSequence; +import org.forester.sequence.Sequence; +import org.forester.util.ForesterUtil; + +public class SequenceWriter { + + public static enum SEQ_FORMAT { + FASTA; + } + + public static void main( final String[] args ) { + final Sequence s = BasicSequence.createAaSequence( "name", "abcdefghiiklmnap" ); + System.out.println( s.toString() ); + System.out.println( SequenceWriter.toFasta( s, 0 ).toString() ); + System.out.println( SequenceWriter.toFasta( s, 5 ).toString() ); + System.out.println( SequenceWriter.toFasta( s, 8 ).toString() ); + System.out.println( SequenceWriter.toFasta( s, 4 ).toString() ); + System.out.println( SequenceWriter.toFasta( s, 3 ).toString() ); + System.out.println( SequenceWriter.toFasta( s, 2 ).toString() ); + System.out.println( SequenceWriter.toFasta( s, 1 ).toString() ); + System.out.println( SequenceWriter.toFasta( s, 100 ).toString() ); + System.out.println( SequenceWriter.toFasta( s, 15 ).toString() ); + System.out.println( SequenceWriter.toFasta( s, 16 ).toString() ); + } + + public static StringBuilder toFasta( final Sequence seq, final int width ) { + final StringBuilder sb = new StringBuilder(); + sb.append( ">" ); + sb.append( seq.getIdentifier().toString() ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + if ( ( width < 1 ) || ( width >= seq.getLength() ) ) { + sb.append( seq.getMolecularSequence() ); + } + else { + final int lines = seq.getLength() / width; + final int rest = seq.getLength() - ( lines * width ); + for( int i = 0; i < lines; ++i ) { + sb.append( seq.getMolecularSequence(), i * width, width ); + if ( i < ( lines - 1 ) ) { + sb.append( ForesterUtil.LINE_SEPARATOR ); + } + } + if ( rest > 0 ) { + sb.append( ForesterUtil.LINE_SEPARATOR ); + sb.append( seq.getMolecularSequence(), lines * width, rest ); + } + } + return sb; + } + + public static void toFasta( final Sequence seq, final Writer w, final int width ) throws IOException { + w.write( ">" ); + w.write( seq.getIdentifier().toString() ); + w.write( ForesterUtil.LINE_SEPARATOR ); + if ( ( width < 1 ) || ( width >= seq.getLength() ) ) { + w.write( seq.getMolecularSequence() ); + } + else { + final int lines = seq.getLength() / width; + final int rest = seq.getLength() - ( lines * width ); + for( int i = 0; i < lines; ++i ) { + w.write( seq.getMolecularSequence(), i * width, width ); + if ( i < ( lines - 1 ) ) { + w.write( ForesterUtil.LINE_SEPARATOR ); + } + } + if ( rest > 0 ) { + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( seq.getMolecularSequence(), lines * width, rest ); + } + } + } + + public static void 
writeSeqs( final List seqs, + final Writer writer, + final SEQ_FORMAT format, + final int width ) throws IOException { + switch ( format ) { + case FASTA: + for( final Sequence s : seqs ) { + toFasta( s, writer, width ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + } + break; + default: + throw new RuntimeException( "unknown format " + format ); + } + } +} diff --git a/forester/java/src/org/forester/msa/BasicMsa.java b/forester/java/src/org/forester/msa/BasicMsa.java new file mode 100644 index 0000000..6e407f8 --- /dev/null +++ b/forester/java/src/org/forester/msa/BasicMsa.java @@ -0,0 +1,156 @@ +// $Id: +// forester -- software libraries and applications +// for genomics and evolutionary biology research. +// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.msa; + +import java.io.IOException; +import java.io.Writer; +import java.util.List; + +import org.forester.sequence.Sequence; +import org.forester.sequence.Sequence.TYPE; +import org.forester.util.ForesterUtil; + +public class BasicMsa implements Msa { + + private final char[][] _data; + private final Object[] _identifiers; + private final TYPE _type; + + public BasicMsa( final int rows, final int columns, final TYPE type ) { + if ( ( rows < 1 ) || ( columns < 1 ) ) { + throw new IllegalArgumentException( "basic msa of size zero are illegal" ); + } + _data = new char[ rows ][ columns ]; + _identifiers = new Object[ rows ]; + _type = type; + } + + BasicMsa( final BasicMsa msa ) { + _data = msa._data; + _identifiers = msa._identifiers; + _type = msa._type; + } + + private int determineMaxIdLength() { + int max = 0; + for( int row = 0; row < _data.length; ++row ) { + final int l = _identifiers[ row ].toString().length(); + if ( l > max ) { + max = l; + } + } + return max; + } + + @Override + public Object getIdentifier( final int row ) { + return _identifiers[ row ]; + } + + @Override + public int getLength() { + return _data[ 0 ].length; + } + + @Override + public int getNumberOfSequences() { + return _identifiers.length; + } + + @Override + public char getResidueAt( final int row, final int col ) { + return _data[ row ][ col ]; + } + + @Override + public StringBuffer getSequenceAsString( final int row ) { + final StringBuffer sb = new StringBuffer( _data[ 0 ].length ); + for( int col = 0; col < _data[ 0 ].length; ++col ) { + sb.append( getResidueAt( row, col ) ); + } + return sb; + } + + @Override + public TYPE getType() { + return _type; + } + + public void setIdentifier( final int row, final Object id ) { + _identifiers[ row ] = id; + } + + public void setResidueAt( final int row, final int col, final char 
residue ) { + _data[ row ][ col ] = residue; + } + + @Override + public String toString() { + final int max = determineMaxIdLength() + 1; + final StringBuffer sb = new StringBuffer(); + for( int row = 0; row < _data.length; ++row ) { + sb.append( ForesterUtil.pad( _identifiers[ row ].toString(), max, ' ', false ) ); + for( int col = 0; col < _data[ 0 ].length; ++col ) { + sb.append( getResidueAt( row, col ) ); + } + sb.append( ForesterUtil.LINE_SEPARATOR ); + } + return sb.toString(); + } + + public void write( final Writer w ) throws IOException { + final int max = determineMaxIdLength() + 1; + for( int row = 0; row < _data.length; ++row ) { + w.write( ForesterUtil.pad( _identifiers[ row ].toString(), max, ' ', false ).toString() ); + for( int col = 0; col < _data[ 0 ].length; ++col ) { + w.write( getResidueAt( row, col ) ); + } + w.write( ForesterUtil.LINE_SEPARATOR ); + } + } + + public static Msa createInstance( final List seqs ) { + if ( seqs.size() < 1 ) { + throw new IllegalArgumentException( "cannot create basic msa from less than one sequence" ); + } + final int length = seqs.get( 0 ).getLength(); + final BasicMsa msa = new BasicMsa( seqs.size(), length, seqs.get( 0 ).getType() ); + for( int row = 0; row < seqs.size(); ++row ) { + final Sequence seq = seqs.get( row ); + if ( seq.getLength() != length ) { + throw new IllegalArgumentException( "illegal attempt to build msa from sequences of unequal length" ); + } + if ( seq.getType() != msa.getType() ) { + throw new IllegalArgumentException( "illegal attempt to build msa from sequences of different type" ); + } + msa.setIdentifier( row, seq.getIdentifier() ); + for( int col = 0; col < length; ++col ) { + msa._data[ row ][ col ] = seq.getResidueAt( col ); + } + } + return msa; + } +} diff --git a/forester/java/src/org/forester/msa/Mafft.java b/forester/java/src/org/forester/msa/Mafft.java new file mode 100644 index 0000000..644b5b5 --- /dev/null +++ b/forester/java/src/org/forester/msa/Mafft.java @@ -0,0 +1,124 @@ +// $Id: +// forester -- software libraries and applications +// for genomics and evolutionary biology research. +// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.msa; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.forester.io.parsers.FastaParser; +import org.forester.util.ForesterUtil; +import org.forester.util.SystemCommandExecutor; + +public final class Mafft implements MsaInferrer { + + private final static String DEFAULT_PARAMETERS = "--maxiterate 1000 --localpair"; + private String _error; + private int _exit_code; + private final String _path_to_prg; + + public static MsaInferrer createInstance( final String path_to_prg ) { + return new Mafft( path_to_prg ); + } + + private static String getPathToCmd() { + //TODO this needs to come from env variable, etc. + String path = ""; + final String os = ForesterUtil.OS_NAME.toLowerCase(); + if ( ( os.indexOf( "mac" ) >= 0 ) && ( os.indexOf( "os" ) > 0 ) ) { + path = "/usr/local/bin/mafft"; + } + else if ( os.indexOf( "win" ) >= 0 ) { + path = "C:\\Program Files\\mafft-win\\mafft.bat"; + } + else { + path = "/home/czmasek/SOFTWARE/MSA/MAFFT/mafft-6.832-without-extensions/scripts/mafft"; + } + return path; + } + + public static boolean isInstalled() { + return SystemCommandExecutor.isExecuteableFile( new File( getPathToCmd() ) ); + } + + public static MsaInferrer createInstance() { + return createInstance( getPathToCmd() ); + } + + private Mafft( final String path_to_prg ) { + if ( !SystemCommandExecutor.isExecuteableFile( new File( path_to_prg ) ) ) { + throw new IllegalArgumentException( "cannot execute MAFFT via [" + path_to_prg + "]" ); + } + _path_to_prg = new String( path_to_prg ); + init(); + } + + public static String getDefaultParameters() { + return DEFAULT_PARAMETERS; + } + + @Override + public Object clone() { + throw new NoSuchMethodError(); + } + + public String getErrorDescription() { + return _error; + } + + public int getExitCode() { + return _exit_code; + } + + public Msa infer( final File path_to_input_seqs, final List opts ) throws IOException, InterruptedException { + init(); + final List my_opts = new ArrayList(); + my_opts.add( _path_to_prg ); + for( int i = 0; i < opts.size(); i++ ) { + my_opts.add( opts.get( i ) ); + } + my_opts.add( path_to_input_seqs.getAbsolutePath() ); + final SystemCommandExecutor commandExecutor = new SystemCommandExecutor( my_opts ); + final int _exit_code = commandExecutor.executeCommand(); + if ( _exit_code != 0 ) { + throw new IOException( "MAFFT failed, exit code: " + _exit_code ); + } + final StringBuilder stdout = commandExecutor.getStandardOutputFromCommand(); + final StringBuilder stderr = commandExecutor.getStandardErrorFromCommand(); + System.out.println( stdout ); + System.out.println(); + System.out.println( stderr ); + _error = stderr.toString(); + final Msa msa = FastaParser.parseMsa( stdout.toString() ); + return msa; + } + + private void init() { + _error = null; + _exit_code = -100; + } +} diff --git a/forester/java/src/org/forester/msa/MafftOLD.java b/forester/java/src/org/forester/msa/MafftOLD.java new file mode 100644 index 0000000..323d94a --- /dev/null +++ b/forester/java/src/org/forester/msa/MafftOLD.java @@ -0,0 +1,78 @@ + +package org.forester.msa; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.List; + +import org.forester.io.parsers.FastaParser; +import org.forester.util.ExternalProgram; +import org.forester.util.ForesterUtil; + +public final class MafftOLD implements 
MsaInferrer { + + private String _error; + private int _exit_code; + private final String _path_to_prg; + + public static MsaInferrer createInstance( final String path_to_prg ) { + return new MafftOLD( path_to_prg ); + } + + private MafftOLD( final String path_to_prg ) { + _path_to_prg = new String( path_to_prg ); + init(); + } + + @Override + public Object clone() { + throw new NoSuchMethodError(); + } + + public String getErrorDescription() { + return _error; + } + + public int getExitCode() { + return _exit_code; + } + + public Msa infer( final File path_to_input_seqs, final List opts ) throws IOException, InterruptedException { + init(); + final String[] my_opts = new String[ opts.size() + 1 ]; + for( int i = 0; i < opts.size(); i++ ) { + my_opts[ i ] = opts.get( i ); + } + my_opts[ opts.size() ] = path_to_input_seqs.getAbsolutePath(); + final ExternalProgram mafft_prg = new ExternalProgram( _path_to_prg ); + mafft_prg.launch( my_opts ); + // _exit_code = mafft_prg.waitFor(); + // if ( _exit_code != 0 ) { + // throw new IOException( "MAFFT failed, exit code: " + _exit_code ); + // } + final BufferedReader r = new BufferedReader( new InputStreamReader( mafft_prg.getErrorStream() ) ); + final StringBuffer error_sb = new StringBuffer(); + String line = null; + while ( ( line = r.readLine() ) != null ) { + error_sb.append( line ); + error_sb.append( ForesterUtil.LINE_SEPARATOR ); + } + r.close(); + if ( error_sb.length() > 0 ) { + _error = error_sb.toString(); + throw new IOException( "MAFFT failed" ); + } + final InputStream is = mafft_prg.getInputStream(); + final Msa msa = FastaParser.parseMsa( is ); + is.close(); + return msa; + } + + private void init() { + _error = null; + _exit_code = -100; + } +} diff --git a/forester/java/src/org/forester/msa/Msa.java b/forester/java/src/org/forester/msa/Msa.java new file mode 100644 index 0000000..2867989 --- /dev/null +++ b/forester/java/src/org/forester/msa/Msa.java @@ -0,0 +1,52 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.msa; + +import java.io.IOException; +import java.io.Writer; + +import org.forester.sequence.Sequence.TYPE; + +public interface Msa { + + public Object getIdentifier( int row ); + + public void setIdentifier( int row, Object identifier ); + + public int getLength(); + + public int getNumberOfSequences(); + + public char getResidueAt( int row, int col ); + + public StringBuffer getSequenceAsString( int row ); + + public abstract TYPE getType(); + + public void setResidueAt( final int row, final int col, final char residue ); + + public void write( Writer w ) throws IOException; +} diff --git a/forester/java/src/org/forester/msa/MsaFormatException.java b/forester/java/src/org/forester/msa/MsaFormatException.java new file mode 100644 index 0000000..949b072 --- /dev/null +++ b/forester/java/src/org/forester/msa/MsaFormatException.java @@ -0,0 +1,37 @@ +// $Id: +// forester -- software libraries and applications +// for genomics and evolutionary biology research. +// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.msa; + +import java.io.IOException; + +public class MsaFormatException extends IOException { + + private static final long serialVersionUID = 690079849050106491L; + + public MsaFormatException( final String msg ) { + super( msg ); + } +} diff --git a/forester/java/src/org/forester/msa/MsaInferrer.java b/forester/java/src/org/forester/msa/MsaInferrer.java new file mode 100644 index 0000000..801c247 --- /dev/null +++ b/forester/java/src/org/forester/msa/MsaInferrer.java @@ -0,0 +1,39 @@ +// $Id: +// forester -- software libraries and applications +// for genomics and evolutionary biology research. +// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.msa; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +public interface MsaInferrer { + + public String getErrorDescription(); + + public int getExitCode(); + + public Msa infer( File path_to_input_seqs, List opts ) throws IOException, InterruptedException; +} diff --git a/forester/java/src/org/forester/msa/MsaTools.java b/forester/java/src/org/forester/msa/MsaTools.java new file mode 100644 index 0000000..a2dfa06 --- /dev/null +++ b/forester/java/src/org/forester/msa/MsaTools.java @@ -0,0 +1,126 @@ +// $Id: +// forester -- software libraries and applications +// for genomics and evolutionary biology research. +// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.msa; + +import java.util.ArrayList; +import java.util.List; + +import org.forester.sequence.BasicSequence; +import org.forester.sequence.Sequence; +import org.forester.util.BasicDescriptiveStatistics; +import org.forester.util.DescriptiveStatistics; + +public final class MsaTools { + + private ArrayList _ignored_seqs_ids; + + synchronized public ArrayList getIgnoredSequenceIds() { + return _ignored_seqs_ids; + } + + synchronized public static MsaTools createInstance() { + return new MsaTools(); + } + + private MsaTools() { + init(); + } + + synchronized private void init() { + _ignored_seqs_ids = new ArrayList(); + } + + @Override + public Object clone() { + throw new NoSuchMethodError(); + } + + public static int calcGapSumPerColumn( final Msa msa, final int col ) { + int gap_rows = 0; + for( int j = 0; j < msa.getNumberOfSequences(); ++j ) { + if ( msa.getResidueAt( j, col ) == Sequence.GAP ) { + gap_rows++; + } + } + return gap_rows; + } + + synchronized public Msa removeGapColumns( final double max_allowed_gap_ratio, + final int min_allowed_length, + final Msa msa ) { + init(); + if ( ( max_allowed_gap_ratio < 0 ) || ( max_allowed_gap_ratio > 1 ) ) { + throw new IllegalArgumentException( "max allowed gap ration is out of range: " + max_allowed_gap_ratio ); + } + final boolean ignore_too_short_seqs = min_allowed_length > 0; + final boolean[] delete_cols = new boolean[ msa.getLength() ]; + int new_length = 0; + for( int col = 0; col < msa.getLength(); ++col ) { + delete_cols[ col ] = ( ( double ) calcGapSumPerColumn( msa, col ) / msa.getNumberOfSequences() ) > max_allowed_gap_ratio; + if ( !delete_cols[ col ] ) { + ++new_length; + } + } + final List seqs = new ArrayList( msa.getNumberOfSequences() ); + for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { + final char[] mol_seq = new char[ new_length ]; + int new_col = 0; + int non_gap_cols_sum = 0; + for( int col = 0; col < msa.getLength(); ++col ) { + if ( !delete_cols[ col ] ) { + final char residue = msa.getResidueAt( row, col ); + mol_seq[ new_col++ ] = ( residue ); + if ( residue != Sequence.GAP ) { + ++non_gap_cols_sum; + } + } + } + if ( ignore_too_short_seqs ) { + if ( non_gap_cols_sum >= min_allowed_length ) { + seqs.add( new BasicSequence( msa.getIdentifier( row ), mol_seq, msa.getType() ) ); + } + else { + _ignored_seqs_ids.add( msa.getIdentifier( row ).toString() ); + } + } + else { + seqs.add( new BasicSequence( msa.getIdentifier( row ), mol_seq, msa.getType() ) ); + } + } + if ( seqs.size() < 1 ) { + return null; + } + return BasicMsa.createInstance( seqs ); + } + + public static DescriptiveStatistics calcBasicGapinessStatistics( final Msa msa ) { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( int i = 0; i < msa.getLength(); ++i ) { + stats.addValue( ( double ) calcGapSumPerColumn( msa, i ) / msa.getNumberOfSequences() ); + } + return stats; + } +} diff --git a/forester/java/src/org/forester/msa/ResampleableMsa.java b/forester/java/src/org/forester/msa/ResampleableMsa.java new file mode 100644 index 0000000..7476cad --- /dev/null +++ b/forester/java/src/org/forester/msa/ResampleableMsa.java @@ -0,0 +1,57 @@ +// / $Id: ResampleableMsa.java,v 1.3 2010/12/13 18:59:48 cmzmasek Exp $ +// forester -- software libraries and applications +// for genomics and evolutionary biology research. 
+// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.msa; + +public final class ResampleableMsa extends BasicMsa { + + private int[] _resampled_column_positions = null; + + public ResampleableMsa( final BasicMsa msa ) { + super( msa ); + } + + public void resample( final int[] resampled_column_positions ) { + if ( resampled_column_positions.length != getLength() ) { + _resampled_column_positions = null; + throw new IllegalArgumentException( "illegal attempt to use " + resampled_column_positions.length + + " resampled column positions on msa of length " + getLength() ); + } + _resampled_column_positions = resampled_column_positions; + } + + @Override + public char getResidueAt( final int row, final int col ) { + if ( _resampled_column_positions != null ) { + return super.getResidueAt( row, _resampled_column_positions[ col ] ); + } + return super.getResidueAt( row, col ); + } + + @Override + public void setResidueAt( final int row, final int col, final char residue ) { + throw new NoSuchMethodError( "illegal attempt to set residue in resampleable msa" ); + } +} diff --git a/forester/java/src/org/forester/pccx/BasicExternalNodeBasedCoverageExtender.java b/forester/java/src/org/forester/pccx/BasicExternalNodeBasedCoverageExtender.java new file mode 100644 index 0000000..790518a --- /dev/null +++ b/forester/java/src/org/forester/pccx/BasicExternalNodeBasedCoverageExtender.java @@ -0,0 +1,178 @@ +// $Id: +// cmzmasek Exp $ +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.pccx; + +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.ForesterUtil; + +/* + * @author Christian M. Zmasek + */ +public class BasicExternalNodeBasedCoverageExtender implements CoverageExtender { + + private String find( final CoverageCalculationOptions options, + final BranchCountingBasedScoringMethod scoring_method, + final List> external_node_scores_list, + final List> external_node_scores_list_temp, + final List phylogenies, + final Set already_covered, + final PrintStream out, + final int i, + final double normalization_factor ) { + final Phylogeny p = phylogenies.get( 0 ); + String best_name = null; + double best_score = -Double.MAX_VALUE; + for( final PhylogenyNodeIterator iter = p.iteratorExternalForward(); iter.hasNext(); ) { + final String name = iter.next().getName(); + if ( !already_covered.contains( name ) ) { + final double score = BasicExternalNodeBasedCoverageExtender + .calculateCoverage( phylogenies, + name, + options, + scoring_method, + external_node_scores_list_temp, + false ); + if ( score > best_score ) { + best_score = score; + best_name = name; + } + } + } + BasicExternalNodeBasedCoverageExtender.calculateCoverage( phylogenies, + best_name, + options, + scoring_method, + external_node_scores_list_temp, + true ); + if ( out != null ) { + out.println( i + "\t" + best_name + "\t" + ( best_score * normalization_factor ) ); + } + return best_name; + } + + /* + * (non-Javadoc) + * + * @see org.forester.tools.modeling.CoverageExtender#find(java.util.List, + * java.util.List, int, + * org.forester.tools.modeling.CoverageCalculationMethod, + * org.forester.tools.modeling.CoverageCalculationOptions, + * java.io.PrintStream) + */ + public List find( final List phylogenies, + final List already_covered, + int number_names_to_find, + final CoverageCalculationOptions options, + final PrintStream out ) { + final ExternalNodeBasedCoverageMethodOptions my_options = ( ExternalNodeBasedCoverageMethodOptions ) options; + if ( ( my_options == null ) || ForesterUtil.isEmpty( my_options.getScoringMethod() ) ) { + throw new IllegalArgumentException( "options for external node based coverage method appear to not have been set" ); + } + BranchCountingBasedScoringMethod scoring_method; + try { + scoring_method = ( BranchCountingBasedScoringMethod ) ( Class.forName( my_options.getScoringMethod() ) ) + .newInstance(); + } + catch ( final Exception e ) { + throw new IllegalArgumentException( "could not create scoring method class \"" + + my_options.getScoringMethod() + "\"" ); + } + final List best_names = new ArrayList(); + final Set my_already_covered = new HashSet(); + final List> external_node_scores_list = new ArrayList>(); + for( int i = 0; i < phylogenies.size(); ++i ) { + external_node_scores_list.add( ModelingUtils.setUpExternalCoverageHashMap( phylogenies.get( i ) ) ); + } + if ( already_covered != null ) { + for( final String name : already_covered ) { + my_already_covered.add( name ); + BasicExternalNodeBasedCoverageExtender.calculateCoverage( phylogenies, + name, + options, + scoring_method, + external_node_scores_list, + true ); + } + } + if ( number_names_to_find < 1 ) { + number_names_to_find = 
phylogenies.get( 0 ).getNumberOfExternalNodes() - my_already_covered.size(); + } + final double normalization_factor = scoring_method.getNormalizationFactor( phylogenies.get( 0 ) ); + for( int i = 0; i < number_names_to_find; ++i ) { + final String name = find( my_options, + scoring_method, + external_node_scores_list, + external_node_scores_list, + phylogenies, + my_already_covered, + out, + i, + normalization_factor ); + my_already_covered.add( name ); + best_names.add( name ); + } + return best_names; + } + + private static double calculateCoverage( final List phylogenies, + final String name, + final CoverageCalculationOptions options, + final BranchCountingBasedScoringMethod scoring_method, + final List> external_node_scores_list, + final boolean update_external_node_scores_list ) { + int i = 0; + double score_sum = 0.0; + for( final Object element : phylogenies ) { + SortedMap external_node_scores; + if ( update_external_node_scores_list ) { + external_node_scores = external_node_scores_list.get( i++ ); + } + else { + external_node_scores = new TreeMap( external_node_scores_list.get( i++ ) ); + } + final Phylogeny phylogeny = ( Phylogeny ) element; + scoring_method.calculateScoreForExternalNode( external_node_scores, + phylogeny, + phylogeny.getNode( name ), + options ); + for( final Object element2 : external_node_scores.values() ) { + score_sum += ( ( Double ) element2 ).doubleValue(); + } + } + return score_sum / i; + } +} diff --git a/forester/java/src/org/forester/pccx/BranchCountingBasedScoringMethod.java b/forester/java/src/org/forester/pccx/BranchCountingBasedScoringMethod.java new file mode 100644 index 0000000..815d13d --- /dev/null +++ b/forester/java/src/org/forester/pccx/BranchCountingBasedScoringMethod.java @@ -0,0 +1,74 @@ +// $Id: +// Exp $ +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.pccx; + +import java.util.SortedMap; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; + +/* + * Scoring method according to an idea by Adam Godzik, PhD. + * + * @author Christian M. 
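[Editor's note] BasicExternalNodeBasedCoverageExtender above is a greedy loop: at each step it scores every not-yet-covered external node, keeps the one that raises coverage the most, and repeats until the requested number of names is found (0 means "rank everything that is left"). A hedged usage sketch; the Newick string and scoring-method class name mirror the ones used in TestPccx further down:

import java.util.ArrayList;
import java.util.List;

import org.forester.io.parsers.nhx.NHXParser;
import org.forester.pccx.BasicExternalNodeBasedCoverageExtender;
import org.forester.pccx.CoverageExtender;
import org.forester.pccx.ExternalNodeBasedCoverageMethodOptions;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;

public class ExtenderSketch {

    public static void main( final String[] args ) throws Exception {
        final Phylogeny p = ParserBasedPhylogenyFactory.getInstance()
                .create( "((((A:0.1,B:0.7):0.2,C:1.0):2.0,D:1.7):1.3,((E:0.3,F:0.4):1.1,(G:0.5,H:0.6):1.2):1.4,X:2.0)",
                         new NHXParser() )[ 0 ];
        final List<Phylogeny> phylogenies = new ArrayList<Phylogeny>();
        phylogenies.add( p );
        final List<String> already_covered = new ArrayList<String>();
        already_covered.add( "A" );
        final CoverageExtender extender = new BasicExternalNodeBasedCoverageExtender();
        // 0 => rank all remaining external nodes; System.out receives one tab-separated
        // line per pick: step index, chosen name, normalized coverage reached
        final List<?> ranked = extender.find( phylogenies,
                                              already_covered,
                                              0,
                                              new ExternalNodeBasedCoverageMethodOptions( "org.forester.pccx.BranchCountingBasedScoringMethod" ),
                                              System.out );
        System.out.println( ranked );
    }
}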
Zmasek + */ +public class BranchCountingBasedScoringMethod implements ScoringMethodForExternalNode { + + double calculateScoreContributionPerExternalNode( final PhylogenyNode external_node, + final PhylogenyNode current_node ) { + double score_contribution = 0.0; + if ( current_node == external_node ) { + score_contribution = 1.0; + } + else { + score_contribution = 1.0 / ModelingUtils.calculateBranchSum( external_node, current_node ); + } + return score_contribution; + } + + public void calculateScoreForExternalNode( final SortedMap external_node_scores, + final Phylogeny phylogeny, + final PhylogenyNode external_node, + final CoverageCalculationOptions options ) { + for( final Object element : external_node_scores.keySet() ) { + final PhylogenyNode current_node = ( PhylogenyNode ) element; + final double score_contribution = calculateScoreContributionPerExternalNode( external_node, current_node ); + final double prev_score_contribution = external_node_scores.get( current_node ); + if ( score_contribution > prev_score_contribution ) { + external_node_scores.put( current_node, score_contribution ); + } + } + } + + public String getDesciption() { + return "sum of 1/branch-segment-sum"; + } + + public double getNormalizationFactor( final Phylogeny phylogeny ) { + return ( 1.0 / phylogeny.getNumberOfExternalNodes() ); + } +} diff --git a/forester/java/src/org/forester/pccx/BranchLengthBasedScoringMethod.java b/forester/java/src/org/forester/pccx/BranchLengthBasedScoringMethod.java new file mode 100644 index 0000000..7cddfdf --- /dev/null +++ b/forester/java/src/org/forester/pccx/BranchLengthBasedScoringMethod.java @@ -0,0 +1,73 @@ +// $Id: +// Exp $ +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.pccx; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; + +/* + * + * @author Christian M. Zmasek + */ +public class BranchLengthBasedScoringMethod extends BranchCountingBasedScoringMethod { + + public static final double MIN_ALLOWED_BL_VALUE = 0.001; + + @Override + double calculateScoreContributionPerExternalNode( final PhylogenyNode external_node, + final PhylogenyNode current_node ) { + double score_contribution = 0.0; + if ( current_node == external_node ) { + score_contribution = external_node.getDistanceToParent(); + // This, of course, is completely /ad hoc/. 
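[Editor's note] BranchCountingBasedScoringMethod above scores a candidate external node against an already-covered one as 1 divided by the number of branch segments separating them (1.0 for the node itself). A small worked sketch, grounded in the first TestPccx expectation below; it has to live in the org.forester.pccx package because calculateScoreContributionPerExternalNode(...) is package-private:

package org.forester.pccx; // the contribution method is package-private

import org.forester.io.parsers.nhx.NHXParser;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;

public class BranchCountingSketch {

    public static void main( final String[] args ) throws Exception {
        final Phylogeny p = ParserBasedPhylogenyFactory.getInstance()
                .create( "((((A:0.1,B:0.7):0.2,C:1.0):2.0,D:1.7):1.3,((E:0.3,F:0.4):1.1,(G:0.5,H:0.6):1.2):1.4,X:2.0)",
                         new NHXParser() )[ 0 ];
        final BranchCountingBasedScoringMethod method = new BranchCountingBasedScoringMethod();
        // A and C are separated by three branch segments, so covering A contributes 1/3 to C
        final double a_to_c = method.calculateScoreContributionPerExternalNode( p.getNode( "A" ), p.getNode( "C" ) );
        // A and B are separated by two segments -> 1/2; a node scores 1.0 against itself
        final double a_to_b = method.calculateScoreContributionPerExternalNode( p.getNode( "A" ), p.getNode( "B" ) );
        System.out.println( a_to_c + " " + a_to_b ); // 0.333... and 0.5, matching the TestPccx terms
    }
}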
+ } + else { + score_contribution = ModelingUtils.calculateBranchLengthSum( external_node, current_node ); + } + return 1.0 / ( score_contribution > BranchLengthBasedScoringMethod.MIN_ALLOWED_BL_VALUE ? score_contribution + : BranchLengthBasedScoringMethod.MIN_ALLOWED_BL_VALUE ); + } + + @Override + public String getDesciption() { + return "sum of 1/branch-length-sum [for self: 1/branch-length] [min branch length: " + + BranchLengthBasedScoringMethod.MIN_ALLOWED_BL_VALUE + "]"; + } + + @Override + public double getNormalizationFactor( final Phylogeny phylogeny ) { + double s = 0.0; + double d = 0.0; + for( final PhylogenyNodeIterator iter = phylogeny.iteratorExternalForward(); iter.hasNext(); ) { + d = iter.next().getDistanceToParent(); + s += ( 1.0 / ( d > BranchLengthBasedScoringMethod.MIN_ALLOWED_BL_VALUE ? d + : BranchLengthBasedScoringMethod.MIN_ALLOWED_BL_VALUE ) ); + } + return 1.0 / s; + } +} diff --git a/forester/java/src/org/forester/pccx/Coverage.java b/forester/java/src/org/forester/pccx/Coverage.java new file mode 100644 index 0000000..59827f9 --- /dev/null +++ b/forester/java/src/org/forester/pccx/Coverage.java @@ -0,0 +1,36 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.pccx; + +/* + * @author Christian M. Zmasek + */ +public interface Coverage { + + public String asString(); + + public double getScore(); +} diff --git a/forester/java/src/org/forester/pccx/CoverageCalculationMethod.java b/forester/java/src/org/forester/pccx/CoverageCalculationMethod.java new file mode 100644 index 0000000..638ca22 --- /dev/null +++ b/forester/java/src/org/forester/pccx/CoverageCalculationMethod.java @@ -0,0 +1,41 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
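[Editor's note] The branch-length variant above replaces the segment count by the branch-length sum along the path (or, for a node scoring itself, its own branch length), clamped from below by MIN_ALLOWED_BL_VALUE so that zero-length branches cannot yield infinite scores. A short sketch of that behaviour, again assumed to sit in org.forester.pccx and to receive the ps1 test tree (in which A:0.1 and B:0.7 share a parent):

package org.forester.pccx; // the contribution method is package-private

import org.forester.phylogeny.Phylogeny;

public class BranchLengthScoringSketch {

    // 'p' is assumed to be the ps1 tree used in TestPccx
    static void show( final Phylogeny p ) {
        final BranchLengthBasedScoringMethod method = new BranchLengthBasedScoringMethod();
        // self-contribution of A: 1 / 0.1 = 10.0
        System.out.println( method.calculateScoreContributionPerExternalNode( p.getNode( "A" ), p.getNode( "A" ) ) );
        // A to B: 1 / (0.1 + 0.7) = 1.25; sums below MIN_ALLOWED_BL_VALUE (0.001) are clamped to 1 / 0.001
        System.out.println( method.calculateScoreContributionPerExternalNode( p.getNode( "A" ), p.getNode( "B" ) ) );
    }
}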
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.pccx; + +import java.util.List; + +import org.forester.phylogeny.Phylogeny; + +/* + * @author Christian M. Zmasek + */ +public interface CoverageCalculationMethod { + + public Coverage calculateCoverage( List phylogenies, + List names, + CoverageCalculationOptions options, + boolean annotate_phylogenies ); +} diff --git a/forester/java/src/org/forester/pccx/CoverageCalculationOptions.java b/forester/java/src/org/forester/pccx/CoverageCalculationOptions.java new file mode 100644 index 0000000..1d588ff --- /dev/null +++ b/forester/java/src/org/forester/pccx/CoverageCalculationOptions.java @@ -0,0 +1,34 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.pccx; + +/* + * @author Christian M. Zmasek + */ +public interface CoverageCalculationOptions { + + public String asString(); +} diff --git a/forester/java/src/org/forester/pccx/CoverageCalculator.java b/forester/java/src/org/forester/pccx/CoverageCalculator.java new file mode 100644 index 0000000..8f60542 --- /dev/null +++ b/forester/java/src/org/forester/pccx/CoverageCalculator.java @@ -0,0 +1,63 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.pccx; + +import java.util.List; + +import org.forester.phylogeny.Phylogeny; + +/* + * @author Christian M. Zmasek + */ +public class CoverageCalculator { + + private final CoverageCalculationMethod _method; + private final CoverageCalculationOptions _options; + + private CoverageCalculator( final CoverageCalculationMethod method, final CoverageCalculationOptions options ) { + _method = method; + _options = options; + } + + public Coverage calculateCoverage( final List phylogenies, + final List names, + final boolean annotate_phylogenies ) { + return getMethod().calculateCoverage( phylogenies, names, getOptions(), annotate_phylogenies ); + } + + private CoverageCalculationMethod getMethod() { + return _method; + } + + private CoverageCalculationOptions getOptions() { + return _options; + } + + public static CoverageCalculator getInstance( final CoverageCalculationMethod method, + final CoverageCalculationOptions options ) { + return new CoverageCalculator( method, options ); + } +} diff --git a/forester/java/src/org/forester/pccx/CoverageExtender.java b/forester/java/src/org/forester/pccx/CoverageExtender.java new file mode 100644 index 0000000..b882759 --- /dev/null +++ b/forester/java/src/org/forester/pccx/CoverageExtender.java @@ -0,0 +1,43 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.pccx; + +import java.io.PrintStream; +import java.util.List; + +import org.forester.phylogeny.Phylogeny; + +/* + * @author Christian M. Zmasek + */ +public interface CoverageExtender { + + public abstract List find( final List phylogenies, + final List already_covered, + int number_names_to_find, + final CoverageCalculationOptions options, + final PrintStream out ); +} \ No newline at end of file diff --git a/forester/java/src/org/forester/pccx/ExternalNodeBasedCoverage.java b/forester/java/src/org/forester/pccx/ExternalNodeBasedCoverage.java new file mode 100644 index 0000000..ee46c40 --- /dev/null +++ b/forester/java/src/org/forester/pccx/ExternalNodeBasedCoverage.java @@ -0,0 +1,100 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. 
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.pccx; + +import org.forester.util.DescriptiveStatistics; +import org.forester.util.ForesterUtil; + +/* + * @author Christian M. Zmasek + */ +public class ExternalNodeBasedCoverage implements Coverage { + + private final double _av_normalized_score; + private final double _av_raw_score; + private final int _n; + private final double _sd; + private final double _max; + private final double _min; + + public ExternalNodeBasedCoverage( final DescriptiveStatistics stats, + final double average_raw_score, + final CoverageCalculationOptions options ) { + _av_normalized_score = stats.arithmeticMean(); + _av_raw_score = average_raw_score; + _n = stats.getN(); + if ( _n > 1 ) { + _sd = stats.sampleStandardDeviation(); + } + else { + _sd = 0.0; + } + _max = stats.getMax(); + _min = stats.getMin(); + } + + public String asString() { + final StringBuffer sb = new StringBuffer(); + if ( getN() == 1 ) { + sb.append( "Normalized score: " + getScore() + ForesterUtil.getLineSeparator() ); + sb.append( "Raw score : " + getAvarageRawScore() ); + } + else { + sb.append( "Avarage normalized score: " + getScore() + " [sd=" + getSD() + " min=" + getMin() + " max=" + + getMax() + " n=" + getN() + "]" + ForesterUtil.getLineSeparator() ); + sb.append( "Avarage raw score : " + getAvarageRawScore() ); + } + return sb.toString(); + } + + public double getAvarageNormalizedScore() { + return _av_normalized_score; + } + + public double getAvarageRawScore() { + return _av_raw_score; + } + + public double getMax() { + return _max; + } + + public double getMin() { + return _min; + } + + public int getN() { + return _n; + } + + public double getScore() { + return getAvarageNormalizedScore(); + } + + public double getSD() { + return _sd; + } +} diff --git a/forester/java/src/org/forester/pccx/ExternalNodeBasedCoverageMethod.java b/forester/java/src/org/forester/pccx/ExternalNodeBasedCoverageMethod.java new file mode 100644 index 0000000..5994bb9 --- /dev/null +++ b/forester/java/src/org/forester/pccx/ExternalNodeBasedCoverageMethod.java @@ -0,0 +1,130 @@ +// $Id: +// Exp $ +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. 
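[Editor's note] Putting the pieces together: CoverageCalculator wires a CoverageCalculationMethod to its options, and for the external-node-based method the returned Coverage is an ExternalNodeBasedCoverage carrying the normalized mean plus spread statistics. A hedged end-to-end sketch, reusing the test tree and the scoring-method class name that appear in TestPccx; the chosen "covered" names are arbitrary:

import java.util.ArrayList;
import java.util.List;

import org.forester.io.parsers.nhx.NHXParser;
import org.forester.pccx.Coverage;
import org.forester.pccx.CoverageCalculator;
import org.forester.pccx.ExternalNodeBasedCoverageMethod;
import org.forester.pccx.ExternalNodeBasedCoverageMethodOptions;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;

public class CoverageSketch {

    public static void main( final String[] args ) throws Exception {
        final Phylogeny p = ParserBasedPhylogenyFactory.getInstance()
                .create( "((((A:0.1,B:0.7):0.2,C:1.0):2.0,D:1.7):1.3,((E:0.3,F:0.4):1.1,(G:0.5,H:0.6):1.2):1.4,X:2.0)",
                         new NHXParser() )[ 0 ];
        final List<Phylogeny> phylogenies = new ArrayList<Phylogeny>();
        phylogenies.add( p );
        final List<String> covered = new ArrayList<String>();
        covered.add( "A" );
        covered.add( "G" );
        final CoverageCalculator calc = CoverageCalculator
                .getInstance( new ExternalNodeBasedCoverageMethod(),
                              new ExternalNodeBasedCoverageMethodOptions( "org.forester.pccx.BranchCountingBasedScoringMethod" ) );
        // 'false': do not colorize the phylogeny according to per-node coverage
        final Coverage cov = calc.calculateCoverage( phylogenies, covered, false );
        System.out.println( cov.asString() ); // normalized and raw score report
        System.out.println( cov.getScore() ); // 1.0 would mean every external node is covered
        // cast to ExternalNodeBasedCoverage if getSD(), getMin(), getMax() or getN() are needed
    }
}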
+// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.pccx; + +import java.awt.Color; +import java.util.List; +import java.util.Map; +import java.util.SortedMap; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.util.BasicDescriptiveStatistics; +import org.forester.util.DescriptiveStatistics; +import org.forester.util.ForesterUtil; + +/* + * @author Christian M. Zmasek + */ +public class ExternalNodeBasedCoverageMethod implements CoverageCalculationMethod { + + private static final Color MEAN_COVERAGE_COLOR = new Color( 0, 0, 0 ); + private static final Color MAXIMAL_COV_COLOR = new Color( 0, 255, 0 ); + private static final Color MINIMAL_COV_COLOR = new Color( 255, 0, 0 ); + + public Coverage calculateCoverage( final List phylogenies, + final List names, + final CoverageCalculationOptions options, + final boolean annotate_phylogenies ) { + final DescriptiveStatistics normalized_score_stats = new BasicDescriptiveStatistics(); + final DescriptiveStatistics raw_score_stats = new BasicDescriptiveStatistics(); + final ExternalNodeBasedCoverageMethodOptions my_options = ( ExternalNodeBasedCoverageMethodOptions ) options; + if ( ( my_options == null ) || ForesterUtil.isEmpty( my_options.getScoringMethod() ) ) { + throw new IllegalArgumentException( "options for external node based coverage method appear to not have been set" ); + } + BranchCountingBasedScoringMethod scoring_method; + try { + scoring_method = ( BranchCountingBasedScoringMethod ) ( Class.forName( my_options.getScoringMethod() ) ) + .newInstance(); + } + catch ( final Exception e ) { + throw new IllegalArgumentException( "could not create scoring method class \"" + + my_options.getScoringMethod() + "\"" ); + } + final double normalization_factor = scoring_method.getNormalizationFactor( phylogenies.get( 0 ) ); + for( final Object element : phylogenies ) { + final double raw_score = calculateCoverage( ( Phylogeny ) element, + names, + options, + scoring_method, + annotate_phylogenies, + normalization_factor ); + normalized_score_stats.addValue( raw_score * normalization_factor ); + raw_score_stats.addValue( raw_score ); + } + return new ExternalNodeBasedCoverage( normalized_score_stats, raw_score_stats.arithmeticMean(), options ); + } + + private double calculateCoverage( final Phylogeny phylogeny, + final List names, + final CoverageCalculationOptions options, + final BranchCountingBasedScoringMethod scoring_method, + final boolean annotate_phylogeny, + final double normalization_factor ) { + final SortedMap external_node_scores = ModelingUtils + .setUpExternalCoverageHashMap( phylogeny ); + for( final Object element : names ) { + scoring_method.calculateScoreForExternalNode( external_node_scores, phylogeny, phylogeny + .getNode( ( String ) element ), options ); + } + if ( annotate_phylogeny ) { + colorizePhylogenyAccordingToCoverage( external_node_scores, phylogeny, normalization_factor ); + } + 
double score = 0.0; + for( final Object element : external_node_scores.values() ) { + score += ( ( Double ) element ).doubleValue(); + } + return score; + } + + private void colorizePhylogenyAccordingToCoverage( final SortedMap external_node_scores, + final Phylogeny phylogeny, + final double normalization_factor ) { + final DescriptiveStatistics ds = new BasicDescriptiveStatistics(); + for( final Object element : external_node_scores.entrySet() ) { + ds.addValue( ( Double ) ( ( Map.Entry ) element ).getValue() * normalization_factor ); + } + final double min = ds.getMin(); + final double max = ds.getMax(); + final double median = ds.median(); + for( final Object element2 : external_node_scores.entrySet() ) { + final Map.Entry element = ( Map.Entry ) element2; + final PhylogenyNode node = ( PhylogenyNode ) element.getKey(); + final double normalized_value = ( Double ) element.getValue() * normalization_factor; + PhylogenyMethods.setBranchColorValue( node, ForesterUtil + .calcColor( normalized_value, + min, + max, + median, + ExternalNodeBasedCoverageMethod.MINIMAL_COV_COLOR, + ExternalNodeBasedCoverageMethod.MAXIMAL_COV_COLOR, + ExternalNodeBasedCoverageMethod.MEAN_COVERAGE_COLOR ) ); + } + PhylogenyMethods.postorderBranchColorAveragingExternalNodeBased( phylogeny ); + } +} diff --git a/forester/java/src/org/forester/pccx/ExternalNodeBasedCoverageMethodOptions.java b/forester/java/src/org/forester/pccx/ExternalNodeBasedCoverageMethodOptions.java new file mode 100644 index 0000000..c88c35e --- /dev/null +++ b/forester/java/src/org/forester/pccx/ExternalNodeBasedCoverageMethodOptions.java @@ -0,0 +1,62 @@ +// $Id: +// cmzmasek Exp $ +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.pccx; + +public class ExternalNodeBasedCoverageMethodOptions implements CoverageCalculationOptions { + + final private String _scoring_method; + + /** + * This constructor sets the class name for the scoring method e.g. 
+ * "org.forester.tools.modeling.BranchCountingBasedScoringMethod" + * + * @param scoring_method + * class name for the scoring method + */ + public ExternalNodeBasedCoverageMethodOptions( final String scoring_method ) { + _scoring_method = scoring_method; + } + + public String asString() { + final StringBuffer sb = new StringBuffer(); + sb.append( "scoring method: " ); + BranchCountingBasedScoringMethod scoring_method; + try { + scoring_method = ( BranchCountingBasedScoringMethod ) ( Class.forName( getScoringMethod() ) ).newInstance(); + } + catch ( final Exception e ) { + sb.append( "?" ); + return sb.toString(); + } + sb.append( scoring_method.getDesciption() ); + return sb.toString(); + } + + public String getScoringMethod() { + return _scoring_method; + } +} diff --git a/forester/java/src/org/forester/pccx/LogBranchLengthBasedScoringMethod.java b/forester/java/src/org/forester/pccx/LogBranchLengthBasedScoringMethod.java new file mode 100644 index 0000000..9264e0e --- /dev/null +++ b/forester/java/src/org/forester/pccx/LogBranchLengthBasedScoringMethod.java @@ -0,0 +1,85 @@ +// $Id: +// cmzmasek Exp $ +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.pccx; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; + +/* + * + * @author Christian M. Zmasek + */ +public class LogBranchLengthBasedScoringMethod extends BranchCountingBasedScoringMethod { + + public static final double MIN_ALLOWED_BL_VALUE = 0.0001; + public static final double MAX_ALLOWED_BL_VALUE = 1.0; + + @Override + double calculateScoreContributionPerExternalNode( final PhylogenyNode external_node, + final PhylogenyNode current_node ) { + double score_contribution = 0.0; + if ( current_node == external_node ) { + score_contribution = external_node.getDistanceToParent(); + // This, of course, is completely /ad hoc/. 
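[Editor's note] ExternalNodeBasedCoverageMethodOptions above carries nothing but the fully qualified class name of the scoring method. That class is created reflectively and cast to BranchCountingBasedScoringMethod, so in practice it has to extend that class and provide a public no-argument constructor. A brief sketch; the three class names are the ones exercised in TestPccx:

import org.forester.pccx.ExternalNodeBasedCoverageMethodOptions;

public class OptionsSketch {

    public static void main( final String[] args ) {
        final String[] scoring_methods = { "org.forester.pccx.BranchCountingBasedScoringMethod",
                                           "org.forester.pccx.BranchLengthBasedScoringMethod",
                                           "org.forester.pccx.LogBranchLengthBasedScoringMethod" };
        for( final String name : scoring_methods ) {
            final ExternalNodeBasedCoverageMethodOptions options = new ExternalNodeBasedCoverageMethodOptions( name );
            // asString() instantiates the named class reflectively and appends its description,
            // or "?" if the class cannot be instantiated
            System.out.println( options.asString() );
        }
    }
}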
+ } + else { + score_contribution = ModelingUtils.calculateBranchLengthSum( external_node, current_node ); + } + if ( score_contribution > LogBranchLengthBasedScoringMethod.MAX_ALLOWED_BL_VALUE ) { + score_contribution = LogBranchLengthBasedScoringMethod.MAX_ALLOWED_BL_VALUE; + } + else if ( score_contribution < LogBranchLengthBasedScoringMethod.MIN_ALLOWED_BL_VALUE ) { + score_contribution = LogBranchLengthBasedScoringMethod.MIN_ALLOWED_BL_VALUE; + } + return ( -Math.log( score_contribution ) ); + } + + @Override + public String getDesciption() { + return "sum of -ln(branch-length-sum) [for self: -ln(branch-length)] [min branch length: " + + LogBranchLengthBasedScoringMethod.MIN_ALLOWED_BL_VALUE + ", max branch length: " + + LogBranchLengthBasedScoringMethod.MAX_ALLOWED_BL_VALUE + "]"; + } + + @Override + public double getNormalizationFactor( final Phylogeny phylogeny ) { + double s = 0.0; + double d = 0.0; + for( final PhylogenyNodeIterator iter = phylogeny.iteratorExternalForward(); iter.hasNext(); ) { + d = iter.next().getDistanceToParent(); + if ( d > LogBranchLengthBasedScoringMethod.MAX_ALLOWED_BL_VALUE ) { + d = LogBranchLengthBasedScoringMethod.MAX_ALLOWED_BL_VALUE; + } + else if ( d < LogBranchLengthBasedScoringMethod.MIN_ALLOWED_BL_VALUE ) { + d = LogBranchLengthBasedScoringMethod.MIN_ALLOWED_BL_VALUE; + } + s += ( -Math.log( d ) ); + } + return 1 / s; + } +} diff --git a/forester/java/src/org/forester/pccx/ModelingUtils.java b/forester/java/src/org/forester/pccx/ModelingUtils.java new file mode 100644 index 0000000..44eb20f --- /dev/null +++ b/forester/java/src/org/forester/pccx/ModelingUtils.java @@ -0,0 +1,81 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.pccx; + +import java.util.SortedMap; +import java.util.TreeMap; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; + +/* + * @author Christian M. 
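[Editor's note] The log variant above clamps the branch-length sum into [MIN_ALLOWED_BL_VALUE, MAX_ALLOWED_BL_VALUE] and returns its negative natural log, so short paths (densely sampled regions) score high and any path of total length 1.0 or more contributes nothing. A short comparison sketch of the three normalization factors, each defined so that a fully covered tree normalizes to 1.0:

import org.forester.io.parsers.nhx.NHXParser;
import org.forester.pccx.BranchCountingBasedScoringMethod;
import org.forester.pccx.BranchLengthBasedScoringMethod;
import org.forester.pccx.LogBranchLengthBasedScoringMethod;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;

public class NormalizationSketch {

    public static void main( final String[] args ) throws Exception {
        final Phylogeny p = ParserBasedPhylogenyFactory.getInstance()
                .create( "((((A:0.1,B:0.7):0.2,C:1.0):2.0,D:1.7):1.3,((E:0.3,F:0.4):1.1,(G:0.5,H:0.6):1.2):1.4,X:2.0)",
                         new NHXParser() )[ 0 ];
        // branch counting: 1 / number of external nodes (here 1/9)
        System.out.println( new BranchCountingBasedScoringMethod().getNormalizationFactor( p ) );
        // branch length: 1 / sum over leaves of 1/branch-length (lengths clamped at 0.001)
        System.out.println( new BranchLengthBasedScoringMethod().getNormalizationFactor( p ) );
        // log branch length: 1 / sum over leaves of -ln(branch-length), lengths clamped into [0.0001, 1.0]
        System.out.println( new LogBranchLengthBasedScoringMethod().getNormalizationFactor( p ) );
    }
}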
Zmasek + */ +public final class ModelingUtils { + + static double calculateBranchLengthSum( final PhylogenyNode n1, final PhylogenyNode n2 ) { + final PhylogenyNode lca = PhylogenyMethods.getInstance().obtainLCA( n1, n2 ); + return ModelingUtils.calculateBranchLengthSumHelper( n1, lca ) + + ModelingUtils.calculateBranchLengthSumHelper( n2, lca ); + } + + private static double calculateBranchLengthSumHelper( final PhylogenyNode outer, final PhylogenyNode inner ) { + PhylogenyNode my_outer = outer; + double l = 0; + while ( my_outer != inner ) { + if ( my_outer.getDistanceToParent() > 0.0 ) { + l += my_outer.getDistanceToParent(); + } + my_outer = my_outer.getParent(); + } + return l; + } + + static int calculateBranchSum( final PhylogenyNode n1, final PhylogenyNode n2 ) { + final PhylogenyNode lca = PhylogenyMethods.getInstance().obtainLCA( n1, n2 ); + return ModelingUtils.calculateBranchSumHelper( n1, lca ) + ModelingUtils.calculateBranchSumHelper( n2, lca ); + } + + private static int calculateBranchSumHelper( final PhylogenyNode outer, final PhylogenyNode inner ) { + PhylogenyNode my_outer = outer; + int s = 0; + while ( my_outer != inner ) { + s++; + my_outer = my_outer.getParent(); + } + return s; + } + + static SortedMap setUpExternalCoverageHashMap( final Phylogeny phylogeny ) { + final SortedMap external_node_coverage = new TreeMap(); + for( final PhylogenyNodeIterator iter = phylogeny.iteratorExternalForward(); iter.hasNext(); ) { + external_node_coverage.put( iter.next(), 0.0 ); + } + return external_node_coverage; + } +} diff --git a/forester/java/src/org/forester/pccx/ScoringMethodForExternalNode.java b/forester/java/src/org/forester/pccx/ScoringMethodForExternalNode.java new file mode 100644 index 0000000..121bc4e --- /dev/null +++ b/forester/java/src/org/forester/pccx/ScoringMethodForExternalNode.java @@ -0,0 +1,80 @@ +// $Id: +// $ +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.pccx; + +import java.util.SortedMap; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; + +/* + * Interface providing implementations of scoring methods used by + * ExternalNodeBasedCoverageMethod. + * + * @author Christian M. Zmasek + */ +public interface ScoringMethodForExternalNode { + + /** + * This calculates the coverage score for one external node. 
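[Editor's note] ModelingUtils above does the path computations both scoring flavours rely on: it walks each node up to the LCA (obtained via PhylogenyMethods.getInstance().obtainLCA) and sums either branch counts or branch lengths, skipping non-positive lengths. A tiny sketch, again inside org.forester.pccx because these helpers are package-private; the expected values follow from the ps1 test tree (A:0.1, B:0.7, their parent at 0.2, C:1.0):

package org.forester.pccx; // calculateBranchSum / calculateBranchLengthSum are package-private

import org.forester.phylogeny.Phylogeny;

public class ModelingUtilsSketch {

    // 'p' is assumed to be the ps1 tree used in TestPccx
    static void show( final Phylogeny p ) {
        // A -> (A,B) -> ((A,B),C) <- C : three branch segments
        System.out.println( ModelingUtils.calculateBranchSum( p.getNode( "A" ), p.getNode( "C" ) ) );       // 3
        // 0.1 + 0.2 on A's side plus 1.0 on C's side = 1.3 (hence the 1/1.3 term in the tests)
        System.out.println( ModelingUtils.calculateBranchLengthSum( p.getNode( "A" ), p.getNode( "C" ) ) ); // 1.3
    }
}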
+ * + * + * @param external_node_scores + * SortedMap in which the external node + * scores are stored (node->score) + * @param phylogeny + * Phylogeny containing the external nodes to score + * @param external_node + * PhylogenyNod for which to calculate the score + * @param options + * CoverageCalculationOptions + * @param annotate_phylogeny + * + */ + public void calculateScoreForExternalNode( final SortedMap external_node_scores, + final Phylogeny phylogeny, + final PhylogenyNode external_node, + final CoverageCalculationOptions options ); + + /** + * This returns a short description of this scoring method + * + * @return short description of this scoring method + */ + public String getDesciption(); + + /** + * This calculates a normalization factor, so that a normalized score of 1.0 + * means complete coverage. + * + * + * @param phylogeny + * Phylogeny containing the external nodes to score + * @return normalization factor + */ + public double getNormalizationFactor( final Phylogeny phylogeny ); +} diff --git a/forester/java/src/org/forester/pccx/TestPccx.java b/forester/java/src/org/forester/pccx/TestPccx.java new file mode 100644 index 0000000..d5bdbdc --- /dev/null +++ b/forester/java/src/org/forester/pccx/TestPccx.java @@ -0,0 +1,246 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.pccx; + +import java.util.ArrayList; +import java.util.List; + +import org.forester.io.parsers.nhx.NHXParser; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; + +/* + * @author Christian M. 
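[Editor's note] ScoringMethodForExternalNode is the extension point for new scoring schemes, but since the calling code casts reflectively created instances to BranchCountingBasedScoringMethod, the practical way to plug one in is to subclass that class and override the per-node contribution, exactly as the two branch-length variants do. A hypothetical sketch; the class name and the squared-distance rule are invented here for illustration only:

package org.forester.pccx; // the overridden contribution method is package-private

import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyNode;

// Hypothetical example: squared branch-count scoring, a steeper penalty for distant nodes.
public class SquaredBranchCountingScoringMethod extends BranchCountingBasedScoringMethod {

    @Override
    double calculateScoreContributionPerExternalNode( final PhylogenyNode external_node,
                                                      final PhylogenyNode current_node ) {
        if ( current_node == external_node ) {
            return 1.0;
        }
        final int branches = ModelingUtils.calculateBranchSum( external_node, current_node );
        return 1.0 / ( branches * ( double ) branches );
    }

    @Override
    public String getDesciption() { // spelling follows the interface declaration
        return "sum of 1/(branch-segment-sum)^2";
    }

    @Override
    public double getNormalizationFactor( final Phylogeny phylogeny ) {
        // each fully covered external node contributes 1.0, so 1/N normalizes complete coverage to 1.0
        return 1.0 / phylogeny.getNumberOfExternalNodes();
    }
}

Such a class could then be selected by name through ExternalNodeBasedCoverageMethodOptions, just like the built-in methods.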
Zmasek + */ +public class TestPccx { + + private final static double ZERO_DIFF = 1.0E-6; + + private static boolean isEqual( final double a, final double b ) { + return ( ( Math.abs( a - b ) ) < TestPccx.ZERO_DIFF ); + } + + public static boolean test() { + if ( !TestPccx.testExternalNodeBasedCoverage() ) { + return false; + } + return true; + } + + private static boolean testExternalNodeBasedCoverage() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final String ps1 = "((((A:0.1,B:0.7):0.2,C:1.0):2.0,D:1.7):1.3,((E:0.3,F:0.4):1.1,(G:0.5,H:0.6):1.2):1.4,X:2.0)"; + final Phylogeny p1 = factory.create( ps1, new NHXParser() )[ 0 ]; + final List phylogenies = new ArrayList(); + final List names = new ArrayList(); + phylogenies.add( p1 ); + names.add( "A" ); + names.add( "A" ); + final CoverageCalculationOptions options = new ExternalNodeBasedCoverageMethodOptions( "org.forester.pccx.BranchCountingBasedScoringMethod" ); + final CoverageCalculator cc = CoverageCalculator.getInstance( new ExternalNodeBasedCoverageMethod(), + options ); + Coverage cov = cc.calculateCoverage( phylogenies, names, false ); + if ( !TestPccx.isEqual( cov.getScore(), ( 1.0 + 1.0 / 2 + 1.0 / 3 + 1.0 / 4 + 1.0 / 7 + 1.0 / 7 + 1.0 / 7 + + 1.0 / 7 + 1.0 / 5 ) / 9 ) ) { + return false; + } + names.add( "B" ); + names.add( "B" ); + cov = cc.calculateCoverage( phylogenies, names, false ); + if ( !TestPccx.isEqual( cov.getScore(), ( 1.0 + 1.0 + 1.0 / 3 + 1.0 / 4 + 1.0 / 7 + 1.0 / 7 + 1.0 / 7 + 1.0 + / 7 + 1.0 / 5 ) / 9 ) ) { + return false; + } + names.add( "G" ); + cov = cc.calculateCoverage( phylogenies, names, false ); + if ( !TestPccx + .isEqual( cov.getScore(), + ( 1.0 + 1.0 + 1.0 / 3 + 1.0 / 4 + 1.0 / 4 + 1.0 / 4 + 1.0 + 1.0 / 2 + 1.0 / 4 ) / 9 ) ) { + return false; + } + names.add( "E" ); + cov = cc.calculateCoverage( phylogenies, names, false ); + if ( !TestPccx.isEqual( cov.getScore(), + ( 1.0 + 1.0 + 1.0 / 3 + 1.0 / 4 + 1.0 + 1.0 / 2 + 1.0 + 1.0 / 2 + 1.0 / 4 ) / 9 ) ) { + return false; + } + names.add( "X" ); + cov = cc.calculateCoverage( phylogenies, names, false ); + if ( !TestPccx.isEqual( cov.getScore(), + ( 1.0 + 1.0 + 1.0 / 3 + 1.0 / 3 + 1.0 + 1.0 / 2 + 1.0 + 1.0 / 2 + 1.0 ) / 9 ) ) { + return false; + } + names.add( "C" ); + names.add( "C" ); + names.add( "C" ); + cov = cc.calculateCoverage( phylogenies, names, false ); + if ( !TestPccx.isEqual( cov.getScore(), + ( 1.0 + 1.0 + 1.0 + 1.0 / 3 + 1.0 + 1.0 / 2 + 1.0 + 1.0 / 2 + 1.0 ) / 9 ) ) { + return false; + } + names.add( "D" ); + cov = cc.calculateCoverage( phylogenies, names, false ); + if ( !TestPccx + .isEqual( cov.getScore(), ( 1.0 + 1.0 + 1.0 + 1.0 + 1.0 + 1.0 / 2 + 1.0 + 1.0 / 2 + 1.0 ) / 9 ) ) { + return false; + } + names.add( "F" ); + cov = cc.calculateCoverage( phylogenies, names, false ); + if ( !TestPccx.isEqual( cov.getScore(), ( 1.0 + 1.0 + 1.0 + 1.0 + 1.0 + 1.0 + 1.0 + 1.0 / 2 + 1.0 ) / 9 ) ) { + return false; + } + names.add( "H" ); + cov = cc.calculateCoverage( phylogenies, names, false ); + if ( !TestPccx.isEqual( cov.getScore(), ( 1.0 + 1.0 + 1.0 + 1.0 + 1.0 + 1.0 + 1.0 + 1.0 + 1.0 ) / 9 ) ) { + return false; + } + final CoverageExtender ce = new BasicExternalNodeBasedCoverageExtender(); + List l = ce + .find( phylogenies, + null, + 0, + new ExternalNodeBasedCoverageMethodOptions( "org.forester.pccx.BranchCountingBasedScoringMethod" ), + null ); + if ( !l.get( 0 ).equals( "X" ) ) { + return false; + } + if ( !l.get( 1 ).equals( "A" ) ) { + return false; + } + if ( !l.get( 2 ).equals( "E" ) ) { + 
return false; + } + if ( !l.get( 3 ).equals( "G" ) ) { + return false; + } + if ( !l.get( 4 ).equals( "C" ) ) { + return false; + } + if ( !l.get( 5 ).equals( "D" ) ) { + return false; + } + if ( !l.get( 6 ).equals( "B" ) ) { + return false; + } + if ( !l.get( 7 ).equals( "F" ) ) { + return false; + } + if ( !l.get( 8 ).equals( "H" ) ) { + return false; + } + final List already_covered = new ArrayList(); + already_covered.add( "A" ); + already_covered.add( "X" ); + already_covered.add( "H" ); + already_covered.add( "C" ); + l = ce + .find( phylogenies, + already_covered, + 0, + new ExternalNodeBasedCoverageMethodOptions( "org.forester.pccx.BranchCountingBasedScoringMethod" ), + null ); + if ( !l.get( 0 ).equals( "E" ) ) { + return false; + } + if ( !l.get( 1 ).equals( "D" ) ) { + return false; + } + if ( !l.get( 2 ).equals( "B" ) ) { + return false; + } + if ( !l.get( 3 ).equals( "F" ) ) { + return false; + } + if ( !l.get( 4 ).equals( "G" ) ) { + return false; + } + final String ps2 = "((((A:0.1,B:0.7):0.2,C:1.0):2.0,D:1.7):1.3,((E:0.3,F:0.4):1.1,(G:0.5,H:0.6):1.2):1.4,X:2.0)"; + final String ps3 = "((((A:0.1,B:0.1):0.2,C:1.0):2.0,D:1.7):1.3,((E:0.3,F:0.4):1.1,(G:0.5,H:0.6):1.2):1.4,X:2.0)"; + final String ps4 = "((((A:0.1,B:0.05):0.2,C:1.0):2.0,D:1.7):1.3,((E:0.3,F:0.4):1.1,(G:0.5,H:0.6):1.2):1.4,X:2.0)"; + final Phylogeny p2 = factory.create( ps2, new NHXParser() )[ 0 ]; + final Phylogeny p3 = factory.create( ps3, new NHXParser() )[ 0 ]; + final Phylogeny p4 = factory.create( ps4, new NHXParser() )[ 0 ]; + final List phylogenies2 = new ArrayList(); + final List names2 = new ArrayList(); + phylogenies2.add( p2 ); + phylogenies2.add( p3 ); + phylogenies2.add( p4 ); + names2.add( "A" ); + names2.add( "A" ); + final CoverageCalculationOptions options2 = new ExternalNodeBasedCoverageMethodOptions( "org.forester.pccx.BranchLengthBasedScoringMethod" ); + final CoverageCalculator cc2 = CoverageCalculator.getInstance( new ExternalNodeBasedCoverageMethod(), + options2 ); + Coverage cov2 = cc2.calculateCoverage( phylogenies2, names2, false ); + final double nf = 1 / ( 1 / 0.1 + 1 / 0.7 + 1 / 1.0 + 1 / 1.7 + 1 / 0.3 + 1 / 0.4 + 1 / 0.5 + 1 / 0.6 + 1 / 2.0 ); + if ( !TestPccx.isEqual( cov2.getScore(), ( 1 / 0.1 + ( 1 / 0.8 + 1 / 0.2 + 1 / 0.15 ) / 3 + 1 / 1.3 + 1 + / 4.0 + 1 / 6.4 + 1 / 6.5 + 1 / 6.7 + 1 / 6.8 + 1 / 5.6 ) + * nf ) ) { + return false; + } + names2.add( "C" ); + cov2 = cc2.calculateCoverage( phylogenies2, names2, false ); + if ( !TestPccx.isEqual( cov2.getScore(), ( 1 / 0.1 + ( 1 / 0.8 + 1 / 0.2 + 1 / 0.15 ) / 3 + 1 / 1.0 + 1 + / 4.0 + 1 / 6.4 + 1 / 6.5 + 1 / 6.7 + 1 / 6.8 + 1 / 5.6 ) + * nf ) ) { + return false; + } + names2.add( "E" ); + cov2 = cc2.calculateCoverage( phylogenies2, names2, false ); + if ( !TestPccx.isEqual( cov2.getScore(), ( 1 / 0.1 + ( 1 / 0.8 + 1 / 0.2 + 1 / 0.15 ) / 3 + 1 / 1.0 + +1 + / 4.0 + 1 / 0.3 + 1 / 0.7 + 1 / 3.1 + 1 / 3.2 + 1 / 4.8 ) + * nf ) ) { + return false; + } + final CoverageCalculationOptions options_log = new ExternalNodeBasedCoverageMethodOptions( "org.forester.pccx.LogBranchLengthBasedScoringMethod" ); + final CoverageCalculator cclog = CoverageCalculator.getInstance( new ExternalNodeBasedCoverageMethod(), + options_log ); + final Coverage cov_log = cclog.calculateCoverage( phylogenies2, names2, false ); + if ( !TestPccx.isEqual( cov_log.getScore(), 0.8534252108361485 ) ) { + return false; + } + final String ps10 = 
"((((A:0.1,B:0.7):0.2,C:1.0):2.0,D:1.7):1.3,((E:0.3,F:0.4):1.1,(G:0.5,H:0.6):1.2):1.4,((((I:0.1,J:0.7):0.2,K:1.0):2.0,L:1.7):1.3,((M:0.3,N:0.4,O:0.1,P:0.2):1.1,(Q:0.5,R:0.6):1.2):1.4,S:2.0):2.0)"; + final Phylogeny p10 = factory.create( ps10, new NHXParser() )[ 0 ]; + final List phylogenies10 = new ArrayList(); + final List names10 = new ArrayList(); + phylogenies10.add( p10 ); + names10.add( "A" ); + names10.add( "B" ); + names10.add( "N" ); + names10.add( "O" ); + final CoverageCalculationOptions options10 = new ExternalNodeBasedCoverageMethodOptions( "org.forester.pccx.BranchCountingBasedScoringMethod" ); + final CoverageCalculator cc10 = CoverageCalculator.getInstance( new ExternalNodeBasedCoverageMethod(), + options10 ); + cc10.calculateCoverage( phylogenies10, names10, true ); + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } +} diff --git a/forester/java/src/org/forester/phylogeny/Edge.java b/forester/java/src/org/forester/phylogeny/Edge.java new file mode 100644 index 0000000..234b2c1 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/Edge.java @@ -0,0 +1,45 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny; + +import org.forester.phylogeny.data.PhylogenyData; + +/* + * @author Christian Zmasek + * + * Interface for edges connecting nodes, for example branches in a phylgenetic + * network/tree. + */ +public interface Edge { + + public PhylogenyData getData(); + + public PhylogenyNode getFirstNode(); + + public PhylogenyNode getSecondNode(); +} \ No newline at end of file diff --git a/forester/java/src/org/forester/phylogeny/Phylogeny.java b/forester/java/src/org/forester/phylogeny/Phylogeny.java new file mode 100644 index 0000000..a3277ba --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/Phylogeny.java @@ -0,0 +1,1335 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. 
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Vector; + +import org.forester.io.writers.PhylogenyWriter; +import org.forester.phylogeny.data.BranchData; +import org.forester.phylogeny.data.Confidence; +import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.data.Sequence; +import org.forester.phylogeny.data.SequenceRelation; +import org.forester.phylogeny.data.SequenceRelation.SEQUENCE_RELATION_TYPE; +import org.forester.phylogeny.iterators.ExternalForwardIterator; +import org.forester.phylogeny.iterators.LevelOrderTreeIterator; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.phylogeny.iterators.PostorderTreeIterator; +import org.forester.phylogeny.iterators.PreorderTreeIterator; +import org.forester.util.FailedConditionCheckException; +import org.forester.util.ForesterUtil; + +public class Phylogeny { + + public final static boolean ALLOW_MULTIPLE_PARENTS_DEFAULT = false; + private PhylogenyNode _root; + private boolean _rooted; + private boolean _allow_multiple_parents; + private String _name; + private String _type; + private String _description; + private String _distance_unit; + private Confidence _confidence; + private Identifier _identifier; + private boolean _rerootable; + private HashMap _idhash; + private List _external_nodes_set; + private Collection _sequenceRelationQueries; + private Collection _relevant_sequence_relation_types; + + /** + * Default Phylogeny constructor. Constructs an empty Phylogeny. + */ + public Phylogeny() { + init(); + } + + /** + * Adds this Phylogeny to the list of child nodes of PhylogenyNode parent + * and sets the parent of this to parent. + * + * @param n + * the PhylogenyNode to add + */ + public void addAsChild( final PhylogenyNode parent ) { + if ( isEmpty() ) { + throw new IllegalArgumentException( "Attempt to add an empty tree." ); + } + if ( !isRooted() ) { + throw new IllegalArgumentException( "Attempt to add an unrooted tree." ); + } + parent.addAsChild( getRoot() ); + externalNodesHaveChanged(); + } + + public void addAsSibling( final PhylogenyNode sibling ) { + if ( isEmpty() ) { + throw new IllegalArgumentException( "Attempt to add an empty tree." 
); + } + if ( !isRooted() ) { + throw new IllegalArgumentException( "Attempt to add an unrooted tree." ); + } + final int sibling_index = sibling.getChildNodeIndex(); + final PhylogenyNode new_node = new PhylogenyNode(); + final PhylogenyNode sibling_parent = sibling.getParent(); + new_node.setChild1( sibling ); + new_node.setChild2( getRoot() ); + new_node.setParent( sibling_parent ); + sibling.setParent( new_node ); + sibling_parent.setChildNode( sibling_index, new_node ); + final double new_dist = sibling.getDistanceToParent() == PhylogenyNode.DISTANCE_DEFAULT ? PhylogenyNode.DISTANCE_DEFAULT + : sibling.getDistanceToParent() / 2; + new_node.setDistanceToParent( new_dist ); + sibling.setDistanceToParent( new_dist ); + externalNodesHaveChanged(); + } + + /** + * This calculates the height of the subtree emanating at n for rooted, + * tree-shaped phylogenies + * + * @param n + * the root-node of a subtree + * @return the height of the subtree emanating at n + */ + public double calculateSubtreeHeight( final PhylogenyNode n ) { + if ( n.isExternal() || n.isCollapse() ) { + return ForesterUtil.isLargerOrEqualToZero( n.getDistanceToParent() ); + } + else { + double max = -Double.MAX_VALUE; + for( int i = 0; i < n.getNumberOfDescendants(); ++i ) { + final double l = calculateSubtreeHeight( n.getChildNode( i ) ); + if ( l > max ) { + max = l; + } + } + return max + ForesterUtil.isLargerOrEqualToZero( n.getDistanceToParent() ); + } + } + + /** + * Returns a deep copy of this Phylogeny. + *

+ * (The resulting Phylogeny has its references in the external nodes + * corrected, if they are lacking/obsolete in this.) + */ + public Phylogeny copy() { + return copy( _root ); + } + + /** + * Returns a shallow copy of this Phylogeny. + *

+ * (The resulting Phylogeny has its references in the external nodes + * corrected, if they are lacking/obsolete in this.) + */ + public Phylogeny copyShallow() { + return copyShallow( _root ); + } + + public Phylogeny copyShallow( final PhylogenyNode source ) { + final Phylogeny tree = new Phylogeny(); + if ( isEmpty() ) { + tree.init(); + return tree; + } + tree._rooted = _rooted; + tree._name = _name; + tree._description = _description; + tree._type = _type; + tree._rerootable = _rerootable; + tree._distance_unit = _distance_unit; + tree._confidence = _confidence; + tree._identifier = _identifier; + tree.setAllowMultipleParents( isAllowMultipleParents() ); + tree._root = PhylogenyMethods.copySubTreeShallow( source ); + return tree; + } + + /** + * Returns a deep copy of this Phylogeny. + *
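As a quick illustration of the two copy flavours implemented above, a minimal sketch (assuming a non-empty Phylogeny instance named phy):

    Phylogeny deep = phy.copy();            // deep copy: nodes are rebuilt via PhylogenyMethods.copySubTree()
    Phylogeny shallow = phy.copyShallow();  // shallow copy: copySubTreeShallow() reuses the existing node data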

+ * (The resulting Phylogeny has its references in the external nodes + * corrected, if they are lacking/obsolete in this.) + */ + public Phylogeny copy( final PhylogenyNode source ) { + final Phylogeny tree = new Phylogeny(); + if ( isEmpty() ) { + tree.init(); + return tree; + } + tree._rooted = _rooted; + tree._name = new String( _name ); + tree._description = new String( _description ); + tree._type = new String( _type ); + tree._rerootable = _rerootable; + tree._distance_unit = new String( _distance_unit ); + if ( _confidence != null ) { + tree._confidence = ( Confidence ) _confidence.copy(); + } + if ( _identifier != null ) { + tree._identifier = ( Identifier ) _identifier.copy(); + } + tree.setAllowMultipleParents( isAllowMultipleParents() ); + tree._root = PhylogenyMethods.copySubTree( source ); + return tree; + } + + /** + * Need the delete and/or rehash _idhash (not done automatically + * to allow client multiple deletions in linear time). + * Need to call 'recalculateNumberOfExternalDescendants(boolean)' after this + * if tree is to be displayed. + * + * @param remove_us the parent node of the subtree to be deleted + */ + public void deleteSubtree( final PhylogenyNode remove_us, final boolean collapse_resulting_node_with_one_desc ) { + if ( isEmpty() ) { + return; + } + if ( remove_us.isRoot() ) { + init(); + return; + } + if ( !collapse_resulting_node_with_one_desc ) { + remove_us.getParent().removeChildNode( remove_us ); + } + else { + final PhylogenyNode removed_node = remove_us; + final PhylogenyNode p = remove_us.getParent(); + if ( p.isRoot() ) { + if ( p.getNumberOfDescendants() == 2 ) { + if ( removed_node.isFirstChildNode() ) { + setRoot( getRoot().getChildNode( 1 ) ); + getRoot().setParent( null ); + } + else { + setRoot( getRoot().getChildNode( 0 ) ); + getRoot().setParent( null ); + } + } + else { + p.removeChildNode( removed_node.getChildNodeIndex() ); + } + } + else { + final PhylogenyNode pp = removed_node.getParent().getParent(); + if ( p.getNumberOfDescendants() == 2 ) { + final int pi = p.getChildNodeIndex(); + if ( removed_node.isFirstChildNode() ) { + p.getChildNode( 1 ).setDistanceToParent( PhylogenyMethods.addPhylogenyDistances( p + .getDistanceToParent(), p.getChildNode( 1 ).getDistanceToParent() ) ); + pp.setChildNode( pi, p.getChildNode( 1 ) ); + } + else { + p.getChildNode( 0 ).setDistanceToParent( PhylogenyMethods.addPhylogenyDistances( p + .getDistanceToParent(), p.getChildNode( 0 ).getDistanceToParent() ) ); + pp.setChildNode( pi, p.getChildNode( 0 ) ); + } + } + else { + p.removeChildNode( removed_node.getChildNodeIndex() ); + } + } + } + remove_us.setParent( null ); + setIdHash( null ); + externalNodesHaveChanged(); + } + + public void externalNodesHaveChanged() { + _external_nodes_set = null; + } + + public String[] getAllExternalNodeNames() { + int i = 0; + if ( isEmpty() ) { + return null; + } + final String[] names = new String[ getNumberOfExternalNodes() ]; + for( final PhylogenyNodeIterator iter = iteratorExternalForward(); iter.hasNext(); ) { + names[ i++ ] = new String( iter.next().getName() ); + } + return names; + } + + public Confidence getConfidence() { + return _confidence; + } + + public String getDescription() { + return _description; + } + + public String getDistanceUnit() { + return _distance_unit; + } + + /** + * + * Warning. The order of the returned nodes is random + * -- and hence cannot be relied on. 
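A usage sketch for deleteSubtree(...) defined above, following the clean-up calls its javadoc asks for (phy and node are assumed to exist; node is the subtree to remove):

    phy.deleteSubtree( node, true );                     // 'true' also removes a parent left with a single descendant
    phy.recalculateNumberOfExternalDescendants( true );  // required before the tree is displayed again
    phy.hashIDs();                                       // deleteSubtree() nulls the id hash; rebuild it if getNode( id ) is still needed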
+ * + * @return Unordered set of PhylogenyNode + */ + public List getExternalNodes() { + if ( _external_nodes_set == null ) { + _external_nodes_set = new ArrayList(); + for( final PhylogenyNodeIterator it = iteratorPostorder(); it.hasNext(); ) { + final PhylogenyNode n = it.next(); + if ( n.isExternal() ) { + _external_nodes_set.add( n ); + } + } + } + return _external_nodes_set; + } + + /** + * Returns the number of duplications of this Phylogeny (int). A return + * value of -1 indicates that the number of duplications is unknown. + */ + // public int getNumberOfDuplications() { + // return _number_of_duplications; + // } // getNumberOfDuplications() + /** + * Sets the number of duplications of this Phylogeny (int). A value of -1 + * indicates that the number of duplications is unknown. + * + * @param clean_nh + * set to true for clean NH format + */ + // public void setNumberOfDuplications( int i ) { + // if ( i < 0 ) { + // _number_of_duplications = -1; + // } + // else { + // _number_of_duplications = i; + // } + // } // setNumberOfDuplications( int ) + /** + * Returns the first external PhylogenyNode. + */ + public PhylogenyNode getFirstExternalNode() { + if ( isEmpty() ) { + throw new FailedConditionCheckException( "attempt to obtain first external node of empty phylogeney" ); + } + PhylogenyNode node = getRoot(); + while ( node.isInternal() ) { + node = node.getFirstChildNode(); + } + return node; + } + + /** + * This calculates the height for rooted, tree-shaped phylogenies. The + * height is the longest distance from the root to an external node. Please + * note. Child nodes of collapsed nodes are ignored -- which is useful for + * display purposes but might be misleading for other applications. + * + * @return the height for rooted, tree-shaped phylogenies + */ + public double getHeight() { + if ( isEmpty() ) { + return 0.0; + } + return calculateSubtreeHeight( getRoot() ); + } + + public Identifier getIdentifier() { + return _identifier; + } + + // --------------------------------------------------------- + // Modification of Phylogeny topology and Phylogeny appearance + // --------------------------------------------------------- + private HashMap getIdHash() { + return _idhash; + } + + /** + * Returns the name of this Phylogeny. + */ + public String getName() { + return _name; + } + + /** + * Finds the PhylogenyNode of this Phylogeny which has a matching ID number. + * Takes O(n) time. After method hashIDs() has been called it runs in + * constant time. + * + * @param id + * ID number (int) of the PhylogenyNode to find + * @return PhylogenyNode with matching ID, null if not found + */ + public PhylogenyNode getNode( final int id ) throws NoSuchElementException { + if ( isEmpty() ) { + throw new NoSuchElementException( "attempt to get node in an empty phylogeny" ); + } + if ( _idhash != null ) { + return _idhash.get( id ); + } + else { + for( final PhylogenyNodeIterator iter = iteratorPreorder(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + if ( node.getId() == id ) { + return node; + } + } + } + return null; + } + + /** + * Returns a PhylogenyNode of this Phylogeny which has a matching name. + * Throws an Exception if seqname is not present in this or not unique. 
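A short sketch of the read-only accessors above (assuming a rooted, non-empty Phylogeny phy with branch lengths):

    final double height = phy.getHeight();                   // longest root-to-tip distance; children of collapsed nodes are ignored
    final int tips = phy.getNumberOfExternalNodes();
    final PhylogenyNode first = phy.getFirstExternalNode();  // the external node reached by always taking the first child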
+ * + * @param name + * name (String) of PhylogenyNode to find + * @return PhylogenyNode with matchin name + */ + public PhylogenyNode getNode( final String name ) { + if ( isEmpty() ) { + return null; + } + final List nodes = getNodes( name ); + if ( ( nodes == null ) || ( nodes.size() < 1 ) ) { + throw new IllegalArgumentException( "node named [" + name + "] not found" ); + } + if ( nodes.size() > 1 ) { + throw new IllegalArgumentException( "node named [" + name + "] not unique" ); + } + return nodes.get( 0 ); + } + + /** + * Return Node by TaxonomyId Olivier CHABROL : + * olivier.chabrol@univ-provence.fr + * + * @param taxonomyID + * search taxonomy identifier + * @param nodes + * sublist node to search + * @return List node with the same taxonomy identifier + */ + private List getNodeByTaxonomyID( final String taxonomyID, final List nodes ) { + final List retour = new ArrayList(); + for( final PhylogenyNode node : nodes ) { + if ( taxonomyID.equals( PhylogenyMethods.getTaxonomyIdentifier( node ) ) ) { + retour.add( node ); + } + } + return retour; + } + + /** + * Returns a List with references to all Nodes of this Phylogeny which have + * a matching name. + * + * @param name + * name (String) of Nodes to find + * @return Vector of references to Nodes of this Phylogeny with matching + * names + * @see #getNodesWithMatchingSpecies(String) + */ + public List getNodes( final String name ) { + if ( isEmpty() ) { + return null; + } + final List nodes = new ArrayList(); + for( final PhylogenyNodeIterator iter = iteratorPreorder(); iter.hasNext(); ) { + final PhylogenyNode n = iter.next(); + if ( n.getName().equals( name ) ) { + nodes.add( n ); + } + } + return nodes; + } + + public List getNodesViaSequenceName( final String seq_name ) { + if ( isEmpty() ) { + return null; + } + final List nodes = new ArrayList(); + for( final PhylogenyNodeIterator iter = iteratorPreorder(); iter.hasNext(); ) { + final PhylogenyNode n = iter.next(); + if ( n.getNodeData().isHasSequence() && n.getNodeData().getSequence().getName().equals( seq_name ) ) { + nodes.add( n ); + } + } + return nodes; + } + + public List getNodesViaTaxonomyCode( final String taxonomy_code ) { + if ( isEmpty() ) { + return null; + } + final List nodes = new ArrayList(); + for( final PhylogenyNodeIterator iter = iteratorPreorder(); iter.hasNext(); ) { + final PhylogenyNode n = iter.next(); + if ( n.getNodeData().isHasTaxonomy() + && n.getNodeData().getTaxonomy().getTaxonomyCode().equals( taxonomy_code ) ) { + nodes.add( n ); + } + } + return nodes; + } + + /** + * Returns a Vector with references to all Nodes of this Phylogeny which + * have a matching species name. + * + * @param specname + * species name (String) of Nodes to find + * @return Vector of references to Nodes of this Phylogeny with matching + * species names. 
+ * @see #getNodes(String) + */ + public List getNodesWithMatchingSpecies( final String specname ) { + if ( isEmpty() ) { + return null; + } + final List nodes = new ArrayList(); + for( final PhylogenyNodeIterator iter = iteratorPreorder(); iter.hasNext(); ) { + final PhylogenyNode n = iter.next(); + if ( PhylogenyMethods.getSpecies( n ).equals( specname ) ) { + nodes.add( n ); + } + } + return nodes; + } + + public PhylogenyNode getNodeViaSequenceName( final String seq_name ) { + if ( isEmpty() ) { + return null; + } + final List nodes = getNodesViaSequenceName( seq_name ); + if ( ( nodes == null ) || ( nodes.size() < 1 ) ) { + throw new IllegalArgumentException( "node with sequence named [" + seq_name + "] not found" ); + } + if ( nodes.size() > 1 ) { + throw new IllegalArgumentException( "node with sequence named [" + seq_name + "] not unique" ); + } + return nodes.get( 0 ); + } + + public PhylogenyNode getNodeViaTaxonomyCode( final String taxonomy_code ) { + if ( isEmpty() ) { + return null; + } + final List nodes = getNodesViaTaxonomyCode( taxonomy_code ); + if ( ( nodes == null ) || ( nodes.size() < 1 ) ) { + throw new IllegalArgumentException( "node with taxonomy code \"" + taxonomy_code + "\" not found" ); + } + if ( nodes.size() > 1 ) { + throw new IllegalArgumentException( "node with taxonomy code \"" + taxonomy_code + "\" not unique" ); + } + return nodes.get( 0 ); + } + + public int getNumberOfBranches() { + if ( isEmpty() ) { + return 0; + } + int c = 0; + for( final PhylogenyNodeIterator iter = iteratorPreorder(); iter.hasNext(); iter.next() ) { + ++c; + } + if ( !isRooted() ) { + --c; + } + return c; + } + + /** + * Returns the sum of external Nodes of this Phylogeny (int). + */ + public int getNumberOfExternalNodes() { + if ( isEmpty() ) { + return 0; + } + return getExternalNodes().size(); + } + + /** + * Returns all paralogs of the external PhylogenyNode n of this Phylogeny. + * paralog are returned as List of node references. + *
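The single-node lookups above share one contract: on a non-empty Phylogeny they throw IllegalArgumentException when no match, or more than one match, exists. A sketch with hypothetical keys:

    final PhylogenyNode by_name = phy.getNode( "BCL2_HUMAN" );               // exact node-name match
    final PhylogenyNode by_seq = phy.getNodeViaSequenceName( "BCL2_HUMAN" );
    final PhylogenyNode by_tax = phy.getNodeViaTaxonomyCode( "HUMAN" );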

+ * PRECONDITION: This tree must be binary and rooted, and speciation and + * duplication events need to be assigned to each of its internal Nodes. + *

+ * Returns null if this Phylogeny is empty or if n is internal. + *

+ * (Last modified: 11/22/00) Olivier CHABROL : + * olivier.chabrol@univ-provence.fr + * + * @param n + * external PhylogenyNode whose orthologs are to be returned + * @return Vector of references to all orthologous Nodes of PhylogenyNode n + * of this Phylogeny, null if this Phylogeny is empty or if n is + * internal + */ + public List getParalogousNodes( final PhylogenyNode n, final String[] taxonomyCodeRange ) { + PhylogenyNode node = n; + PhylogenyNode prev = null; + final List v = new ArrayList(); + final Map> map = new HashMap>(); + getTaxonomyMap( getRoot(), map ); + if ( !node.isExternal() || isEmpty() ) { + return null; + } + final String searchNodeSpeciesId = PhylogenyMethods.getTaxonomyIdentifier( n ); + if ( !node.isExternal() || isEmpty() ) { + return null; + } + List taxIdList = null; + final List taxonomyCodeRangeList = Arrays.asList( taxonomyCodeRange ); + while ( !node.isRoot() ) { + prev = node; + node = node.getParent(); + taxIdList = map.get( node ); + if ( node.isDuplication() && isContains( taxIdList, taxonomyCodeRangeList ) ) { + if ( node.getChildNode1() == prev ) { + v.addAll( getNodeByTaxonomyID( searchNodeSpeciesId, node.getChildNode2() + .getAllExternalDescendants() ) ); + } + else { + v.addAll( getNodeByTaxonomyID( searchNodeSpeciesId, node.getChildNode1() + .getAllExternalDescendants() ) ); + } + } + } + return v; + } + + public Collection getRelevantSequenceRelationTypes() { + if ( _relevant_sequence_relation_types == null ) { + _relevant_sequence_relation_types = new Vector(); + } + return _relevant_sequence_relation_types; + } + + /** + * Returns the root PhylogenyNode of this Phylogeny. + */ + public PhylogenyNode getRoot() { + return _root; + } + + public Collection getSequenceRelationQueries() { + return _sequenceRelationQueries; + } + + /** + * List all species contains in all leaf under a node Olivier CHABROL : + * olivier.chabrol@univ-provence.fr + * + * @param node + * PhylogenyNode whose sub node species are returned + * @return species contains in all leaf under the param node + */ + private List getSubNodeTaxonomy( final PhylogenyNode node ) { + final List taxonomyList = new ArrayList(); + final List childs = node.getAllExternalDescendants(); + String speciesId = null; + for( final PhylogenyNode phylogenyNode : childs ) { + // taxId = new Long(phylogenyNode.getTaxonomyID()); + speciesId = PhylogenyMethods.getTaxonomyIdentifier( phylogenyNode ); + if ( !taxonomyList.contains( speciesId ) ) { + taxonomyList.add( speciesId ); + } + } + return taxonomyList; + } + + /** + * Create a map [], the list contains the + * species contains in all leaf under phylogeny node Olivier CHABROL : + * olivier.chabrol@univ-provence.fr + * + * @param node + * the tree root node + * @param map + * map to fill + */ + private void getTaxonomyMap( final PhylogenyNode node, final Map> map ) { + // node is leaf + if ( node.isExternal() ) { + return; + } + map.put( node, getSubNodeTaxonomy( node ) ); + getTaxonomyMap( node.getChildNode1(), map ); + getTaxonomyMap( node.getChildNode2(), map ); + } + + public String getType() { + return _type; + } + + /** + * Hashes the ID number of each PhylogenyNode of this Phylogeny to its + * corresponding PhylogenyNode, in order to make method getNode( id ) run in + * constant time. Important: The user is responsible for calling this method + * (again) after this Phylogeny has been changed/created/renumbered. 
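For example, a sketch (phy is any Phylogeny; 42 is a hypothetical node id):

    phy.hashIDs();                               // build the id -> node map once
    final PhylogenyNode n = phy.getNode( 42 );   // now a constant-time lookup instead of a preorder scan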
+ */ + public void hashIDs() { + if ( isEmpty() ) { + return; + } + setIdHash( new HashMap() ); + for( final PhylogenyNodeIterator iter = iteratorPreorder(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + getIdHash().put( node.getId(), node ); + } + } + + /** + * Deletes this Phylogeny. + */ + public void init() { + _root = null; + _rooted = false; + _name = ""; + _description = ""; + _type = ""; + _distance_unit = ""; + _idhash = null; + _confidence = null; + _identifier = null; + _rerootable = true; + setAllowMultipleParents( Phylogeny.ALLOW_MULTIPLE_PARENTS_DEFAULT ); + } + + private boolean isAllowMultipleParents() { + return _allow_multiple_parents; + } + + /** + * Returns whether this is a completely binary tree (i.e. all internal nodes + * are bifurcations). + * + */ + public boolean isCompletelyBinary() { + if ( isEmpty() ) { + return false; + } + for( final PhylogenyNodeIterator iter = iteratorPreorder(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + if ( node.isInternal() && ( node.getNumberOfDescendants() != 2 ) ) { + return false; + } + } + return true; + } + + /** + * Util method to check if all element of a list is contains in the + * rangeList. Olivier CHABROL : olivier.chabrol@univ-provence.fr + * + * @param list + * list to be check + * @param rangeList + * the range list to compare + * @return true if all param list element are contains in param + * rangeList, false otherwise. + */ + private boolean isContains( final List list, final List rangeList ) { + if ( list.size() > rangeList.size() ) { + return false; + } + String l = null; + for( final Iterator iterator = list.iterator(); iterator.hasNext(); ) { + l = iterator.next(); + if ( !rangeList.contains( l ) ) { + return false; + } + } + return true; + } + + /** + * Checks whether a Phylogeny object is deleted (or empty). + * + * @return true if the tree is deleted (or empty), false otherwise + */ + public boolean isEmpty() { + return ( getRoot() == null ); + } + + public boolean isRerootable() { + return _rerootable; + } + + /** + * Returns true is this Phylogeny is rooted. + */ + public boolean isRooted() { + return _rooted; + } // isRooted() + + public boolean isTree() { + return true; + } + + public PhylogenyNodeIterator iteratorExternalForward() { + return new ExternalForwardIterator( this ); + } + + public PhylogenyNodeIterator iteratorLevelOrder() { + return new LevelOrderTreeIterator( this ); + } + + public PhylogenyNodeIterator iteratorPostorder() { + return new PostorderTreeIterator( this ); + } + + public PhylogenyNodeIterator iteratorPreorder() { + return new PreorderTreeIterator( this ); + } + + /** + * Resets the ID numbers of the nodes of this Phylogeny in level order, + * starting with start_label (for the root).
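The four iterator factories above all return a PhylogenyNodeIterator; a small traversal sketch (phy assumed non-empty), counting internal bifurcations:

    int bifurcations = 0;
    for( final PhylogenyNodeIterator it = phy.iteratorPostorder(); it.hasNext(); ) {
        final PhylogenyNode n = it.next();
        if ( n.isInternal() && ( n.getNumberOfDescendants() == 2 ) ) {
            ++bifurcations;
        }
    }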
+ * WARNING. After this method has been called, node IDs are no longer + * unique. + */ + public void levelOrderReID() { + if ( isEmpty() ) { + return; + } + _idhash = null; + int max = 0; + for( final PhylogenyNodeIterator it = iteratorPreorder(); it.hasNext(); ) { + final PhylogenyNode node = it.next(); + if ( node.isRoot() ) { + node.setId( PhylogenyNode.getNodeCount() ); + } + else { + node.setId( node.getParent().getId() + 1 ); + if ( node.getId() > max ) { + max = node.getId(); + } + } + } + PhylogenyNode.setNodeCount( max + 1 ); + } + + /** + * Arranges the order of childern for each node of this Phylogeny in such a + * way that either the branch with more children is on top (right) or on + * bottom (left), dependent on the value of boolean order. + * + * @param order + * decides in which direction to order + */ + public void orderAppearance( final boolean order ) throws RuntimeException { + if ( !isTree() ) { + throw new FailedConditionCheckException( "Attempt to order appearance on phylogeny which is not tree-like." ); + } + if ( isEmpty() ) { + return; + } + orderAppearanceHelper( getRoot(), order ); + } + + // Helper method for "orderAppearance(boolean)". + // Traverses this Phylogeny recusively. + private void orderAppearanceHelper( final PhylogenyNode n, final boolean order ) { + if ( n.isExternal() ) { + return; + } + else { + PhylogenyNode temp = null; + // FIXME + if ( ( n.getNumberOfDescendants() == 2 ) + && ( n.getChildNode1().getNumberOfExternalNodes() != n.getChildNode2().getNumberOfExternalNodes() ) + && ( ( n.getChildNode1().getNumberOfExternalNodes() < n.getChildNode2().getNumberOfExternalNodes() ) == order ) ) { + temp = n.getChildNode1(); + n.setChild1( n.getChildNode2() ); + n.setChild2( temp ); + } + for( int i = 0; i < n.getNumberOfDescendants(); ++i ) { + orderAppearanceHelper( n.getChildNode( i ), order ); + } + } + } + + public void preOrderReId() { + if ( isEmpty() ) { + return; + } + setIdHash( null ); + int i = PhylogenyNode.getNodeCount(); + for( final PhylogenyNodeIterator it = iteratorPreorder(); it.hasNext(); ) { + it.next().setId( i++ ); + } + PhylogenyNode.setNodeCount( i ); + } + + /** + * Prints descriptions of all external Nodes of this Phylogeny to + * System.out. + */ + public void printExtNodes() { + if ( isEmpty() ) { + return; + } + for( final PhylogenyNodeIterator iter = iteratorExternalForward(); iter.hasNext(); ) { + System.out.println( iter.next() + "\n" ); + } + } + + /** + * (Re)counts the number of children for each PhylogenyNode of this + * Phylogeny. As an example, this method needs to be called after a + * Phylogeny has been reRooted and it is to be displayed. + * + * @param consider_collapsed_nodes + * set to true to take into account collapsed nodes (collapsed + * nodes have 1 child). + */ + public void recalculateNumberOfExternalDescendants( final boolean consider_collapsed_nodes ) { + if ( isEmpty() ) { + return; + } + for( final PhylogenyNodeIterator iter = iteratorPostorder(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + if ( node.isExternal() || ( consider_collapsed_nodes && node.isCollapse() ) ) { + node.setSumExtNodes( 1 ); + } + else { + int sum = 0; + for( int i = 0; i < node.getNumberOfDescendants(); ++i ) { + sum += node.getChildNode( i ).getNumberOfExternalNodes(); + } + node.setSumExtNodes( sum ); + } + } + } + + /** + * Places the root of this Phylogeny on the parent branch of the + * PhylogenyNode with a corresponding ID. The new root is always placed on + * the middle of the branch. 
If the resulting reRooted Phylogeny is to be + * used any further, in most cases the following methods have to be called + * on the resulting Phylogeny: + *

+ *

  • recalculateNumberOfExternalDescendants(boolean) + *
  • recalculateAndReset() + * + * @param id + * ID (int) of PhylogenyNode of this Phylogeny + */ + public void reRoot( final int id ) { + reRoot( getNode( id ) ); + } + + /** + * Places the root of this Phylogeny on Branch b. The new root is always + * placed on the middle of the branch b. + * + */ + public void reRoot( final PhylogenyBranch b ) { + final PhylogenyNode n1 = b.getFirstNode(); + final PhylogenyNode n2 = b.getSecondNode(); + if ( n1.isExternal() ) { + reRoot( n1 ); + } + else if ( n2.isExternal() ) { + reRoot( n2 ); + } + else if ( ( n2 == n1.getChildNode1() ) || ( n2 == n1.getChildNode2() ) ) { + reRoot( n2 ); + } + else if ( ( n1 == n2.getChildNode1() ) || ( n1 == n2.getChildNode2() ) ) { + reRoot( n1 ); + } + else if ( ( n1.getParent() != null ) && n1.getParent().isRoot() + && ( ( n1.getParent().getChildNode1() == n2 ) || ( n1.getParent().getChildNode2() == n2 ) ) ) { + reRoot( n1 ); + } + else { + throw new IllegalArgumentException( "reRoot( Branch b ): b is not a branch." ); + } + } + + /** + * Places the root of this Phylogeny on the parent branch PhylogenyNode n. + * The new root is always placed on the middle of the branch. + *

    + * If the resulting reRooted Phylogeny is to be used any further, in most + * cases the following three methods have to be called on the resulting + * Phylogeny: + *

+ * recalculateNumberOfExternalDescendants(boolean)
+ * recalculateAndReset()
+ *

    + * (Last modified: 10/01/01) + * + * @param n + * PhylogenyNode of this Phylogeny\ + */ + public void reRoot( final PhylogenyNode n ) { + reRoot( n, -1 ); + } + + public void reRoot( final PhylogenyNode n, final double distance_n_to_parent ) { + if ( isEmpty() || ( getNumberOfExternalNodes() < 2 ) ) { + return; + } + setRooted( true ); + if ( n.isRoot() ) { + return; + } + else if ( n.getParent().isRoot() ) { + if ( ( n.getParent().getNumberOfDescendants() == 2 ) && ( distance_n_to_parent >= 0 ) ) { + final double d = n.getParent().getChildNode1().getDistanceToParent() + + n.getParent().getChildNode2().getDistanceToParent(); + PhylogenyNode other; + if ( n.getChildNodeIndex() == 0 ) { + other = n.getParent().getChildNode2(); + } + else { + other = n.getParent().getChildNode1(); + } + n.setDistanceToParent( distance_n_to_parent ); + final double dm = d - distance_n_to_parent; + if ( dm >= 0 ) { + other.setDistanceToParent( dm ); + } + else { + other.setDistanceToParent( 0 ); + } + } + if ( n.getParent().getNumberOfDescendants() > 2 ) { + final int index = n.getChildNodeIndex(); + final double dn = n.getDistanceToParent(); + final PhylogenyNode prev_root = getRoot(); + prev_root.getDescendants().remove( index ); + final PhylogenyNode new_root = new PhylogenyNode(); + new_root.setChildNode( 0, n ); + new_root.setChildNode( 1, prev_root ); + if ( n.getBranchDataDirectly() != null ) { + prev_root.setBranchData( ( BranchData ) n.getBranchDataDirectly().copy() ); + } + setRoot( new_root ); + if ( distance_n_to_parent >= 0 ) { + n.setDistanceToParent( distance_n_to_parent ); + final double d = dn - distance_n_to_parent; + if ( d >= 0 ) { + prev_root.setDistanceToParent( d ); + } + else { + prev_root.setDistanceToParent( 0 ); + } + } + else { + if ( dn >= 0 ) { + final double d = dn / 2.0; + n.setDistanceToParent( d ); + prev_root.setDistanceToParent( d ); + } + } + } + } + else { + PhylogenyNode a = n; + PhylogenyNode b = null; + PhylogenyNode c = null; + final PhylogenyNode new_root = new PhylogenyNode(); + double distance1 = 0.0; + double distance2 = 0.0; + BranchData branch_data_1 = null; + BranchData branch_data_2 = null; + b = a.getParent(); + c = b.getParent(); + new_root.setChildNode( 0, a ); + new_root.setChildNode( 1, b ); + distance1 = c.getDistanceToParent(); + if ( c.getBranchDataDirectly() != null ) { + branch_data_1 = ( BranchData ) c.getBranchDataDirectly().copy(); + } + c.setDistanceToParent( b.getDistanceToParent() ); + if ( b.getBranchDataDirectly() != null ) { + c.setBranchData( ( BranchData ) b.getBranchDataDirectly().copy() ); + } + if ( a.getBranchDataDirectly() != null ) { + b.setBranchData( ( BranchData ) a.getBranchDataDirectly().copy() ); + } + // New root is always placed in the middle of the branch: + if ( a.getDistanceToParent() == PhylogenyNode.DISTANCE_DEFAULT ) { + b.setDistanceToParent( PhylogenyNode.DISTANCE_DEFAULT ); + } + else { + if ( distance_n_to_parent >= 0.0 ) { + final double diff = a.getDistanceToParent() - distance_n_to_parent; + a.setDistanceToParent( distance_n_to_parent ); + b.setDistanceToParent( diff >= 0.0 ? 
diff : 0.0 ); + } + else { + final double d = a.getDistanceToParent() / 2.0; + a.setDistanceToParent( d ); + b.setDistanceToParent( d ); + } + } + b.setChildNodeOnly( a.getChildNodeIndex( b ), c ); + // moving to the old root, swapping references: + while ( !c.isRoot() ) { + a = b; + b = c; + c = c.getParent(); + b.setChildNodeOnly( a.getChildNodeIndex( b ), c ); + b.setParent( a ); + distance2 = c.getDistanceToParent(); + branch_data_2 = c.getBranchDataDirectly(); + c.setDistanceToParent( distance1 ); + c.setBranchData( branch_data_1 ); + distance1 = distance2; + branch_data_1 = branch_data_2; + } + // removing the old root: + if ( c.getNumberOfDescendants() == 2 ) { + final PhylogenyNode node = c.getChildNode( 1 - b.getChildNodeIndex( c ) ); + node.setParent( b ); + if ( ( c.getDistanceToParent() == PhylogenyNode.DISTANCE_DEFAULT ) + && ( node.getDistanceToParent() == PhylogenyNode.DISTANCE_DEFAULT ) ) { + node.setDistanceToParent( PhylogenyNode.DISTANCE_DEFAULT ); + } + else { + node.setDistanceToParent( ( c.getDistanceToParent() >= 0.0 ? c.getDistanceToParent() : 0.0 ) + + ( node.getDistanceToParent() >= 0.0 ? node.getDistanceToParent() : 0.0 ) ); + } + if ( c.getBranchDataDirectly() != null ) { + node.setBranchData( ( BranchData ) c.getBranchDataDirectly().copy() ); + } + for( int i = 0; i < b.getNumberOfDescendants(); ++i ) { + if ( b.getChildNode( i ) == c ) { + b.setChildNodeOnly( i, node ); + break; + } + } + } + else { + c.setParent( b ); + c.removeChildNode( b.getChildNodeIndex( c ) ); + } + setRoot( new_root ); + } + } + + /** + * Sets all Nodes of this Phylogeny to not-collapsed. + *
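Putting the re-rooting methods above together, a sketch (phy and an outgroup node out are assumed):

    phy.reRoot( out );                                   // new root placed at the middle of out's parent branch
    phy.recalculateNumberOfExternalDescendants( true );  // restore the cached external-node counts
    // the reRoot javadoc also lists recalculateAndReset(), which is not shown in this hunk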

    + * In most cases methods adjustNodeCount(false) and recalculateAndReset() + * need to be called after this method has been called. + */ + public void setAllNodesToNotCollapse() { + if ( isEmpty() ) { + return; + } + for( final PhylogenyNodeIterator iter = iteratorPreorder(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + node.setCollapse( false ); + } + } + + private void setAllowMultipleParents( final boolean allow_multiple_parents ) { + _allow_multiple_parents = allow_multiple_parents; + } + + public void setConfidence( final Confidence confidence ) { + _confidence = confidence; + } + + public void setDescription( final String description ) { + _description = description; + } + + public void setDistanceUnit( final String _distance_unit ) { + this._distance_unit = _distance_unit; + } + + public void setIdentifier( final Identifier identifier ) { + _identifier = identifier; + } + + void setIdHash( final HashMap idhash ) { + _idhash = idhash; + } + + /** + * Sets the indicators of all Nodes of this Phylogeny to 0. + */ + public void setIndicatorsToZero() { + if ( isEmpty() ) { + return; + } + for( final PhylogenyNodeIterator iter = iteratorPreorder(); iter.hasNext(); ) { + iter.next().setIndicator( ( byte ) 0 ); + } + } // setIndicatorsToZero() + + /** + * Sets the name of this Phylogeny to s. + */ + public void setName( final String s ) { + _name = s; + } + + public void setRelevantSequenceRelationTypes( final Collection types ) { + _relevant_sequence_relation_types = types; + } + + public void setRerootable( final boolean rerootable ) { + _rerootable = rerootable; + } + + public void setRoot( final PhylogenyNode n ) { + _root = n; + } // setRoot( PhylogenyNode ) + + /** + * Sets whether this Phylogeny is rooted or not. + */ + public void setRooted( final boolean b ) { + _rooted = b; + } // setRooted( boolean ) + + public void setSequenceRelationQueries( final Collection sequencesByName ) { + _sequenceRelationQueries = sequencesByName; + } + + public void setType( final String type ) { + _type = type; + } + + /** + * Swaps the the two childern of a PhylogenyNode node of this Phylogeny. + *
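The setters above are enough to assemble a Phylogeny around an existing PhylogenyNode subtree; a sketch (root is an assumed, already-built node):

    final Phylogeny phy = new Phylogeny();
    phy.setRoot( root );
    phy.setRooted( true );
    phy.setName( "example" );
    System.out.println( phy.toNewHampshireX() );   // writer-backed output methods are defined further below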

    + * (Last modified: 06/13/01) + * + * @param node + * a PhylogenyNode of this Phylogeny + */ + public void swapChildren( final PhylogenyNode node ) throws RuntimeException { + if ( !isTree() ) { + throw new FailedConditionCheckException( "Attempt to swap children on phylogeny which is not tree-like." ); + } + if ( isEmpty() || node.isExternal() || ( node.getNumberOfDescendants() < 2 ) ) { + return; + } + final PhylogenyNode first = node.getFirstChildNode(); + for( int i = 1; i < node.getNumberOfDescendants(); ++i ) { + node.setChildNode( i - 1, node.getChildNode( i ) ); + } + node.setChildNode( node.getNumberOfDescendants() - 1, first ); + } // swapChildren( PhylogenyNode ) + + public String toNewHampshire() { + return toNewHampshire( false ); + } + + public String toNewHampshire( final boolean simple_nh ) { + try { + return new PhylogenyWriter().toNewHampshire( this, simple_nh, true ).toString(); + } + catch ( final IOException e ) { + throw new Error( "this should not have happend: " + e.getMessage() ); + } + } + + public String toNewHampshireX() { + try { + return new PhylogenyWriter().toNewHampshireX( this ).toString(); + } + catch ( final IOException e ) { + throw new Error( "this should not have happend: " + e.getMessage() ); + } + } + + public String toNexus() { + try { + return new PhylogenyWriter().toNexus( this ).toString(); + } + catch ( final IOException e ) { + throw new Error( "this should not have happend: " + e.getMessage() ); + } + } + + public String toPhyloXML( final int phyloxml_level ) { + try { + return new PhylogenyWriter().toPhyloXML( this, phyloxml_level ).toString(); + } + catch ( final IOException e ) { + throw new Error( "this should not have happend: " + e.getMessage() ); + } + } + + // --------------------------------------------------------- + // Writing of Phylogeny to Strings + // --------------------------------------------------------- + /** + * Converts this Phylogeny to a New Hampshire X (String) representation. + * + * @return New Hampshire X (String) representation of this + * @see #toNewHampshireX() + */ + @Override + public String toString() { + return toNewHampshireX(); + } + + /** + * Removes the root PhylogenyNode this Phylogeny. + */ + public void unRoot() throws RuntimeException { + if ( !isTree() ) { + throw new FailedConditionCheckException( "Attempt to unroot a phylogeny which is not tree-like." ); + } + if ( isEmpty() ) { + return; + } + setIndicatorsToZero(); + if ( !isRooted() || ( getNumberOfExternalNodes() <= 1 ) ) { + return; + } + setRooted( false ); + return; + } // unRoot() +} diff --git a/forester/java/src/org/forester/phylogeny/PhylogenyBranch.java b/forester/java/src/org/forester/phylogeny/PhylogenyBranch.java new file mode 100644 index 0000000..214337f --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/PhylogenyBranch.java @@ -0,0 +1,168 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. 
+// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny; + +import org.forester.phylogeny.data.PhylogenyData; + +/* + * @author Christian M. Zmasek + */ +public class PhylogenyBranch implements Edge { + + private final PhylogenyNode _node_1; + private final PhylogenyNode _node_2; + private PhylogenyData _data; + private final boolean _is_directed; + private boolean _towards_1; + + public PhylogenyBranch( final PhylogenyNode first_node, final PhylogenyNode second_node ) { + if ( ( first_node == null ) || ( second_node == null ) ) { + throw new IllegalArgumentException( "Attempt to create a branch with a null node" ); + } + _node_1 = first_node; + _node_2 = second_node; + _is_directed = false; + } + + public PhylogenyBranch( final PhylogenyNode first_node, + final PhylogenyNode second_node, + final boolean direction_towards_first ) { + if ( ( first_node == null ) || ( second_node == null ) ) { + throw new IllegalArgumentException( "Attempt to create a branch with a null node" ); + } + _node_1 = first_node; + _node_2 = second_node; + _is_directed = true; + _towards_1 = direction_towards_first; + } + + @Override + public boolean equals( final Object obj ) { + if ( this == obj ) { + return true; + } + if ( obj == null ) { + return false; + } + if ( getClass() != obj.getClass() ) { + return false; + } + final PhylogenyBranch other = ( PhylogenyBranch ) obj; + return hashCode() == other.hashCode(); + } + + public PhylogenyNode getConnectedNode( final PhylogenyNode node ) throws IllegalArgumentException { + if ( node == _node_1 ) { + return _node_2; + } + else if ( node == _node_2 ) { + return _node_1; + } + else { + throw new IllegalArgumentException( "Attempt to get " + "connected node on branch with node which is " + + "not connected by the branch" ); + } + } + + public PhylogenyData getData() { + return _data; + } + + public PhylogenyNode getFirstNode() { + return _node_1; + } + + public PhylogenyNode getSecondNode() { + return _node_2; + } + + @Override + public int hashCode() { + final int PRIME = 31; + int result = 1; + final int node_1_hc = _node_1.hashCode(); + final int node_2_hc = _node_2.hashCode(); + int hc_1 = 0; + int hc_2 = 0; + if ( !_is_directed ) { + if ( node_1_hc > node_2_hc ) { + hc_1 = node_2_hc; + hc_2 = node_1_hc; + } + else { + hc_1 = node_1_hc; + hc_2 = node_2_hc; + } + } + else { + if ( _towards_1 ) { + hc_1 = node_2_hc; + hc_2 = node_1_hc; + } + else { + hc_1 = node_1_hc; + hc_2 = node_2_hc; + } + } + result = PRIME * result + ( ( _data == null ) ? 0 : _data.hashCode() ); + result = PRIME * result + ( _is_directed ? 
1231 : 1237 ); + result = PRIME * result + hc_1; + result = PRIME * result + hc_2; + return result; + } + + public boolean isDirected() { + return _is_directed; + } + + public boolean isDirectionTowards( final PhylogenyNode node ) throws RuntimeException { + if ( !isDirected() ) { + throw new RuntimeException( "Attempt to get direction of undirected branch" ); + } + return ( ( node == _node_1 ) && _towards_1 ); + } + + public void setDirectionTowards( final PhylogenyNode node ) { + _towards_1 = node == _node_1; + } + + @Override + public String toString() { + if ( isDirected() ) { + if ( isDirectionTowards( getFirstNode() ) ) { + return ( getSecondNode().getName() + " -> " + getFirstNode().getName() ); + } + else { + return ( getFirstNode().getName() + " -> " + getSecondNode().getName() ); + } + } + else { + return ( getFirstNode().getName() + " -- " + getSecondNode().getName() ); + } + } +} diff --git a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java new file mode 100644 index 0000000..6569c81 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java @@ -0,0 +1,1186 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny; + +import java.awt.Color; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; + +import org.forester.phylogeny.data.BranchColor; +import org.forester.phylogeny.data.BranchWidth; +import org.forester.phylogeny.data.Confidence; +import org.forester.phylogeny.data.DomainArchitecture; +import org.forester.phylogeny.data.Taxonomy; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.FailedConditionCheckException; +import org.forester.util.ForesterUtil; + +public class PhylogenyMethods { + + private static PhylogenyMethods _instance = null; + private final Set _temp_hash_set = new HashSet(); + private PhylogenyNode _farthest_1 = null; + private PhylogenyNode _farthest_2 = null; + + private PhylogenyMethods() { + // Hidden constructor. + } + + /** + * Calculates the distance between PhylogenyNodes node1 and node2. 
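A usage sketch for the PhylogenyBranch class completed above (a and b are assumed to be two connected PhylogenyNodes):

    final PhylogenyBranch undirected = new PhylogenyBranch( a, b );
    final PhylogenyNode other = undirected.getConnectedNode( a );        // returns b
    final PhylogenyBranch directed = new PhylogenyBranch( a, b, true );  // direction towards the first node, a
    System.out.println( directed.isDirectionTowards( a ) );              // true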
+ * + * + * @param node1 + * @param node2 + * @return distance between node1 and node2 + */ + public double calculateDistance( final PhylogenyNode node1, final PhylogenyNode node2 ) { + final PhylogenyNode lca = obtainLCA( node1, node2 ); + final PhylogenyNode n1 = node1; + final PhylogenyNode n2 = node2; + return ( PhylogenyMethods.getDistance( n1, lca ) + PhylogenyMethods.getDistance( n2, lca ) ); + } + + public double calculateFurthestDistance( final Phylogeny phylogeny ) { + if ( phylogeny.getNumberOfExternalNodes() < 2 ) { + return 0.0; + } + _farthest_1 = null; + _farthest_2 = null; + PhylogenyNode node_1 = null; + PhylogenyNode node_2 = null; + double farthest_d = -Double.MAX_VALUE; + final PhylogenyMethods methods = PhylogenyMethods.getInstance(); + final List ext_nodes = phylogeny.getRoot().getAllExternalDescendants(); + for( int i = 1; i < ext_nodes.size(); ++i ) { + for( int j = 0; j < i; ++j ) { + final double d = methods.calculateDistance( ext_nodes.get( i ), ext_nodes.get( j ) ); + if ( d < 0.0 ) { + throw new RuntimeException( "distance cannot be negative" ); + } + if ( d > farthest_d ) { + farthest_d = d; + node_1 = ext_nodes.get( i ); + node_2 = ext_nodes.get( j ); + } + } + } + _farthest_1 = node_1; + _farthest_2 = node_2; + return farthest_d; + } + + @Override + public Object clone() throws CloneNotSupportedException { + throw new CloneNotSupportedException(); + } + + public PhylogenyNode getFarthestNode1() { + return _farthest_1; + } + + public PhylogenyNode getFarthestNode2() { + return _farthest_2; + } + + /** + * Returns the LCA of PhylogenyNodes node1 and node2. + * + * + * @param node1 + * @param node2 + * @return LCA of node1 and node2 + */ + public PhylogenyNode obtainLCA( final PhylogenyNode node1, final PhylogenyNode node2 ) { + _temp_hash_set.clear(); + PhylogenyNode n1 = node1; + PhylogenyNode n2 = node2; + _temp_hash_set.add( n1.getId() ); + while ( !n1.isRoot() ) { + n1 = n1.getParent(); + _temp_hash_set.add( n1.getId() ); + } + while ( !_temp_hash_set.contains( n2.getId() ) && !n2.isRoot() ) { + n2 = n2.getParent(); + } + if ( !_temp_hash_set.contains( n2.getId() ) ) { + throw new IllegalArgumentException( "attempt to get LCA of two nodes which do not share a common root" ); + } + return n2; + } + + /** + * Returns all orthologs of the external PhylogenyNode n of this Phylogeny. + * Orthologs are returned as List of node references. + *
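A sketch of the instance-level helpers above (a and b are assumed to be nodes of the same rooted Phylogeny):

    final PhylogenyMethods pm = PhylogenyMethods.getInstance();  // singleton accessor, defined further below
    final PhylogenyNode lca = pm.obtainLCA( a, b );              // last common ancestor
    final double d = pm.calculateDistance( a, b );               // branch-length sum a -> lca plus b -> lca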

    + * PRECONDITION: This tree must be binary and rooted, and speciation and + * duplication events need to be assigned to each of its internal Nodes. + *

    + * Returns null if this Phylogeny is empty or if n is internal. + * @param n + * external PhylogenyNode whose orthologs are to be returned + * @return Vector of references to all orthologous Nodes of PhylogenyNode n + * of this Phylogeny, null if this Phylogeny is empty or if n is + * internal + */ + public List getOrthologousNodes( final Phylogeny phy, final PhylogenyNode node ) { + final List nodes = new ArrayList(); + final PhylogenyNodeIterator it = phy.iteratorExternalForward(); + while ( it.hasNext() ) { + final PhylogenyNode temp_node = it.next(); + if ( ( temp_node != node ) && isAreOrthologous( node, temp_node ) ) { + nodes.add( temp_node ); + } + } + return nodes; + } + + public boolean isAreOrthologous( final PhylogenyNode node1, final PhylogenyNode node2 ) { + return !obtainLCA( node1, node2 ).isDuplication(); + } + + static double addPhylogenyDistances( final double a, final double b ) { + if ( ( a >= 0.0 ) && ( b >= 0.0 ) ) { + return a + b; + } + else if ( a >= 0.0 ) { + return a; + } + else if ( b >= 0.0 ) { + return b; + } + return PhylogenyNode.DISTANCE_DEFAULT; + } + + // Helper for getUltraParalogousNodes( PhylogenyNode ). + public static boolean areAllChildrenDuplications( final PhylogenyNode n ) { + if ( n.isExternal() ) { + return false; + } + else { + if ( n.isDuplication() ) { + //FIXME test me! + for( final PhylogenyNode desc : n.getDescendants() ) { + if ( !areAllChildrenDuplications( desc ) ) { + return false; + } + } + return true; + } + else { + return false; + } + } + } + + public static int calculateDepth( final PhylogenyNode node ) { + PhylogenyNode n = node; + int steps = 0; + while ( !n.isRoot() ) { + steps++; + n = n.getParent(); + } + return steps; + } + + public static double calculateDistanceToRoot( final PhylogenyNode node ) { + PhylogenyNode n = node; + double d = 0.0; + while ( !n.isRoot() ) { + if ( n.getDistanceToParent() > 0.0 ) { + d += n.getDistanceToParent(); + } + n = n.getParent(); + } + return d; + } + + public static short calculateMaxBranchesToLeaf( final PhylogenyNode node ) { + if ( node.isExternal() ) { + return 0; + } + short max = 0; + for( PhylogenyNode d : node.getAllExternalDescendants() ) { + short steps = 0; + while ( d != node ) { + if ( d.isCollapse() ) { + steps = 0; + } + else { + steps++; + } + d = d.getParent(); + } + if ( max < steps ) { + max = steps; + } + } + return max; + } + + public static int calculateMaxDepth( final Phylogeny phy ) { + int max = 0; + for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + final int steps = calculateDepth( node ); + if ( steps > max ) { + max = steps; + } + } + return max; + } + + public static double calculateMaxDistanceToRoot( final Phylogeny phy ) { + double max = 0.0; + for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + final double d = calculateDistanceToRoot( node ); + if ( d > max ) { + max = d; + } + } + return max; + } + + public static int calculateMaximumNumberOfDescendantsPerNode( final Phylogeny phy ) { + int max = 0; + for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + if ( node.getNumberOfDescendants() > max ) { + max = node.getNumberOfDescendants(); + } + } + return max; + } + + /** + * Returns the set of distinct taxonomies of + * all external nodes of node. 
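For the orthology helper above, a sketch (query is an assumed external node of a duplication-annotated Phylogeny phy):

    final PhylogenyMethods pm = PhylogenyMethods.getInstance();
    final List orthologs = pm.getOrthologousNodes( phy, query );  // external nodes whose LCA with query is not a duplication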
+ * If at least one the external nodes has no taxonomy, + * null is returned. + * + */ + public static Set obtainDistinctTaxonomies( final PhylogenyNode node ) { + final List descs = node.getAllExternalDescendants(); + final Set tax_set = new HashSet(); + for( final PhylogenyNode n : descs ) { + if ( !n.getNodeData().isHasTaxonomy() || n.getNodeData().getTaxonomy().isEmpty() ) { + return null; + } + tax_set.add( n.getNodeData().getTaxonomy() ); + } + return tax_set; + } + + /** + * Returns a map of distinct taxonomies of + * all external nodes of node. + * If at least one of the external nodes has no taxonomy, + * null is returned. + * + */ + public static SortedMap obtainDistinctTaxonomyCounts( final PhylogenyNode node ) { + final List descs = node.getAllExternalDescendants(); + final SortedMap tax_map = new TreeMap(); + for( final PhylogenyNode n : descs ) { + if ( !n.getNodeData().isHasTaxonomy() || n.getNodeData().getTaxonomy().isEmpty() ) { + return null; + } + final Taxonomy t = n.getNodeData().getTaxonomy(); + if ( tax_map.containsKey( t ) ) { + tax_map.put( t, tax_map.get( t ) + 1 ); + } + else { + tax_map.put( t, 1 ); + } + } + return tax_map; + } + + public static int calculateNumberOfExternalNodesWithoutTaxonomy( final PhylogenyNode node ) { + final List descs = node.getAllExternalDescendants(); + int x = 0; + for( final PhylogenyNode n : descs ) { + if ( !n.getNodeData().isHasTaxonomy() || n.getNodeData().getTaxonomy().isEmpty() ) { + x++; + } + } + return x; + } + + /** + * Deep copies the phylogeny originating from this node. + */ + static PhylogenyNode copySubTree( final PhylogenyNode source ) { + if ( source == null ) { + return null; + } + else { + final PhylogenyNode newnode = source.copyNodeData(); + if ( !source.isExternal() ) { + for( int i = 0; i < source.getNumberOfDescendants(); ++i ) { + newnode.setChildNode( i, PhylogenyMethods.copySubTree( source.getChildNode( i ) ) ); + } + } + return newnode; + } + } + + /** + * Shallow copies the phylogeny originating from this node. 
+ */ + static PhylogenyNode copySubTreeShallow( final PhylogenyNode source ) { + if ( source == null ) { + return null; + } + else { + final PhylogenyNode newnode = source.copyNodeDataShallow(); + if ( !source.isExternal() ) { + for( int i = 0; i < source.getNumberOfDescendants(); ++i ) { + newnode.setChildNode( i, PhylogenyMethods.copySubTreeShallow( source.getChildNode( i ) ) ); + } + } + return newnode; + } + } + + public static void deleteExternalNodesNegativeSelection( final Set to_delete, final Phylogeny phy ) { + phy.hashIDs(); + for( final Integer id : to_delete ) { + phy.deleteSubtree( phy.getNode( id ), true ); + } + phy.hashIDs(); + } + + public static void deleteExternalNodesNegativeSelection( final String[] node_names_to_delete, final Phylogeny p ) + throws IllegalArgumentException { + for( int i = 0; i < node_names_to_delete.length; ++i ) { + if ( ForesterUtil.isEmpty( node_names_to_delete[ i ] ) ) { + continue; + } + List nodes = null; + nodes = p.getNodes( node_names_to_delete[ i ] ); + final Iterator it = nodes.iterator(); + while ( it.hasNext() ) { + final PhylogenyNode n = it.next(); + if ( !n.isExternal() ) { + throw new IllegalArgumentException( "attempt to delete non-external node \"" + + node_names_to_delete[ i ] + "\"" ); + } + p.deleteSubtree( n, true ); + } + } + } + + public static void deleteExternalNodesPositiveSelection( final Set species_to_keep, final Phylogeny phy ) { + // final Set to_delete = new HashSet(); + for( final PhylogenyNodeIterator it = phy.iteratorExternalForward(); it.hasNext(); ) { + final PhylogenyNode n = it.next(); + if ( n.getNodeData().isHasTaxonomy() ) { + if ( !species_to_keep.contains( n.getNodeData().getTaxonomy() ) ) { + //to_delete.add( n.getNodeId() ); + phy.deleteSubtree( n, true ); + } + } + else { + throw new IllegalArgumentException( "node " + n.getId() + " has no taxonomic data" ); + } + } + phy.hashIDs(); + phy.externalNodesHaveChanged(); + // deleteExternalNodesNegativeSelection( to_delete, phy ); + } + + public static List deleteExternalNodesPositiveSelection( final String[] node_names_to_keep, + final Phylogeny p ) { + final PhylogenyNodeIterator it = p.iteratorExternalForward(); + final String[] to_delete = new String[ p.getNumberOfExternalNodes() ]; + int i = 0; + Arrays.sort( node_names_to_keep ); + while ( it.hasNext() ) { + final String curent_name = it.next().getName(); + if ( Arrays.binarySearch( node_names_to_keep, curent_name ) < 0 ) { + to_delete[ i++ ] = curent_name; + } + } + PhylogenyMethods.deleteExternalNodesNegativeSelection( to_delete, p ); + final List deleted = new ArrayList(); + for( final String n : to_delete ) { + if ( !ForesterUtil.isEmpty( n ) ) { + deleted.add( n ); + } + } + return deleted; + } + + public static List getAllDescendants( final PhylogenyNode node ) { + final List descs = new ArrayList(); + final Set encountered = new HashSet(); + if ( !node.isExternal() ) { + final List exts = node.getAllExternalDescendants(); + for( PhylogenyNode current : exts ) { + descs.add( current ); + while ( current != node ) { + current = current.getParent(); + if ( encountered.contains( current.getId() ) ) { + continue; + } + descs.add( current ); + encountered.add( current.getId() ); + } + } + } + return descs; + } + + /** + * + * Convenience method + * + * @param node + * @return + */ + public static Color getBranchColorValue( final PhylogenyNode node ) { + if ( node.getBranchData().getBranchColor() == null ) { + return null; + } + return node.getBranchData().getBranchColor().getValue(); + } + + /** + * 
Convenience method + */ + public static double getBranchWidthValue( final PhylogenyNode node ) { + if ( !node.getBranchData().isHasBranchWidth() ) { + return BranchWidth.BRANCH_WIDTH_DEFAULT_VALUE; + } + return node.getBranchData().getBranchWidth().getValue(); + } + + /** + * Convenience method + */ + public static double getConfidenceValue( final PhylogenyNode node ) { + if ( !node.getBranchData().isHasConfidences() ) { + return Confidence.CONFIDENCE_DEFAULT_VALUE; + } + return node.getBranchData().getConfidence( 0 ).getValue(); + } + + /** + * Convenience method + */ + public static double[] getConfidenceValuesAsArray( final PhylogenyNode node ) { + if ( !node.getBranchData().isHasConfidences() ) { + return new double[ 0 ]; + } + final double[] values = new double[ node.getBranchData().getConfidences().size() ]; + int i = 0; + for( final Confidence c : node.getBranchData().getConfidences() ) { + values[ i++ ] = c.getValue(); + } + return values; + } + + /** + * Calculates the distance between PhylogenyNodes n1 and n2. + * PRECONDITION: n1 is a descendant of n2. + * + * @param n1 + * a descendant of n2 + * @param n2 + * @return distance between n1 and n2 + */ + private static double getDistance( PhylogenyNode n1, final PhylogenyNode n2 ) { + double d = 0.0; + while ( n1 != n2 ) { + if ( n1.getDistanceToParent() > 0.0 ) { + d += n1.getDistanceToParent(); + } + n1 = n1.getParent(); + } + return d; + } + + /** + * Returns taxonomy t if all external descendants have + * the same taxonomy t, null otherwise. + * + */ + public static Taxonomy getExternalDescendantsTaxonomy( final PhylogenyNode node ) { + final List descs = node.getAllExternalDescendants(); + Taxonomy tax = null; + for( final PhylogenyNode n : descs ) { + if ( !n.getNodeData().isHasTaxonomy() || n.getNodeData().getTaxonomy().isEmpty() ) { + return null; + } + else if ( tax == null ) { + tax = n.getNodeData().getTaxonomy(); + } + else if ( n.getNodeData().getTaxonomy().isEmpty() || !tax.isEqual( n.getNodeData().getTaxonomy() ) ) { + return null; + } + } + return tax; + } + + public static PhylogenyNode getFurthestDescendant( final PhylogenyNode node ) { + final List children = node.getAllExternalDescendants(); + PhylogenyNode farthest = null; + double longest = -Double.MAX_VALUE; + for( final PhylogenyNode child : children ) { + if ( PhylogenyMethods.getDistance( child, node ) > longest ) { + farthest = child; + longest = PhylogenyMethods.getDistance( child, node ); + } + } + return farthest; + } + + public static PhylogenyMethods getInstance() { + if ( PhylogenyMethods._instance == null ) { + PhylogenyMethods._instance = new PhylogenyMethods(); + } + return PhylogenyMethods._instance; + } + + /** + * Returns the largest confidence value found on phy. 
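For example, a sketch (phy carries support values; n is one of its nodes):

    final double best = PhylogenyMethods.getMaximumConfidenceValue( phy );
    final double support = PhylogenyMethods.getConfidenceValue( n );  // Confidence.CONFIDENCE_DEFAULT_VALUE if n has none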
+ */ + static public double getMaximumConfidenceValue( final Phylogeny phy ) { + double max = -Double.MAX_VALUE; + for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { + final double s = PhylogenyMethods.getConfidenceValue( iter.next() ); + if ( ( s != Confidence.CONFIDENCE_DEFAULT_VALUE ) && ( s > max ) ) { + max = s; + } + } + return max; + } + + static public int getMinimumDescendentsPerInternalNodes( final Phylogeny phy ) { + int min = Integer.MAX_VALUE; + int d = 0; + PhylogenyNode n; + for( final PhylogenyNodeIterator it = phy.iteratorPreorder(); it.hasNext(); ) { + n = it.next(); + if ( n.isInternal() ) { + d = n.getNumberOfDescendants(); + if ( d < min ) { + min = d; + } + } + } + return min; + } + + /** + * Convenience method for display purposes. + * Not intended for algorithms. + */ + public static String getSpecies( final PhylogenyNode node ) { + if ( !node.getNodeData().isHasTaxonomy() ) { + return ""; + } + if ( !ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getTaxonomyCode() ) ) { + return node.getNodeData().getTaxonomy().getTaxonomyCode(); + } + else if ( !ForesterUtil.isEmpty( node.getNodeData().getTaxonomy().getScientificName() ) ) { + return node.getNodeData().getTaxonomy().getScientificName(); + } + else { + return node.getNodeData().getTaxonomy().getCommonName(); + } + } + + /** + * Returns all Nodes which are connected to external PhylogenyNode n of this + * Phylogeny by a path containing only speciation events. We call these + * "super orthologs". Nodes are returned as Vector of references to Nodes. + *

    + * PRECONDITION: This tree must be binary and rooted, and speciation and + * duplication events need to be assigned for each of its internal Nodes. + *

    + * Returns null if this Phylogeny is empty or if n is internal. + * @param n + * external PhylogenyNode whose strictly speciation related Nodes + * are to be returned + * @return Vector of references to all strictly speciation related Nodes of + * PhylogenyNode n of this Phylogeny, null if this Phylogeny is + * empty or if n is internal + */ + public static List getSuperOrthologousNodes( final PhylogenyNode n ) { + // FIXME + PhylogenyNode node = n, deepest = null; + final List v = new ArrayList(); + if ( !node.isExternal() ) { + return null; + } + while ( !node.isRoot() && !node.getParent().isDuplication() ) { + node = node.getParent(); + } + deepest = node; + deepest.setIndicatorsToZero(); + do { + if ( !node.isExternal() ) { + if ( node.getIndicator() == 0 ) { + node.setIndicator( ( byte ) 1 ); + if ( !node.isDuplication() ) { + node = node.getChildNode1(); + } + } + if ( node.getIndicator() == 1 ) { + node.setIndicator( ( byte ) 2 ); + if ( !node.isDuplication() ) { + node = node.getChildNode2(); + } + } + if ( ( node != deepest ) && ( node.getIndicator() == 2 ) ) { + node = node.getParent(); + } + } + else { + if ( node != n ) { + v.add( node ); + } + if ( node != deepest ) { + node = node.getParent(); + } + else { + node.setIndicator( ( byte ) 2 ); + } + } + } while ( ( node != deepest ) || ( deepest.getIndicator() != 2 ) ); + return v; + } + + /** + * Convenience method for display purposes. + * Not intended for algorithms. + */ + public static String getTaxonomyIdentifier( final PhylogenyNode node ) { + if ( !node.getNodeData().isHasTaxonomy() || ( node.getNodeData().getTaxonomy().getIdentifier() == null ) ) { + return ""; + } + return node.getNodeData().getTaxonomy().getIdentifier().getValue(); + } + + /** + * Returns all Nodes which are connected to external PhylogenyNode n of this + * Phylogeny by a path containing, and leading to, only duplication events. + * We call these "ultra paralogs". Nodes are returned as Vector of + * references to Nodes. + *

    + * PRECONDITION: This tree must be binary and rooted, and speciation and + * duplication events need to be assigned for each of its internal Nodes. + *

    + * Returns null if this Phylogeny is empty or if n is internal. + *

    + * (Last modified: 10/06/01) + * + * @param n + * external PhylogenyNode whose ultra paralogs are to be returned + * @return Vector of references to all ultra paralogs of PhylogenyNode n of + * this Phylogeny, null if this Phylogeny is empty or if n is + * internal + */ + public static List getUltraParalogousNodes( final PhylogenyNode n ) { + // FIXME test me + PhylogenyNode node = n; + if ( !node.isExternal() ) { + return null; + } + while ( !node.isRoot() && node.getParent().isDuplication() && areAllChildrenDuplications( node.getParent() ) ) { + node = node.getParent(); + } + final List nodes = node.getAllExternalDescendants(); + nodes.remove( n ); + return nodes; + } + + public static String inferCommonPartOfScientificNameOfDescendants( final PhylogenyNode node ) { + final List descs = node.getDescendants(); + String sn = null; + for( final PhylogenyNode n : descs ) { + if ( !n.getNodeData().isHasTaxonomy() + || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getScientificName() ) ) { + return null; + } + else if ( sn == null ) { + sn = n.getNodeData().getTaxonomy().getScientificName().trim(); + } + else { + String sn_current = n.getNodeData().getTaxonomy().getScientificName().trim(); + if ( !sn.equals( sn_current ) ) { + boolean overlap = false; + while ( ( sn.indexOf( ' ' ) >= 0 ) || ( sn_current.indexOf( ' ' ) >= 0 ) ) { + if ( ForesterUtil.countChars( sn, ' ' ) > ForesterUtil.countChars( sn_current, ' ' ) ) { + sn = sn.substring( 0, sn.lastIndexOf( ' ' ) ).trim(); + } + else { + sn_current = sn_current.substring( 0, sn_current.lastIndexOf( ' ' ) ).trim(); + } + if ( sn.equals( sn_current ) ) { + overlap = true; + break; + } + } + if ( !overlap ) { + return null; + } + } + } + } + return sn; + } + + public static boolean isHasExternalDescendant( final PhylogenyNode node ) { + for( int i = 0; i < node.getNumberOfDescendants(); ++i ) { + if ( node.getChildNode( i ).isExternal() ) { + return true; + } + } + return false; + } + + /* + * This is case insensitive. 
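A hedged sketch of calling the two paralogy helpers above (getSuperOrthologousNodes and getUltraParalogousNodes); it assumes an external PhylogenyNode n in a rooted, binary tree whose internal nodes already carry speciation/duplication assignments (produced by a step not shown here):

    // n is assumed to be an external node with events assigned on its ancestors
    final List super_orthologs = PhylogenyMethods.getSuperOrthologousNodes( n );
    final List ultra_paralogs = PhylogenyMethods.getUltraParalogousNodes( n );
    if ( super_orthologs != null ) {
        System.out.println( "super orthologs of " + n + ": " + super_orthologs.size() );
    }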
+ * + */ + public synchronized static boolean isTaxonomyHasIdentifierOfGivenProvider( final Taxonomy tax, + final String[] providers ) { + if ( ( tax.getIdentifier() != null ) && !ForesterUtil.isEmpty( tax.getIdentifier().getProvider() ) ) { + final String my_tax_prov = tax.getIdentifier().getProvider(); + for( final String provider : providers ) { + if ( provider.equalsIgnoreCase( my_tax_prov ) ) { + return true; + } + } + return false; + } + else { + return false; + } + } + + private static boolean match( final String s, + final String query, + final boolean case_sensitive, + final boolean partial ) { + if ( ForesterUtil.isEmpty( s ) || ForesterUtil.isEmpty( query ) ) { + return false; + } + String my_s = s.trim(); + String my_query = query.trim(); + if ( !case_sensitive ) { + my_s = my_s.toLowerCase(); + my_query = my_query.toLowerCase(); + } + if ( partial ) { + return my_s.indexOf( my_query ) >= 0; + } + else { + return my_s.equals( my_query ); + } + } + + public static void midpointRoot( final Phylogeny phylogeny ) { + if ( phylogeny.getNumberOfExternalNodes() < 2 ) { + return; + } + final PhylogenyMethods methods = getInstance(); + final double farthest_d = methods.calculateFurthestDistance( phylogeny ); + final PhylogenyNode f1 = methods.getFarthestNode1(); + final PhylogenyNode f2 = methods.getFarthestNode2(); + if ( farthest_d <= 0.0 ) { + return; + } + double x = farthest_d / 2.0; + PhylogenyNode n = f1; + if ( PhylogenyMethods.getDistance( f1, phylogeny.getRoot() ) < PhylogenyMethods.getDistance( f2, phylogeny + .getRoot() ) ) { + n = f2; + } + while ( ( x > n.getDistanceToParent() ) && !n.isRoot() ) { + x -= ( n.getDistanceToParent() > 0 ? n.getDistanceToParent() : 0 ); + n = n.getParent(); + } + phylogeny.reRoot( n, x ); + phylogeny.recalculateNumberOfExternalDescendants( true ); + final PhylogenyNode a = getFurthestDescendant( phylogeny.getRoot().getChildNode1() ); + final PhylogenyNode b = getFurthestDescendant( phylogeny.getRoot().getChildNode2() ); + final double da = getDistance( a, phylogeny.getRoot() ); + final double db = getDistance( b, phylogeny.getRoot() ); + if ( Math.abs( da - db ) > 0.000001 ) { + throw new FailedConditionCheckException( "this should not have happened: midpoint rooting failed: da=" + + da + ", db=" + db + ", diff=" + Math.abs( da - db ) ); + } + } + + public static void normalizeBootstrapValues( final Phylogeny phylogeny, + final double max_bootstrap_value, + final double max_normalized_value ) { + for( final PhylogenyNodeIterator iter = phylogeny.iteratorPreorder(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + if ( node.isInternal() ) { + final double confidence = getConfidenceValue( node ); + if ( confidence != Confidence.CONFIDENCE_DEFAULT_VALUE ) { + if ( confidence >= max_bootstrap_value ) { + setBootstrapConfidence( node, max_normalized_value ); + } + else { + setBootstrapConfidence( node, ( confidence * max_normalized_value ) / max_bootstrap_value ); + } + } + } + } + } + + public static List obtainAllNodesAsList( final Phylogeny phy ) { + final List nodes = new ArrayList(); + if ( phy.isEmpty() ) { + return nodes; + } + for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { + nodes.add( iter.next() ); + } + return nodes; + } + + public static void postorderBranchColorAveragingExternalNodeBased( final Phylogeny p ) { + for( final PhylogenyNodeIterator iter = p.iteratorPostorder(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + double red = 0.0; + double green = 0.0; + double 
blue = 0.0; + int n = 0; + if ( node.isInternal() ) { + for( final PhylogenyNodeIterator iterator = node.iterateChildNodesForward(); iterator.hasNext(); ) { + final PhylogenyNode child_node = iterator.next(); + final Color child_color = getBranchColorValue( child_node ); + if ( child_color != null ) { + ++n; + red += child_color.getRed(); + green += child_color.getGreen(); + blue += child_color.getBlue(); + } + } + setBranchColorValue( node, new Color( ForesterUtil.roundToInt( red / n ), ForesterUtil + .roundToInt( green / n ), ForesterUtil.roundToInt( blue / n ) ) ); + } + } + } + + public static void removeNode( final PhylogenyNode remove_me, final Phylogeny phylogeny ) { + if ( remove_me.isRoot() ) { + throw new IllegalArgumentException( "ill advised attempt to remove root node" ); + } + if ( remove_me.isExternal() ) { + phylogeny.deleteSubtree( remove_me, false ); + } + else { + final PhylogenyNode parent = remove_me.getParent(); + final List descs = remove_me.getDescendants(); + parent.removeChildNode( remove_me ); + for( final PhylogenyNode desc : descs ) { + parent.addAsChild( desc ); + desc.setDistanceToParent( addPhylogenyDistances( remove_me.getDistanceToParent(), desc + .getDistanceToParent() ) ); + } + remove_me.setParent( null ); + phylogeny.setIdHash( null ); + phylogeny.externalNodesHaveChanged(); + } + } + + public static List searchData( final String query, + final Phylogeny phy, + final boolean case_sensitive, + final boolean partial ) { + final List nodes = new ArrayList(); + if ( phy.isEmpty() || ( query == null ) ) { + return nodes; + } + if ( ForesterUtil.isEmpty( query ) ) { + return nodes; + } + for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + boolean match = false; + if ( match( node.getName(), query, case_sensitive, partial ) ) { + match = true; + } + else if ( node.getNodeData().isHasTaxonomy() + && match( node.getNodeData().getTaxonomy().getTaxonomyCode(), query, case_sensitive, partial ) ) { + match = true; + } + else if ( node.getNodeData().isHasTaxonomy() + && match( node.getNodeData().getTaxonomy().getCommonName(), query, case_sensitive, partial ) ) { + match = true; + } + else if ( node.getNodeData().isHasTaxonomy() + && match( node.getNodeData().getTaxonomy().getScientificName(), query, case_sensitive, partial ) ) { + match = true; + } + else if ( node.getNodeData().isHasTaxonomy() + && ( node.getNodeData().getTaxonomy().getIdentifier() != null ) + && match( node.getNodeData().getTaxonomy().getIdentifier().getValue(), + query, + case_sensitive, + partial ) ) { + match = true; + } + else if ( node.getNodeData().isHasTaxonomy() && !node.getNodeData().getTaxonomy().getSynonyms().isEmpty() ) { + final List syns = node.getNodeData().getTaxonomy().getSynonyms(); + I: for( final String syn : syns ) { + if ( match( syn, query, case_sensitive, partial ) ) { + match = true; + break I; + } + } + } + else if ( node.getNodeData().isHasSequence() + && match( node.getNodeData().getSequence().getName(), query, case_sensitive, partial ) ) { + match = true; + } + else if ( node.getNodeData().isHasSequence() + && match( node.getNodeData().getSequence().getSymbol(), query, case_sensitive, partial ) ) { + match = true; + } + else if ( node.getNodeData().isHasSequence() + && ( node.getNodeData().getSequence().getAccession() != null ) + && match( node.getNodeData().getSequence().getAccession().getValue(), + query, + case_sensitive, + partial ) ) { + match = true; + } + else if ( 
node.getNodeData().isHasSequence() + && ( node.getNodeData().getSequence().getDomainArchitecture() != null ) ) { + final DomainArchitecture da = node.getNodeData().getSequence().getDomainArchitecture(); + I: for( int i = 0; i < da.getNumberOfDomains(); ++i ) { + if ( match( da.getDomain( i ).getName(), query, case_sensitive, partial ) ) { + match = true; + break I; + } + } + } + if ( match ) { + nodes.add( node ); + } + } + return nodes; + } + + public static List searchDataLogicalAnd( final String[] queries, + final Phylogeny phy, + final boolean case_sensitive, + final boolean partial ) { + final List nodes = new ArrayList(); + if ( phy.isEmpty() || ( queries == null ) || ( queries.length < 1 ) ) { + return nodes; + } + for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + boolean all_matched = true; + for( final String query : queries ) { + boolean match = false; + if ( ForesterUtil.isEmpty( query ) ) { + continue; + } + if ( match( node.getName(), query, case_sensitive, partial ) ) { + match = true; + } + else if ( node.getNodeData().isHasTaxonomy() + && match( node.getNodeData().getTaxonomy().getTaxonomyCode(), query, case_sensitive, partial ) ) { + match = true; + } + else if ( node.getNodeData().isHasTaxonomy() + && match( node.getNodeData().getTaxonomy().getCommonName(), query, case_sensitive, partial ) ) { + match = true; + } + else if ( node.getNodeData().isHasTaxonomy() + && match( node.getNodeData().getTaxonomy().getScientificName(), query, case_sensitive, partial ) ) { + match = true; + } + else if ( node.getNodeData().isHasTaxonomy() + && ( node.getNodeData().getTaxonomy().getIdentifier() != null ) + && match( node.getNodeData().getTaxonomy().getIdentifier().getValue(), + query, + case_sensitive, + partial ) ) { + match = true; + } + else if ( node.getNodeData().isHasTaxonomy() + && !node.getNodeData().getTaxonomy().getSynonyms().isEmpty() ) { + final List syns = node.getNodeData().getTaxonomy().getSynonyms(); + I: for( final String syn : syns ) { + if ( match( syn, query, case_sensitive, partial ) ) { + match = true; + break I; + } + } + } + else if ( node.getNodeData().isHasSequence() + && match( node.getNodeData().getSequence().getName(), query, case_sensitive, partial ) ) { + match = true; + } + else if ( node.getNodeData().isHasSequence() + && match( node.getNodeData().getSequence().getSymbol(), query, case_sensitive, partial ) ) { + match = true; + } + else if ( node.getNodeData().isHasSequence() + && ( node.getNodeData().getSequence().getAccession() != null ) + && match( node.getNodeData().getSequence().getAccession().getValue(), + query, + case_sensitive, + partial ) ) { + match = true; + } + else if ( node.getNodeData().isHasSequence() + && ( node.getNodeData().getSequence().getDomainArchitecture() != null ) ) { + final DomainArchitecture da = node.getNodeData().getSequence().getDomainArchitecture(); + I: for( int i = 0; i < da.getNumberOfDomains(); ++i ) { + if ( match( da.getDomain( i ).getName(), query, case_sensitive, partial ) ) { + match = true; + break I; + } + } + } + if ( !match ) { + all_matched = false; + break; + } + } + if ( all_matched ) { + nodes.add( node ); + } + } + return nodes; + } + + /** + * Convenience method. + * Sets value for the first confidence value (created if not present, values overwritten otherwise). 
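A small sketch of the two search entry points above (searchData and searchDataLogicalAnd), assuming a Phylogeny instance phy obtained elsewhere; the query strings are purely illustrative:

    // case-insensitive, partial match against names, taxonomy, sequence and domain data:
    final List hits = PhylogenyMethods.searchData( "bcl2", phy, false, true );
    // every query must match the same node (logical AND), exact and case-sensitive:
    final List and_hits = PhylogenyMethods.searchDataLogicalAnd( new String[] { "HUMAN", "Bcl-2" },
                                                                 phy,
                                                                 true,
                                                                 false );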
+ */ + public static void setBootstrapConfidence( final PhylogenyNode node, final double bootstrap_confidence_value ) { + setConfidence( node, bootstrap_confidence_value, "bootstrap" ); + } + + public static void setBranchColorValue( final PhylogenyNode node, final Color color ) { + if ( node.getBranchData().getBranchColor() == null ) { + node.getBranchData().setBranchColor( new BranchColor() ); + } + node.getBranchData().getBranchColor().setValue( color ); + } + + /** + * Convenience method + */ + public static void setBranchWidthValue( final PhylogenyNode node, final double branch_width_value ) { + node.getBranchData().setBranchWidth( new BranchWidth( branch_width_value ) ); + } + + /** + * Convenience method. + * Sets value for the first confidence value (created if not present, values overwritten otherwise). + */ + public static void setConfidence( final PhylogenyNode node, final double confidence_value ) { + setConfidence( node, confidence_value, "" ); + } + + /** + * Convenience method. + * Sets value for the first confidence value (created if not present, values overwritten otherwise). + */ + public static void setConfidence( final PhylogenyNode node, final double confidence_value, final String type ) { + Confidence c = null; + if ( node.getBranchData().getNumberOfConfidences() > 0 ) { + c = node.getBranchData().getConfidence( 0 ); + } + else { + c = new Confidence(); + node.getBranchData().addConfidence( c ); + } + c.setType( type ); + c.setValue( confidence_value ); + } + + public static void setScientificName( final PhylogenyNode node, final String scientific_name ) { + if ( !node.getNodeData().isHasTaxonomy() ) { + node.getNodeData().setTaxonomy( new Taxonomy() ); + } + node.getNodeData().getTaxonomy().setScientificName( scientific_name ); + } + + /** + * Convenience method to set the taxonomy code of a phylogeny node. + * + * + * @param node + * @param taxonomy_code + */ + public static void setTaxonomyCode( final PhylogenyNode node, final String taxonomy_code ) { + if ( !node.getNodeData().isHasTaxonomy() ) { + node.getNodeData().setTaxonomy( new Taxonomy() ); + } + node.getNodeData().getTaxonomy().setTaxonomyCode( taxonomy_code ); + } + + /** + * Removes from Phylogeny to_be_stripped all external Nodes which are + * associated with a species NOT found in Phylogeny reference. 
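A brief sketch of the setter convenience methods above; node is an assumed, existing PhylogenyNode and the values are illustrative only:

    PhylogenyMethods.setBootstrapConfidence( node, 87 );         // stored as confidence of type "bootstrap"
    PhylogenyMethods.setBranchWidthValue( node, 2.0 );
    PhylogenyMethods.setTaxonomyCode( node, "HUMAN" );           // creates a Taxonomy object if none is present
    PhylogenyMethods.setScientificName( node, "Homo sapiens" );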
+ * + * @param reference + * a reference Phylogeny + * @param to_be_stripped + * Phylogeny to be stripped + * @return number of external nodes removed from to_be_stripped + */ + public static int taxonomyBasedDeletionOfExternalNodes( final Phylogeny reference, final Phylogeny to_be_stripped ) { + final Set ref_ext_taxo = new HashSet(); + final ArrayList nodes_to_delete = new ArrayList(); + for( final PhylogenyNodeIterator it = reference.iteratorExternalForward(); it.hasNext(); ) { + ref_ext_taxo.add( getSpecies( it.next() ) ); + } + for( final PhylogenyNodeIterator it = to_be_stripped.iteratorExternalForward(); it.hasNext(); ) { + final PhylogenyNode n = it.next(); + if ( !ref_ext_taxo.contains( getSpecies( n ) ) ) { + nodes_to_delete.add( n ); + } + } + for( final PhylogenyNode phylogenyNode : nodes_to_delete ) { + to_be_stripped.deleteSubtree( phylogenyNode, true ); + } + return nodes_to_delete.size(); + } +} diff --git a/forester/java/src/org/forester/phylogeny/PhylogenyNode.java b/forester/java/src/org/forester/phylogeny/PhylogenyNode.java new file mode 100644 index 0000000..3858ea9 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/PhylogenyNode.java @@ -0,0 +1,1032 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny; + +import java.util.ArrayList; +import java.util.List; + +import org.forester.io.parsers.nhx.NHXFormatException; +import org.forester.io.parsers.nhx.NHXParser; +import org.forester.io.parsers.util.PhylogenyParserException; +import org.forester.phylogeny.data.BranchData; +import org.forester.phylogeny.data.NodeData; +import org.forester.phylogeny.iterators.ChildNodeIteratorForward; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.phylogeny.iterators.PreorderTreeIterator; +import org.forester.util.ForesterUtil; + +public class PhylogenyNode implements PhylogenyNodeI, Comparable { + + /** Value of -99.0 is used as default value. 
*/ + public final static double DISTANCE_DEFAULT = -1024.0; + private static int _node_count = 0; + private byte _indicator; + private int _id; + private int _sum_ext_nodes; + private float _x; + private float _y; + private double _distance_parent; + private boolean _collapse; + private PhylogenyNode _parent; + private PhylogenyNode _link; + private ArrayList _descendants; + private NodeData _node_data; + private BranchData _branch_data; + private float _x_secondary; + private float _y_secondary; + + /** + * Default constructor for PhylogenyNode. + */ + public PhylogenyNode() { + init(); + setId( PhylogenyNode.getNodeCount() ); + PhylogenyNode.increaseNodeCount(); + setSumExtNodes( 1 ); // For ext node, this number is 1 (not 0!!) + } + + public PhylogenyNode( final String nhx ) throws NHXFormatException { + this( nhx, ForesterUtil.TAXONOMY_EXTRACTION.NO ); + } + + public PhylogenyNode( final String nhx, final ForesterUtil.TAXONOMY_EXTRACTION taxonomy_extraction ) + throws NHXFormatException { + init(); + NHXParser.parseNHX( nhx, this, taxonomy_extraction, false ); + setId( PhylogenyNode.getNodeCount() ); + PhylogenyNode.increaseNodeCount(); + setSumExtNodes( 1 ); // For ext node, this number is 1 (not 0!!) + } + + /** + * Constructor for PhylogenyNode. + *

    + * + * @param s + * String representing one PhylogenyNode in New Hampshire (NH) or + * New Hampshire X (NHX) format. + * @throws NHXFormatException + * @throws PhylogenyParserException + */ + public PhylogenyNode( final String nhx, + final ForesterUtil.TAXONOMY_EXTRACTION taxonomy_extraction, + final boolean replace_underscores ) throws NHXFormatException { + init(); + NHXParser.parseNHX( nhx, this, taxonomy_extraction, replace_underscores ); + setId( PhylogenyNode.getNodeCount() ); + PhylogenyNode.increaseNodeCount(); + setSumExtNodes( 1 ); // For ext node, this number is 1 (not 0!!) + } + + /** + * Adds PhylogenyNode n to the list of child nodes and sets the _parent of n + * to this. + * + * @param n + * the PhylogenyNode to add + */ + final public void addAsChild( final PhylogenyNodeI node ) { + final PhylogenyNode n = ( PhylogenyNode ) node; + addChildNode( n ); + n.setParent( this ); + } + + /** + * Adds PhylogenyNode n to the list of child nodes. But does NOT set the + * _parent of n to this. + * + * @see addAsChild( PhylogenyNode n ) + * @param n + * the PhylogenyNode to add + */ + final private void addChildNode( final PhylogenyNode child ) { + getDescendants().add( child ); + } + + final public int compareTo( final PhylogenyNode o ) { + final PhylogenyNode n = o; + if ( ( getName() == null ) || ( n.getName() == null ) ) { + return 0; + } + return getName().compareTo( n.getName() ); + } + + // --------------------------------------------------------- + // Copy and delete Nodes, copy subtress + // --------------------------------------------------------- + /** + * Returns a new PhylogenyNode which has its data copied from this + * PhylogenyNode. Links to the other Nodes in the same Phylogeny are NOT + * copied (e.g. _link to _parent). Field "_link" IS copied. + * + * @see #getLink() + */ + final public PhylogenyNode copyNodeData() { + final PhylogenyNode node = new PhylogenyNode(); + PhylogenyNode.decreaseNodeCount(); + node._id = _id; + node._sum_ext_nodes = _sum_ext_nodes; + node._indicator = _indicator; + node._x = _x; + node._y = _y; + node._distance_parent = _distance_parent; + node._collapse = _collapse; + node._link = _link; + if ( _node_data != null ) { + node._node_data = ( NodeData ) _node_data.copy(); + } + if ( _branch_data != null ) { + node._branch_data = ( BranchData ) _branch_data.copy(); + } + return node; + } + + /** + * Returns a new PhylogenyNode which has the same data as this + * PhylogenyNode. Links to the other Nodes in the same Phylogeny are NOT + * copied (e.g. _link to _parent). Field "_link" IS copied. + * + * @see #getLink() + */ + final public PhylogenyNode copyNodeDataShallow() { + final PhylogenyNode node = new PhylogenyNode(); + PhylogenyNode.decreaseNodeCount(); + node._id = _id; + node._sum_ext_nodes = _sum_ext_nodes; + node._indicator = _indicator; + node._x = _x; + node._y = _y; + node._distance_parent = _distance_parent; + node._collapse = _collapse; + node._link = _link; + node._node_data = _node_data; + node._branch_data = _branch_data; + return node; + } + + @Override + /** + * Based on node name, sequence, and taxonomy. 
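A sketch of building nodes from New Hampshire (X) strings with the constructors above and linking them via addAsChild; the NHX fragment is illustrative and parsing errors must be handled by the caller:

    final PhylogenyNode parent = new PhylogenyNode();
    try {
        // illustrative NHX string; taxonomy extraction switched off
        final PhylogenyNode child = new PhylogenyNode( "BCL2_HUMAN:0.3[&&NHX:S=HUMAN]",
                                                       ForesterUtil.TAXONOMY_EXTRACTION.NO );
        parent.addAsChild( child );
    }
    catch ( final NHXFormatException e ) {
        // malformed node string
    }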
+ * + * + */ + final public boolean equals( final Object o ) { + if ( this == o ) { + return true; + } + else if ( o == null ) { + return false; + } + else if ( o.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to check [" + this.getClass() + "] equality to " + o + " [" + + o.getClass() + "]" ); + } + else { + final PhylogenyNode other = ( PhylogenyNode ) o; + if ( !getName().equals( other.getName() ) ) { + return false; + } + final NodeData this_data = getNodeData(); + final NodeData other_data = other.getNodeData(); + if ( ( this_data.isHasSequence() && other_data.isHasSequence() ) + && ( this_data.isHasTaxonomy() && other_data.isHasTaxonomy() ) ) { + return ( this_data.getTaxonomy().isEqual( other_data.getTaxonomy() ) && this_data.getSequence() + .isEqual( other_data.getSequence() ) ); + } + else if ( this_data.isHasSequence() && other_data.isHasSequence() ) { + return ( this_data.getSequence().isEqual( other_data.getSequence() ) ); + } + else if ( this_data.isHasTaxonomy() && other_data.isHasTaxonomy() ) { + return ( this_data.getTaxonomy().isEqual( other_data.getTaxonomy() ) ); + } + else if ( getName().length() > 0 ) { + // Node name is not empty, and equal. + return true; + } + else { + return false; + } + } + } + + // --------------------------------------------------------- + // Obtaining of Nodes + // --------------------------------------------------------- + /** + * Returns a List containing references to all external children of this + * PhylogenyNode. + * + * @return List of references to external Nodes + */ + final public List getAllExternalDescendants() { + final List nodes = new ArrayList(); + if ( isExternal() ) { + nodes.add( this ); + return nodes; + } + PhylogenyNode node1 = this; + while ( !node1.isExternal() ) { + node1 = node1.getFirstChildNode(); + } + PhylogenyNode node2 = this; + while ( !node2.isExternal() ) { + node2 = node2.getLastChildNode(); + } + while ( node1 != node2 ) { + nodes.add( node1 ); + node1 = node1.getNextExternalNode(); + } + nodes.add( node2 ); + return nodes; + } + + /** + * Returns a List containing references to all names of the external + * children of this PhylogenyNode. + * + * @return List of references to names of external Nodes + */ + final public List getAllExternalDescendantsNames() { + final List c = getAllExternalDescendants(); + final List n = new ArrayList( c.size() ); + for( final PhylogenyNode phylogenyNode : c ) { + n.add( phylogenyNode.getName() ); + } + return n; + } + + final public BranchData getBranchData() { + if ( _branch_data == null ) { + _branch_data = new BranchData(); + } + return _branch_data; + } + + final BranchData getBranchDataDirectly() { + return _branch_data; + } + + /** + * This return child node n of this node. + * + * @param n + * the index of the child to get + * @return the child node with index n + * @throws IllegalArgumentException + * if n is out of bounds + */ + final public PhylogenyNode getChildNode( final int i ) { + if ( isExternal() ) { + throw new UnsupportedOperationException( "attempt to get the child node of an external node." ); + } + if ( ( i >= getNumberOfDescendants() ) || ( i < 0 ) ) { + throw new IllegalArgumentException( "attempt to get child node " + i + " of a node with " + + getNumberOfDescendants() + " child nodes" ); + } + return getDescendants().get( i ); + } + + /** + * Convenience method. Returns the first child PhylogenyNode of this + * PhylogenyNode. 
+ */ + final public PhylogenyNode getChildNode1() { + return getChildNode( 0 ); + } + + /** + * Convenience method. Returns the second child PhylogenyNode of this + * PhylogenyNode. + *

    + * [last modified May 18, 2005 by CMZ] + */ + final public PhylogenyNode getChildNode2() { + return getChildNode( 1 ); + } + + /** + * This gets the child node index of this node. + *

    + * + * @return the child node index of this node + * @throws UnsupportedOperationException + * if this node is a root node + */ + final public int getChildNodeIndex() { + return getChildNodeIndex( getParent() ); + } + + /** + * Gets the child node index of this node, given that parent is this + * node's parent. + *

    + * [last modified Aug 14, 2006 by CMZ] + * + * @return the child node index of this node + * @throws UnsupportedOperationException + * if this node is a root node + */ + final public int getChildNodeIndex( final PhylogenyNode parent ) { + if ( isRoot() ) { + throw new UnsupportedOperationException( "Cannot get the child index for a root node." ); + } + for( int i = 0; i < parent.getNumberOfDescendants(); ++i ) { + if ( parent.getChildNode( i ) == this ) { + return i; + } + } + throw new RuntimeException( "Unexpected exception: Could not determine the child index for node: " + this ); + } + + final public List getDescendants() { + return _descendants; + } + + /** + * Returns the length of the branch leading to the _parent of this + * PhylogenyNode (double). + */ + final public double getDistanceToParent() { + return _distance_parent; + } + + /** + * Convenience method. Returns the first child node of this node. + *

    + * [last modified May 18, 2005 by CMZ] + * + * @return the first child node of this node + */ + public final PhylogenyNode getFirstChildNode() { + return getChildNode( 0 ); + } + + /** + * Returns the _indicator value of this PhylogenyNode. + */ + public final byte getIndicator() { + return _indicator; + } + + /** + * Convenience method. Returns the last child node of this node. + *

    + * [last modified May 18, 2005 by CMZ] + * + * @return the last child node of this node + */ + public final PhylogenyNode getLastChildNode() { + return getChildNode( getNumberOfDescendants() - 1 ); + } + + /** + * Returns a refernce to the linked PhylogenyNode of this PhylogenyNode. + * Currently, this method is only used for the speciation-_duplication + * assignment algorithms. + */ + public final PhylogenyNode getLink() { + return _link; + } + + /** + * Returns a refernce to the next external PhylogenyNode of this + * PhylogenyNode. TODO should be in Phylogeny. Returns null if no next + * external node is available. + */ + public final PhylogenyNode getNextExternalNode() { + if ( isInternal() ) { + throw new UnsupportedOperationException( "attempt to get next external node of an internal node" ); + } + else if ( isLastExternalNode() ) { + return null; + } + int index = getChildNodeIndex(); + PhylogenyNode previous_node = this; + PhylogenyNode current_node = getParent(); + while ( !current_node.isRoot() + && ( ( current_node.getNumberOfDescendants() == 1 ) || previous_node.isLastChildNode() ) ) { + index = current_node.getChildNodeIndex(); + previous_node = current_node; + current_node = current_node.getParent(); + } + current_node = current_node.getChildNode( index + 1 ); + while ( current_node.isInternal() ) { + current_node = current_node.getFirstChildNode(); + } + return current_node; + } + + public final NodeData getNodeData() { + if ( _node_data == null ) { + _node_data = new NodeData(); + } + return _node_data; + } + + final NodeData getNodeDataDirectly() { + return _node_data; + } + + // --------------------------------------------------------- + // Set and get methods for Nodes + // --------------------------------------------------------- + /** + * Returns the ID (int) of this PhylogenyNode. + */ + final public int getId() { + return _id; + } + + /** + * Returns the name of this node. + */ + final public String getName() { + return getNodeData().getNodeName(); + } + + final public int getNumberOfDescendants() { + return _descendants.size(); + } + + /** + * Returns the total number of external Nodes originating from this + * PhylogenyNode (int). + */ + final public int getNumberOfExternalNodes() { + return _sum_ext_nodes; + } + + final public int getNumberOfParents() { + return 1; + } + + /** + * Returns a refernce to the parent PhylogenyNode of this PhylogenyNode. + */ + final public PhylogenyNode getParent() { + return _parent; + } + + /** + * Returns a refernce to the next external PhylogenyNode of this + * PhylogenyNode. TODO should be in Phylogeny. Returns null if no next + * external node is available. + */ + final public PhylogenyNode getPreviousExternalNode() { + if ( isInternal() ) { + throw new UnsupportedOperationException( "Cannot get the previous external node for an internal node." ); + } + else if ( isRoot() /* TODO && tree is rooted */) { + throw new UnsupportedOperationException( "Cannot get the previous external node for a root node." ); + } + else if ( isFirstExternalNode() ) { + throw new UnsupportedOperationException( "Attempt to get previous external node of the first external node." 
); + } + int index = getChildNodeIndex(); + PhylogenyNode previous_node = this; + PhylogenyNode current_node = getParent(); + while ( !current_node.isRoot() + && ( ( current_node.getNumberOfDescendants() == 1 ) || previous_node.isFirstChildNode() ) ) { + index = current_node.getChildNodeIndex(); + previous_node = current_node; + current_node = current_node.getParent(); + } + current_node = current_node.getChildNode( index - 1 ); + while ( current_node.isInternal() ) { + current_node = current_node.getLastChildNode(); + } + return current_node; + } + + /** + * Used for drawing of Trees. + */ + final public float getXcoord() { + return _x; + } + + final public float getXSecondary() { + return _x_secondary; + } + + /** + * Used for drawing of Trees. + */ + final public float getYcoord() { + return _y; + } + + final public float getYSecondary() { + return _y_secondary; + } + + @Override + final public int hashCode() { + final NodeData data = getNodeData(); + if ( ( getName().length() < 1 ) && !data.isHasSequence() && !data.isHasTaxonomy() ) { + return super.hashCode(); + } + int result = getName().hashCode(); + if ( data.isHasSequence() ) { + result ^= data.getSequence().hashCode(); + } + if ( data.isHasTaxonomy() ) { + result ^= data.getTaxonomy().hashCode(); + } + return result; + } + + final private void init() { + _descendants = new ArrayList(); + _parent = null; + _id = 0; + initializeData(); + } + + /** + * Deletes data of this PhylogenyNode. Links to the other Nodes in the + * Phylogeny, the ID and the sum of external nodes are NOT deleted. Field + * "_link" (_link to Nodes in other Phylogeny) IS deleted. + * + * @see #getLink() (Last modified: 12/20/03) + */ + final public void initializeData() { + _indicator = 0; + _x = 0; + _y = 0; + //_node_name = ""; + _distance_parent = PhylogenyNode.DISTANCE_DEFAULT; + _collapse = false; + _link = null; + _branch_data = null; + _node_data = null; + } + + /** + * Returns whether this PhylogenyNode should be drawn as collapsed. + */ + final public boolean isCollapse() { + return _collapse; + } + + /** + * Returns true if this PhylogenyNode represents a _duplication event, false + * otherwise. + */ + final public boolean isDuplication() { + return getNodeData().isHasEvent() && getNodeData().getEvent().isDuplication(); + } + + /** + * Checks whether this PhylogenyNode is external (tip). + * + * @return true if this PhylogenyNode is external, false otherwise + */ + final public boolean isExternal() { + return ( getNumberOfDescendants() < 1 ); + } + + /** + * DOCUMENT ME! + * + * @return DOCUMENT ME! + */ + final public boolean isFirstChildNode() { + if ( isRoot() /* and tree is rooted TODO */) { + throw new UnsupportedOperationException( "Cannot determine whether the root is the first child node of its _parent." ); + } + return ( getChildNodeIndex() == 0 ); + } + + /** + * DOCUMENT ME! + * + * @return DOCUMENT ME! + */ + final public boolean isFirstExternalNode() { + if ( isInternal() ) { + return false; + } + PhylogenyNode node = this; + while ( !node.isRoot() ) { + if ( !node.isFirstChildNode() ) { + return false; + } + node = node.getParent(); + } + return true; + } + + /** + * Returns whether a _duplication or speciation event has been assigned for + * this PhylogenyNode. + */ + final public boolean isHasAssignedEvent() { + if ( !getNodeData().isHasEvent() ) { + return false; + } + if ( ( getNodeData().getEvent() ).isUnassigned() ) { + return false; + } + return true; + } + + /** + * Checks whether this PhylogenyNode is internal (tip). 
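A short sketch of walking the external (tip) nodes of a tree with the navigation methods above, assuming a Phylogeny instance phy obtained elsewhere:

    PhylogenyNode tip = phy.getRoot();
    while ( !tip.isExternal() ) {
        tip = tip.getFirstChildNode();
    }
    while ( tip != null ) {
        System.out.println( tip.getName() );
        tip = tip.getNextExternalNode(); // returns null after the last external node
    }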
    + * + * @return true if this PhylogenyNode is internal, false otherwise + */ + final public boolean isInternal() { + return ( !isExternal() ); + } + + /** + * Returns true if this node is the last child node of its _parent. + *

    + * [last modified June 01, 2005 by CMZ] + * + * @return true if this node is the last child node of its _parent, false + * otherwise + */ + final public boolean isLastChildNode() { + if ( isRoot() /* and tree is rooted TODO */) { + throw new UnsupportedOperationException( "Cannot determine whether the root is the last child node of its _parent." ); + } + return ( getChildNodeIndex() == ( getParent().getNumberOfDescendants() - 1 ) ); + } + + /** + * DOCUMENT ME! + * + * @return DOCUMENT ME! + */ + final public boolean isLastExternalNode() { + if ( isInternal() ) { + return false; + } + PhylogenyNode node = this; + while ( !node.isRoot() ) { + if ( !node.isLastChildNode() ) { + return false; + } + node = node.getParent(); + } + return true; + } + + /** + * Checks whether this PhylogenyNode is a root. + * + * @return true if this PhylogenyNode is the root, false otherwise + */ + final public boolean isRoot() { + return _parent == null; + } + + final public boolean isSpeciation() { + return getNodeData().isHasEvent() && getNodeData().getEvent().isSpeciation(); + } + + // --------------------------------------------------------- + // Iterator + // --------------------------------------------------------- + final public PhylogenyNodeIterator iterateChildNodesForward() { + return new ChildNodeIteratorForward( this ); + } + + // --------------------------------------------------------- + // Basic printing + // --------------------------------------------------------- + /** + * Prints to the console the subtree originating from this PhylogenyNode in + * preorder. + */ + public void preorderPrint() { + System.out.println( this + "\n" ); + if ( isInternal() ) { + for( int i = 0; i < getNumberOfDescendants(); ++i ) { + getChildNode( i ).preorderPrint(); + } + } + } + + final public void removeChildNode( final int i ) { + if ( isExternal() ) { + throw new UnsupportedOperationException( "cannot get the child node for a external node." ); + } + if ( ( i >= getNumberOfDescendants() ) || ( i < 0 ) ) { + throw new IllegalArgumentException( "attempt to get child node " + i + " of a node with " + + getNumberOfDescendants() + " child nodes." ); + } + getDescendants().remove( i ); + } + + final public void removeChildNode( final PhylogenyNode remove_me ) { + removeChildNode( remove_me.getChildNodeIndex() ); + } + + final public void setBranchData( final BranchData branch_data ) { + _branch_data = branch_data; + } + + /** + * Sets the first child PhylogenyNode of this PhylogenyNode to n. + */ + final public void setChild1( final PhylogenyNode n ) { + setChildNode( 0, n ); + } + + /** + * Sets the second child PhylogenyNode of this PhylogenyNode to n. + */ + final public void setChild2( final PhylogenyNode n ) { + setChildNode( 1, n ); + } + + /** + * Inserts PhylogenyNode n at the specified position i into the list of + * child nodes. This does not allow null slots in the list of child nodes: + * If i is larger than the number of child nodes, n is just added to the + * list, not place at index i. 
+ * + * @param i + * the index of position where to add the child + * @param n + * the PhylogenyNode to add + */ + final public void setChildNode( final int i, final PhylogenyNode node ) { + node.setParent( this ); + if ( getNumberOfDescendants() <= i ) { + addChildNode( node ); + } + else { + getDescendants().set( i, node ); + } + } + + final void setChildNodeOnly( final int i, final PhylogenyNode node ) { + if ( getNumberOfDescendants() <= i ) { + addChildNode( node ); + } + else { + getDescendants().set( i, node ); + } + } + + /** + * Sets whether this PhylogenyNode should be drawn as collapsed. + */ + final public void setCollapse( final boolean b ) { + _collapse = b; + } + + /** + * Sets the length of the branch leading to the _parent of this + * PhylogenyNode to double d. + */ + final public void setDistanceToParent( final double d ) { + _distance_parent = d; + } + + /** + * Sets the _indicator value of this PhylogenyNode to i. + */ + final public void setIndicator( final byte i ) { + _indicator = i; + } + + // -------------------------------------------------------------------- + // Adjust methods (related to Phylogeny construction and + // Phylogeny modification) + // -------------------------------------------------------------------- + /** + * Sets the indicators of all the children of this PhylogenyNode to zero. + */ + final void setIndicatorsToZero() { + for( final PreorderTreeIterator it = new PreorderTreeIterator( this ); it.hasNext(); ) { + it.next().setIndicator( ( byte ) 0 ); + } + } + + /** + * Sets the linked PhylogenyNode of this PhylogenyNode to n. Currently, this + * method is only used for the speciation-_duplication assignment + * algorithms. + */ + final public void setLink( final PhylogenyNode n ) { + _link = n; + } + + /** + * Sets the name of this node. + */ + final public void setName( final String node_name ) { + getNodeData().setNodeName( node_name ); + } + + /** + * Sets the Id of this PhylogenyNode to i. In most cases, this number + * should not be set to values lower than getNodeCount() -- which this method + * does not allow. + */ + synchronized final protected void setId( final int i ) { + if ( i < getNodeCount() ) { + throw new IllegalArgumentException( "attempt to set node id to a value less than total node count (thus violating the uniqueness of node ids)" ); + } + _id = i; + } + + /** + * Sets the _parent PhylogenyNode of this PhylogenyNode to n. + */ + final public void setParent( final PhylogenyNode n ) { + _parent = n; + } + + /** + * Sets the total number of external Nodes originating from this + * PhylogenyNode to i (int). + */ + final public void setSumExtNodes( final int i ) { + if ( i < 0 ) { + throw new IllegalArgumentException( "attempt to set sum of external nodes to less than one" ); + } + _sum_ext_nodes = i; + } + + /** + * Used for drawing of Trees. + */ + final public void setXcoord( final float x ) { + _x = x; + } + + final public void setXSecondary( final float x_secondary ) { + _x_secondary = x_secondary; + } + + // ----------- + /** + * Used for drawing of Trees. 
+ */ + final public void setYcoord( final float y ) { + _y = y; + } + + final public void setYSecondary( final float y_secondary ) { + _y_secondary = y_secondary; + } + + // --------------------------------------------------------- + // Writing of Nodes to Strings + // --------------------------------------------------------- + final public String toNewHampshire( final boolean simple_nh, final boolean write_distance_to_parent ) { + final StringBuilder sb = new StringBuilder(); + String data = ""; + if ( !ForesterUtil.isEmpty( getName() ) ) { + data = getName(); + } + else if ( getNodeData().isHasTaxonomy() ) { + if ( !ForesterUtil.isEmpty( getNodeData().getTaxonomy().getTaxonomyCode() ) ) { + data = getNodeData().getTaxonomy().getTaxonomyCode(); + } + else if ( !ForesterUtil.isEmpty( getNodeData().getTaxonomy().getScientificName() ) ) { + data = getNodeData().getTaxonomy().getScientificName(); + } + else if ( !ForesterUtil.isEmpty( getNodeData().getTaxonomy().getCommonName() ) ) { + data = getNodeData().getTaxonomy().getCommonName(); + } + else if ( getNodeData().getTaxonomy().getTaxonomyCode() != null ) { + data = getNodeData().getTaxonomy().getTaxonomyCode(); + } + } + else if ( getNodeData().isHasSequence() ) { + if ( !ForesterUtil.isEmpty( getNodeData().getSequence().getName() ) ) { + data = getNodeData().getSequence().getName(); + } + } + if ( data.length() > 0 ) { + data = ForesterUtil.replaceIllegalNhCharacters( data ); + if ( simple_nh && ( data.length() > 10 ) ) { + data = data.substring( 0, 11 ); + } + if ( ForesterUtil.isContainsParanthesesableNhCharacter( data ) ) { + sb.append( '\'' ); + sb.append( data ); + sb.append( '\'' ); + } + else { + sb.append( data ); + } + } + if ( ( getDistanceToParent() != PhylogenyNode.DISTANCE_DEFAULT ) && write_distance_to_parent ) { + sb.append( ":" ); + sb.append( getDistanceToParent() ); + } + return sb.toString(); + } + + /** + * Converts this PhylogenyNode to a New Hampshire X (NHX) String + * representation. + */ + final public String toNewHampshireX() { + final StringBuffer sb = new StringBuffer(); + final StringBuffer s_nhx = new StringBuffer(); + if ( !ForesterUtil.isEmpty( getName() ) ) { + final String name = ForesterUtil.replaceIllegalNhCharacters( getName() ); + if ( ForesterUtil.isContainsParanthesesableNhCharacter( name ) ) { + sb.append( '\'' ); + sb.append( name ); + sb.append( '\'' ); + } + else { + sb.append( name ); + } + } + if ( getDistanceToParent() != PhylogenyNode.DISTANCE_DEFAULT ) { + sb.append( ":" ); + sb.append( getDistanceToParent() ); + } + if ( getNodeDataDirectly() != null ) { + s_nhx.append( getNodeDataDirectly().toNHX() ); + } + if ( getBranchDataDirectly() != null ) { + s_nhx.append( getBranchDataDirectly().toNHX() ); + } + if ( s_nhx.length() > 0 ) { + sb.append( "[&&NHX" ); + sb.append( s_nhx ); + sb.append( "]" ); + } + return sb.toString(); + } + + @Override + final public String toString() { + final StringBuilder sb = new StringBuilder(); + if ( !ForesterUtil.isEmpty( getName() ) ) { + sb.append( getName() ); + sb.append( " " ); + } + sb.append( "[" ); + sb.append( getId() ); + sb.append( "]" ); + return sb.toString(); + } + + /** + * Decreases the total number of all Nodes created so far by one. + */ + final static synchronized void decreaseNodeCount() { + --PhylogenyNode._node_count; + } + + /** + * Returns the total number of all Nodes created so far. 
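A sketch of the per-node New Hampshire output methods above; node is an assumed PhylogenyNode, and writing a complete phylogeny would normally go through PhylogenyWriter instead:

    // plain NH (full name, with branch length) and NHX representation of a single node
    final String nh = node.toNewHampshire( false, true );
    final String nhx = node.toNewHampshireX();
    System.out.println( nh + " | " + nhx );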
+ * + * @return total number of Nodes (int) + */ + synchronized final public static int getNodeCount() { + return PhylogenyNode._node_count; + } + + /** + * Increases the total number of all Nodes created so far by one. + */ + synchronized final private static void increaseNodeCount() { + ++PhylogenyNode._node_count; + } + + /** + * Sets the total number of all Nodes created so far to i (int). + */ + synchronized final static void setNodeCount( final int i ) { + PhylogenyNode._node_count = i; + } +} diff --git a/forester/java/src/org/forester/phylogeny/PhylogenyNodeI.java b/forester/java/src/org/forester/phylogeny/PhylogenyNodeI.java new file mode 100644 index 0000000..3920875 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/PhylogenyNodeI.java @@ -0,0 +1,47 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny; + +public interface PhylogenyNodeI { + + public void addAsChild( PhylogenyNodeI node ); + + public PhylogenyNode getChildNode( int i ); + + public double getDistanceToParent(); + + public int getId(); + + public String getName(); + + public void setDistanceToParent( double d ); + + public void setName( String name ); + + public void setParent( PhylogenyNode phylogenyNode ); +} diff --git a/forester/java/src/org/forester/phylogeny/data/Accession.java b/forester/java/src/org/forester/phylogeny/data/Accession.java new file mode 100644 index 0000000..295b8b7 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/Accession.java @@ -0,0 +1,142 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; + +import org.forester.io.parsers.nhx.NHXtags; +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.util.ForesterUtil; + +public class Accession implements PhylogenyData { + + final String _value; + final String _source; + + public Accession( final String value, final String source ) { + _value = value; + _source = source; + } + + public StringBuffer asSimpleText() { + return new StringBuffer( getValue() ); + } + + public StringBuffer asText() { + final StringBuffer sb = new StringBuffer(); + if ( !ForesterUtil.isEmpty( getSource() ) ) { + sb.append( "[" ); + sb.append( getSource() ); + sb.append( "] " ); + } + sb.append( getValue() ); + return sb; + } + + public PhylogenyData copy() { + return new Accession( new String( getValue() ), new String( getSource() ) ); + } + + @Override + public boolean equals( final Object o ) { + if ( this == o ) { + return true; + } + else if ( o == null ) { + return false; + } + else if ( o.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to check [" + this.getClass() + "] equality to " + o + " [" + + o.getClass() + "]" ); + } + else { + return isEqual( ( Accession ) o ); + } + } + + public String getSource() { + return _source; + } + + public String getValue() { + return _value; + } + + @Override + public int hashCode() { + if ( getSource() != null ) { + return ( getSource() + getValue() ).hashCode(); + } + return getValue().hashCode(); + } + + public boolean isEqual( final PhylogenyData data ) { + if ( this == data ) { + return true; + } + if ( ( data == null ) || ( getValue() == null ) ) { + return false; + } + final Accession a = ( Accession ) data; + if ( ( getSource() != null ) && ( a.getSource() != null ) ) { + return ( a.getValue().equals( getValue() ) && a.getSource().equals( getSource() ) ); + } + return ( a.getValue().equals( getValue() ) ); + } + + public StringBuffer toNHX() { + final StringBuffer sb = new StringBuffer(); + sb.append( ":" ); + sb.append( NHXtags.SEQUENCE_ACCESSION ); + sb.append( ForesterUtil.replaceIllegalNhxCharacters( getValue() ) ); + return sb; + } + + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + if ( ForesterUtil.isEmpty( getSource() ) ) { + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.ACCESSION, + getValue(), + PhyloXmlMapping.ACCESSION_SOURCE_ATTR, + "unknown", + indentation ); + } + else { + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.ACCESSION, + getValue(), + PhyloXmlMapping.ACCESSION_SOURCE_ATTR, + getSource(), + indentation ); + } + } + + @Override + public String toString() { + return asText().toString(); + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/Annotation.java b/forester/java/src/org/forester/phylogeny/data/Annotation.java new file mode 100644 index 0000000..cba8b55 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/Annotation.java @@ -0,0 +1,282 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. 
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.List; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.util.ForesterUtil; + +public class Annotation implements PhylogenyData, MultipleUris, Comparable { + + private String _desc; + private String _type; + private String _source; + private final String _ref; + private String _evidence; + private Confidence _confidence; + private PropertiesMap _properties; + private List _uris; + + public Annotation( final String ref ) { + if ( ForesterUtil.isEmpty( ref ) ) { + throw new IllegalArgumentException( "illegal attempt to create Annotation with null or empty reference" ); + } + if ( ( ref.indexOf( ':' ) < 1 ) || ( ref.length() < 3 ) ) { + throw new IllegalArgumentException( "illegal format for Annotation reference: [" + ref + "]" ); + } + _ref = ref; + init(); + } + + @Override + public StringBuffer asSimpleText() { + return new StringBuffer( getDesc() ); + } + + @Override + public StringBuffer asText() { + return new StringBuffer( getDesc() ); + } + + @Override + public PhylogenyData copy() { + final Annotation ann = new Annotation( new String( getRef() ) ); + if ( getConfidence() != null ) { + ann.setConfidence( ( Confidence ) getConfidence().copy() ); + } + else { + ann.setConfidence( null ); + } + ann.setType( new String( getType() ) ); + ann.setDesc( new String( getDesc() ) ); + ann.setEvidence( new String( getEvidence() ) ); + ann.setSource( new String( getSource() ) ); + if ( getProperties() != null ) { + ann.setProperties( ( PropertiesMap ) getProperties().copy() ); + } + else { + ann.setProperties( null ); + } + if ( getUris() != null ) { + ann.setUris( new ArrayList() ); + for( final Uri uri : getUris() ) { + if ( uri != null ) { + ann.getUris().add( uri ); + } + } + } + return ann; + } + + public Confidence getConfidence() { + return _confidence; + } + + public String getDesc() { + return _desc; + } + + public String getEvidence() { + return _evidence; + } + + public PropertiesMap getProperties() { + return _properties; + } + + public String getRef() { + return _ref; + } + + public String getSource() { + return _source; + } + + public String getType() { + return _type; + } + + private void init() { + _desc = ""; + _type = ""; + _source = ""; + _evidence = ""; + _confidence = null; + _properties = null; + setUris( null ); + } + + @Override + public boolean isEqual( final PhylogenyData data ) { + final Annotation other = ( Annotation ) data; + return 
getDesc().equalsIgnoreCase( other.getDesc() ) && getType().equals( other.getType() ) + && getSource().equals( other.getSource() ) && getRef().equals( other.getRef() ); + } + + public void setConfidence( final Confidence confidence ) { + _confidence = confidence; + } + + public void setDesc( final String desc ) { + _desc = desc; + } + + public void setEvidence( final String evidence ) { + _evidence = evidence; + } + + public void setProperties( final PropertiesMap property ) { + _properties = property; + } + + // public void setRef( final String ref ) { + // _ref = ref; + // } + public void setSource( final String source ) { + _source = source; + } + + public void setType( final String type ) { + _type = type; + } + + @Override + public StringBuffer toNHX() { + throw new UnsupportedOperationException(); + } + + @Override + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + if ( ( getConfidence() != null ) || ( getProperties() != null ) + || ( ( getUris() != null ) && !getUris().isEmpty() ) || !ForesterUtil.isEmpty( getDesc() ) ) { + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendOpen( writer, + PhyloXmlMapping.ANNOTATION, + PhyloXmlMapping.ANNOTATION_REF_ATTR, + getRef(), + PhyloXmlMapping.ANNOTATION_EVIDENCE_ATTR, + getEvidence(), + PhyloXmlMapping.ANNOTATION_TYPE_ATTR, + getType(), + PhyloXmlMapping.ANNOTATION_SOURCE_ATTR, + getSource() ); + if ( !ForesterUtil.isEmpty( getDesc() ) ) { + PhylogenyDataUtil.appendElement( writer, PhyloXmlMapping.ANNOTATION_DESC, getDesc(), indentation ); + } + if ( getConfidence() != null ) { + getConfidence().toPhyloXML( writer, level, indentation + PhylogenyWriter.PHYLO_XML_INTENDATION_BASE ); + } + if ( getProperties() != null ) { + getProperties().toPhyloXML( writer, level, indentation ); + } + if ( getUris() != null ) { + for( final Uri uri : getUris() ) { + if ( uri != null ) { + uri.toPhyloXML( writer, level, indentation ); + } + } + } + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendClose( writer, PhyloXmlMapping.ANNOTATION ); + } + else { + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.ANNOTATION, + PhyloXmlMapping.ANNOTATION_REF_ATTR, + getRef(), + PhyloXmlMapping.ANNOTATION_EVIDENCE_ATTR, + getEvidence(), + PhyloXmlMapping.ANNOTATION_TYPE_ATTR, + getType(), + PhyloXmlMapping.ANNOTATION_SOURCE_ATTR, + getSource(), + indentation ); + } + } + + @Override + public String toString() { + return asText().toString(); + } + + @Override + public void addUri( final Uri uri ) { + if ( getUris() == null ) { + setUris( new ArrayList() ); + } + getUris().add( uri ); + } + + @Override + public Uri getUri( final int index ) { + return getUris().get( index ); + } + + @Override + public List getUris() { + return _uris; + } + + @Override + public void setUris( final List uris ) { + _uris = uris; + } + + @Override + public boolean equals( final Object o ) { + if ( this == o ) { + return true; + } + else if ( o == null ) { + return false; + } + else if ( o.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to check [" + this.getClass() + "] equality to " + o + " [" + + o.getClass() + "]" ); + } + else { + return isEqual( ( Annotation ) o ); + } + } + + @Override + public int compareTo( final Annotation o ) { + if ( equals( o ) ) { + return 0; + } + if ( getRef().equals( o.getRef() ) ) { + return getDesc().compareTo( o.getDesc() ); + } + return 
getRef().compareTo( o.getRef() ); + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/BinaryCharacters.java b/forester/java/src/org/forester/phylogeny/data/BinaryCharacters.java new file mode 100644 index 0000000..4b3bdd1 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/BinaryCharacters.java @@ -0,0 +1,319 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; +import java.util.Iterator; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.util.ForesterUtil; + +public class BinaryCharacters implements PhylogenyData { + + public final static int COUNT_DEFAULT = -1; + private final SortedSet _present; + private final SortedSet _gained; + private final SortedSet _lost; + private final int _present_count; + private final int _gained_count; + private final int _lost_count; + private String _type; + + public BinaryCharacters() { + _present = new TreeSet(); + _gained = new TreeSet(); + _lost = new TreeSet(); + _present_count = COUNT_DEFAULT; + _gained_count = COUNT_DEFAULT; + _lost_count = COUNT_DEFAULT; + } + + public BinaryCharacters( final SortedSet present_characters, + final SortedSet gained_characters, + final SortedSet lost_characters, + final String type ) { + _present = present_characters; + _gained = gained_characters; + _lost = lost_characters; + _type = type; + _present_count = COUNT_DEFAULT; + _gained_count = COUNT_DEFAULT; + _lost_count = COUNT_DEFAULT; + } + + public BinaryCharacters( final SortedSet present_characters, + final SortedSet gained_characters, + final SortedSet lost_characters, + final String type, + final int present_count, + final int gained_count, + final int lost_count ) { + _present = present_characters; + _gained = gained_characters; + _lost = lost_characters; + _type = type; + _present_count = present_count; + _gained_count = gained_count; + _lost_count = lost_count; + validate(); + } + + private void addCharacters( final String indentation, final Writer w, final String[] present ) throws IOException { + for( final String string : present ) { + PhylogenyDataUtil.appendElement( w, PhyloXmlMapping.BINARY_CHARACTER, string, indentation ); + } + } + + public void addGainedCharacter( final String binary_character ) { + if ( getLostCharacters().contains( binary_character ) ) { + throw new 
IllegalArgumentException( "attempt to add binary character [" + binary_character + + "] to gained characters but is already listed as lost" ); + } + getGainedCharacters().add( binary_character ); + } + + public void addLostCharacter( final String binary_character ) { + if ( getPresentCharacters().contains( binary_character ) ) { + throw new IllegalArgumentException( "attempt to add binary character [" + binary_character + + "] to lost characters but is already listed as present" ); + } + if ( getGainedCharacters().contains( binary_character ) ) { + throw new IllegalArgumentException( "attempt to add binary character [" + binary_character + + "] to lost characters but is already listed as gained" ); + } + getLostCharacters().add( binary_character ); + } + + public void addPresentCharacter( final String binary_character ) { + if ( getLostCharacters().contains( binary_character ) ) { + throw new IllegalArgumentException( "attempt to add binary character [" + binary_character + + "] to present characters but is already listed as lost" ); + } + getPresentCharacters().add( binary_character ); + } + + @Override + public StringBuffer asSimpleText() { + return asText(); + } + + @Override + public StringBuffer asText() { + validate(); + final StringBuffer sb = new StringBuffer(); + sb.append( "present [" ); + sb.append( getPresentCount() ); + sb.append( "]: " ); + sb.append( getPresentCharactersAsStringBuffer() ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + sb.append( "gained [ " ); + sb.append( getGainedCount() ); + sb.append( "]: " ); + sb.append( getGainedCharactersAsStringBuffer() ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + sb.append( "lost [" ); + sb.append( getLostCount() ); + sb.append( "]: " ); + sb.append( getLostCharactersAsStringBuffer() ); + return sb; + } + + @Override + /** + * Not a deep copy. 
+ * + */ + public PhylogenyData copy() { + validate(); + return new BinaryCharacters( getPresentCharacters(), + getGainedCharacters(), + getLostCharacters(), + getType(), + getPresentCount(), + getGainedCount(), + getLostCount() ); + } + + public SortedSet getGainedCharacters() { + return _gained; + } + + public String[] getGainedCharactersAsStringArray() { + return sortedSetToStringArray( getGainedCharacters() ); + } + + public StringBuffer getGainedCharactersAsStringBuffer() { + return sortedSetToStringBuffer( getGainedCharacters(), " " ); + } + + public int getGainedCount() { + return _gained_count; + } + + public SortedSet getLostCharacters() { + return _lost; + } + + public String[] getLostCharactersAsStringArray() { + return sortedSetToStringArray( getLostCharacters() ); + } + + public StringBuffer getLostCharactersAsStringBuffer() { + return sortedSetToStringBuffer( getLostCharacters(), " " ); + } + + public int getLostCount() { + return _lost_count; + } + + public SortedSet getPresentCharacters() { + return _present; + } + + public String[] getPresentCharactersAsStringArray() { + return sortedSetToStringArray( getPresentCharacters() ); + } + + public StringBuffer getPresentCharactersAsStringBuffer() { + return sortedSetToStringBuffer( getPresentCharacters(), " " ); + } + + public int getPresentCount() { + return _present_count; + } + + public String getType() { + return _type; + } + + @Override + public boolean isEqual( final PhylogenyData data ) { + throw new UnsupportedOperationException(); + } + + public void setType( final String type ) { + _type = type; + } + + @Override + public StringBuffer toNHX() { + throw new UnsupportedOperationException(); + } + + @Override + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + validate(); + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendOpen( writer, + PhyloXmlMapping.BINARY_CHARACTERS, + PhyloXmlMapping.BINARY_CHARACTERS_TYPE_ATTR, + getType(), + PhyloXmlMapping.BINARY_CHARACTERS_GAINED_COUNT_ATTR, + getGainedCount() != COUNT_DEFAULT ? String.valueOf( getGainedCount() ) : "", + PhyloXmlMapping.BINARY_CHARACTERS_LOST_COUNT_ATTR, + getLostCount() != COUNT_DEFAULT ? String.valueOf( getLostCount() ) : "", + PhyloXmlMapping.BINARY_CHARACTERS_PRESENT_COUNT_ATTR, + getPresentCount() != COUNT_DEFAULT ? 
String.valueOf( getPresentCount() ) : "" ); + final String my_ind = indentation + PhylogenyWriter.PHYLO_XML_INTENDATION_BASE; + if ( getGainedCharacters().size() > 0 ) { + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( my_ind ); + PhylogenyDataUtil.appendOpen( writer, PhyloXmlMapping.BINARY_CHARACTERS_GAINED ); + addCharacters( my_ind, writer, getGainedCharactersAsStringArray() ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( my_ind ); + PhylogenyDataUtil.appendClose( writer, PhyloXmlMapping.BINARY_CHARACTERS_GAINED ); + } + if ( getLostCharacters().size() > 0 ) { + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( my_ind ); + PhylogenyDataUtil.appendOpen( writer, PhyloXmlMapping.BINARY_CHARACTERS_LOST ); + addCharacters( my_ind, writer, getLostCharactersAsStringArray() ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( my_ind ); + PhylogenyDataUtil.appendClose( writer, PhyloXmlMapping.BINARY_CHARACTERS_LOST ); + } + if ( getPresentCharacters().size() > 0 ) { + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( my_ind ); + PhylogenyDataUtil.appendOpen( writer, PhyloXmlMapping.BINARY_CHARACTERS_PRESENT ); + addCharacters( my_ind, writer, getPresentCharactersAsStringArray() ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( my_ind ); + PhylogenyDataUtil.appendClose( writer, PhyloXmlMapping.BINARY_CHARACTERS_PRESENT ); + } + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendClose( writer, PhyloXmlMapping.BINARY_CHARACTERS ); + } + + @Override + public String toString() { + return asText().toString(); + } + + private void validate() { + if ( ( getPresentCount() != COUNT_DEFAULT ) && ( getPresentCharacters().size() > 0 ) + && ( getPresentCount() != getPresentCharacters().size() ) ) { + throw new RuntimeException( "present characters size and count are unequal" ); + } + if ( ( getGainedCount() != COUNT_DEFAULT ) && ( getGainedCharacters().size() > 0 ) + && ( getGainedCount() != getGainedCharacters().size() ) ) { + throw new RuntimeException( "gained characters size and count are unequal" ); + } + if ( ( getLostCount() != COUNT_DEFAULT ) && ( getLostCharacters().size() > 0 ) + && ( getLostCount() != getLostCharacters().size() ) ) { + throw new RuntimeException( "lost characters size and count are unequal" ); + } + } + + private static String[] sortedSetToStringArray( final SortedSet set ) { + final String[] chars = new String[ set.size() ]; + final Iterator it = set.iterator(); + int i = 0; + while ( it.hasNext() ) { + chars[ i++ ] = it.next(); + } + return chars; + } + + private static StringBuffer sortedSetToStringBuffer( final SortedSet set, final String separator ) { + final StringBuffer sb = new StringBuffer(); + final Iterator it = set.iterator(); + while ( it.hasNext() ) { + sb.append( it.next() ); + if ( it.hasNext() ) { + sb.append( separator ); + } + } + return sb; + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/BranchColor.java b/forester/java/src/org/forester/phylogeny/data/BranchColor.java new file mode 100644 index 0000000..84a356b --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/BranchColor.java @@ -0,0 +1,111 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. 
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.awt.Color; +import java.io.IOException; +import java.io.Writer; + +import org.forester.io.parsers.nhx.NHXtags; +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.util.ForesterUtil; + +public class BranchColor implements PhylogenyData { + + private Color _color; + + public BranchColor() { + _color = null; + } + + public BranchColor( final Color color ) { + _color = color; + } + + @Override + public StringBuffer asSimpleText() { + return new StringBuffer( getValue().toString() ); + } + + @Override + public StringBuffer asText() { + return new StringBuffer( getValue().toString() ); + } + + @Override + /** + * Not a deep copy. + * + */ + public PhylogenyData copy() { + final BranchColor bc = new BranchColor(); + bc.setValue( getValue() ); + return bc; + } + + public Color getValue() { + return _color; + } + + @Override + public boolean isEqual( final PhylogenyData data ) { + return getValue().equals( ( ( BranchColor ) data ).getValue() ); + } + + public void setValue( final Color color ) { + _color = color; + } + + @Override + public StringBuffer toNHX() { + final StringBuffer sb = new StringBuffer(); + sb.append( NHXtags.COLOR ); + sb.append( getValue().getRed() ); + sb.append( "." ); + sb.append( getValue().getGreen() ); + sb.append( "." 
); + sb.append( getValue().getBlue() ); + return sb; + } + + @Override + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendOpen( writer, PhyloXmlMapping.COLOR ); + PhylogenyDataUtil.appendElement( writer, PhyloXmlMapping.COLOR_RED, getValue().getRed() + "", indentation ); + PhylogenyDataUtil.appendElement( writer, PhyloXmlMapping.COLOR_GREEN, getValue().getGreen() + "", indentation ); + PhylogenyDataUtil.appendElement( writer, PhyloXmlMapping.COLOR_BLUE, getValue().getBlue() + "", indentation ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendClose( writer, PhyloXmlMapping.COLOR ); + } + + @Override + public String toString() { + return asText().toString(); + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/BranchData.java b/forester/java/src/org/forester/phylogeny/data/BranchData.java new file mode 100644 index 0000000..34a9dc0 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/BranchData.java @@ -0,0 +1,156 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.List; + +public class BranchData implements PhylogenyData { + + private BranchColor _branch_color; + private List _confidences; + private BranchWidth _branch_width; + + public BranchData() { + // Doing nothing. 
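+ // The confidence list is not allocated here; getConfidences() creates it lazily on first access.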
+ } + + public void addConfidence( final Confidence confidence ) { + getConfidences().add( confidence ); + } + + @Override + public StringBuffer asSimpleText() { + throw new UnsupportedOperationException(); + } + + @Override + public StringBuffer asText() { + throw new UnsupportedOperationException(); + } + + @Override + public PhylogenyData copy() { + final BranchData new_bd = new BranchData(); + if ( isHasBranchColor() ) { + new_bd.setBranchColor( ( BranchColor ) getBranchColor().copy() ); + } + if ( isHasBranchWidth() ) { + new_bd.setBranchWidth( ( BranchWidth ) getBranchWidth().copy() ); + } + if ( isHasConfidences() ) { + for( final Confidence confidence : getConfidences() ) { + new_bd.addConfidence( ( Confidence ) confidence.copy() ); + } + } + return new_bd; + } + + public BranchColor getBranchColor() { + return _branch_color; + } + + public BranchWidth getBranchWidth() { + return _branch_width; + } + + public Confidence getConfidence( final int index ) { + return getConfidences().get( index ); + } + + public List getConfidences() { + if ( _confidences == null ) { + _confidences = new ArrayList(); + } + return _confidences; + } + + public int getNumberOfConfidences() { + return getConfidences().size(); + } + + @Override + public boolean isEqual( final PhylogenyData data ) { + throw new UnsupportedOperationException(); + } + + public boolean isHasBranchColor() { + return getBranchColor() != null; + } + + public boolean isHasBranchWidth() { + return getBranchWidth() != null; + } + + public boolean isHasConfidences() { + return getNumberOfConfidences() > 0; + } + + public void setBranchColor( final BranchColor branch_color ) { + _branch_color = branch_color; + } + + public void setBranchWidth( final BranchWidth branch_width ) { + _branch_width = branch_width; + } + + @Override + public StringBuffer toNHX() { + final StringBuffer sb = new StringBuffer(); + if ( isHasConfidences() && ( getConfidence( 0 ).getValue() != Confidence.CONFIDENCE_DEFAULT_VALUE ) ) { + sb.append( ":" ); + sb.append( getConfidence( 0 ).toNHX() ); + } + if ( isHasBranchWidth() && ( getBranchWidth().getValue() != BranchWidth.BRANCH_WIDTH_DEFAULT_VALUE ) ) { + sb.append( ":" ); + sb.append( getBranchWidth().toNHX() ); + } + if ( isHasBranchColor() && ( getBranchColor().getValue() != null ) ) { + sb.append( ":" ); + sb.append( getBranchColor().toNHX() ); + } + return sb; + } + + @Override + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + if ( isHasConfidences() ) { + for( final Confidence confidence : getConfidences() ) { + confidence.toPhyloXML( writer, level, indentation ); + } + } + if ( isHasBranchWidth() ) { + getBranchWidth().toPhyloXML( writer, level, indentation ); + } + if ( isHasBranchColor() ) { + getBranchColor().toPhyloXML( writer, level, indentation ); + } + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/BranchWidth.java b/forester/java/src/org/forester/phylogeny/data/BranchWidth.java new file mode 100644 index 0000000..803fbda --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/BranchWidth.java @@ -0,0 +1,91 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. 
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; + +import org.forester.io.parsers.nhx.NHXtags; +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.util.ForesterUtil; + +public class BranchWidth implements PhylogenyData { + + public final static double BRANCH_WIDTH_DEFAULT_VALUE = 1.0; + private final double _value; + + public BranchWidth() { + _value = BRANCH_WIDTH_DEFAULT_VALUE; + } + + public BranchWidth( final double value ) { + _value = value; + } + + @Override + public StringBuffer asSimpleText() { + return new StringBuffer( getValue() + "" ); + } + + @Override + public StringBuffer asText() { + return asSimpleText(); + } + + @Override + public PhylogenyData copy() { + return new BranchWidth( getValue() ); + } + + public double getValue() { + return _value; + } + + @Override + public boolean isEqual( final PhylogenyData data ) { + return getValue() == ( ( BranchWidth ) data ).getValue(); + } + + @Override + public StringBuffer toNHX() { + final StringBuffer sb = new StringBuffer(); + sb.append( NHXtags.PARENT_BRANCH_WIDTH ); + sb.append( getValue() ); + return sb; + } + + @Override + public void toPhyloXML( final Writer w, final int level, final String indentation ) throws IOException { + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( indentation ); + PhylogenyDataUtil.appendElement( w, PhyloXmlMapping.WIDTH, getValue() + "" ); + } + + @Override + public String toString() { + return asText().toString(); + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/Confidence.java b/forester/java/src/org/forester/phylogeny/data/Confidence.java new file mode 100644 index 0000000..dbb9d07 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/Confidence.java @@ -0,0 +1,142 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; + +import org.forester.io.parsers.nhx.NHXtags; +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.PhyloXmlUtil; +import org.forester.util.ForesterUtil; + +public class Confidence implements PhylogenyData, Comparable { + + public final static double CONFIDENCE_DEFAULT_VALUE = -9999.0; + private double _value; + private String _type; + + public Confidence() { + init(); + } + + public Confidence( final double value, final String type ) { + setValue( value ); + setType( type ); + } + + public StringBuffer asSimpleText() { + return new StringBuffer().append( ForesterUtil.FORMATTER_6.format( getValue() ) ); + } + + public StringBuffer asText() { + final StringBuffer sb = new StringBuffer(); + if ( !ForesterUtil.isEmpty( getType() ) ) { + sb.append( "[" ); + sb.append( getType() ); + sb.append( "] " ); + } + sb.append( ForesterUtil.FORMATTER_6.format( getValue() ) ); + return sb; + } + + @Override + public int compareTo( final Confidence confidence ) { + if ( this == confidence ) { + return 0; + } + return getType().compareToIgnoreCase( confidence.getType() ); + } + + public PhylogenyData copy() { + return new Confidence( getValue(), getType() ); + } + + public String getType() { + return _type; + } + + public double getValue() { + return _value; + } + + public void init() { + setValue( CONFIDENCE_DEFAULT_VALUE ); + setType( "" ); + } + + public boolean isEqual( final PhylogenyData confidence ) { + if ( confidence == null ) { + return false; + } + if ( !( confidence instanceof Confidence ) ) { + return false; + } + final Confidence s = ( Confidence ) confidence; + if ( s.getValue() != getValue() ) { + return false; + } + if ( !s.getType().equals( getType() ) ) { + return false; + } + return true; + } + + public void setType( final String type ) { + _type = type; + } + + public void setValue( final double value ) { + _value = value; + } + + public StringBuffer toNHX() { + final StringBuffer sb = new StringBuffer(); + sb.append( NHXtags.SUPPORT ); + sb.append( getValue() ); + return sb; + } + + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + if ( getValue() == CONFIDENCE_DEFAULT_VALUE ) { + return; + } + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.CONFIDENCE, + String.valueOf( ForesterUtil + .round( getValue(), + PhyloXmlUtil.ROUNDING_DIGITS_FOR_PHYLOXML_DOUBLE_OUTPUT ) ), + PhyloXmlMapping.CONFIDENCE_TYPE_ATTR, + ForesterUtil.isEmpty( getType() ) ? "unknown" : getType() ); + } + + @Override + public String toString() { + return asText().toString(); + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/Date.java b/forester/java/src/org/forester/phylogeny/data/Date.java new file mode 100644 index 0000000..6e2b814 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/Date.java @@ -0,0 +1,188 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. 
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; +import java.math.BigDecimal; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.util.ForesterUtil; + +public class Date implements PhylogenyData { + + private String _desc; + private BigDecimal _value; + private BigDecimal _min; + private BigDecimal _max; + private String _unit; + + public Date() { + _desc = ""; + _value = null; + _min = null; + _max = null; + _unit = ""; + } + + public Date( final String desc ) { + if ( desc == null ) { + throw new IllegalArgumentException( "illegally empty or null fields in constructor" ); + } + _desc = desc; + _value = null; + _min = null; + _max = null; + _unit = ""; + } + + public Date( final String desc, + final BigDecimal value, + final BigDecimal min, + final BigDecimal max, + final String unit ) { + if ( ( desc == null ) || ( unit == null ) ) { + throw new IllegalArgumentException( "illegally empty or null fields in constructor" ); + } + _desc = desc; + _value = value; + _min = min; + _max = max; + _unit = unit; + } + + @Override + public StringBuffer asSimpleText() { + if ( getValue() != null ) { + return new StringBuffer( getDesc() + " [" + getValue().toPlainString() + " " + getUnit() + "]" ); + } + else { + return new StringBuffer( getDesc() ); + } + } + + @Override + public StringBuffer asText() { + return asSimpleText(); + } + + @Override + public PhylogenyData copy() { + return new Date( getDesc(), + getValue() == null ? null : new BigDecimal( getValue().toPlainString() ), + getMin() == null ? null : new BigDecimal( getMin().toPlainString() ), + getMax() == null ?
null : new BigDecimal( getMax().toPlainString() ), + getUnit() ); + } + + public String getDesc() { + return _desc; + } + + public BigDecimal getMax() { + return _max; + } + + public BigDecimal getMin() { + return _min; + } + + public String getUnit() { + return _unit; + } + + public BigDecimal getValue() { + return _value; + } + + @Override + public boolean isEqual( final PhylogenyData data ) { + throw new UnsupportedOperationException(); + } + + public void setDesc( final String desc ) { + _desc = desc; + } + + public void setMax( final BigDecimal max ) { + _max = max; + } + + public void setMin( final BigDecimal min ) { + _min = min; + } + + public void setUnit( final String unit ) { + _unit = unit; + } + + public void setValue( final BigDecimal value ) { + _value = value; + } + + @Override + public StringBuffer toNHX() { + throw new UnsupportedOperationException(); + } + + @Override + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendOpen( writer, PhyloXmlMapping.CLADE_DATE, PhyloXmlMapping.CLADE_DATE_UNIT, getUnit() ); + if ( !ForesterUtil.isEmpty( getDesc() ) ) { + PhylogenyDataUtil.appendElement( writer, PhyloXmlMapping.CLADE_DATE_DESC, getDesc(), indentation ); + } + if ( getValue() != null ) { + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.CLADE_DATE_VALUE, + getValue().toPlainString(), + indentation ); + } + if ( getMin() != null ) { + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.CLADE_DATE_MIN, + getMin().toPlainString(), + indentation ); + } + if ( getMax() != null ) { + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.CLADE_DATE_MAX, + getMax().toPlainString(), + indentation ); + } + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendClose( writer, PhyloXmlMapping.CLADE_DATE ); + } + + @Override + public String toString() { + return asSimpleText().toString(); + } +} \ No newline at end of file diff --git a/forester/java/src/org/forester/phylogeny/data/Distribution.java b/forester/java/src/org/forester/phylogeny/data/Distribution.java new file mode 100644 index 0000000..9a6ebdb --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/Distribution.java @@ -0,0 +1,182 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.List; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.util.ForesterUtil; + +public class Distribution implements PhylogenyData { + + private final String _desc; + private final List _points; + private final List _polygons; + + public Distribution( final String desc ) { + _desc = desc; + _points = null; + _polygons = null; + } + + public Distribution( final String desc, final List points ) { + _desc = null; + _points = points; + _polygons = null; + } + + public Distribution( final String desc, final List points, final List polygons ) { + _desc = desc; + _points = points; + _polygons = polygons; + } + + public boolean isEmpty() { + return ForesterUtil.isEmpty( _desc ) && ForesterUtil.isEmpty( _points ) && ForesterUtil.isEmpty( _polygons ); + } + + @Override + public StringBuffer asSimpleText() { + final StringBuffer sb = new StringBuffer(); + if ( isEmpty() ) { + return sb; + } + sb.append( "Distribution: " ); + if ( !ForesterUtil.isEmpty( getDesc() ) ) { + sb.append( ForesterUtil.LINE_SEPARATOR ); + sb.append( " Description: " ); + sb.append( getDesc() ); + } + int i = 0; + if ( getPoints() != null ) { + for( final Point point : getPoints() ) { + if ( point != null ) { + sb.append( ForesterUtil.LINE_SEPARATOR ); + sb.append( " Point " + i + ": " ); + sb.append( point.asSimpleText() ); + i++; + } + } + } + i = 0; + if ( getPolygons() != null ) { + for( final Polygon polygon : getPolygons() ) { + if ( polygon != null ) { + sb.append( ForesterUtil.LINE_SEPARATOR ); + sb.append( " Polygon " + i + ":" ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + sb.append( polygon.asSimpleText() ); + i++; + } + } + } + return sb; + } + + @Override + public StringBuffer asText() { + return asSimpleText(); + } + + @Override + public PhylogenyData copy() { + List new_points = null; + List new_polygons = null; + if ( getPoints() != null ) { + new_points = new ArrayList(); + for( final Point point : getPoints() ) { + new_points.add( ( Point ) point.copy() ); + } + } + if ( getPolygons() != null ) { + new_polygons = new ArrayList(); + for( final Polygon polygon : getPolygons() ) { + new_polygons.add( ( Polygon ) polygon.copy() ); + } + } + return new Distribution( getDesc(), new_points, new_polygons ); + } + + public String getDesc() { + return _desc; + } + + public List getPoints() { + return _points; + } + + public List getPolygons() { + return _polygons; + } + + @Override + public boolean isEqual( final PhylogenyData data ) { + throw new UnsupportedOperationException(); + } + + @Override + public StringBuffer toNHX() { + throw new UnsupportedOperationException(); + } + + @Override + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + if ( isEmpty() ) { + return; + } + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendOpen( writer, PhyloXmlMapping.DISTRIBUTION ); + if ( !ForesterUtil.isEmpty( getDesc() ) ) { + PhylogenyDataUtil.appendElement( writer, PhyloXmlMapping.DISTRIBUTION_DESC, getDesc(), indentation ); + } + final String ind = indentation + PhylogenyWriter.PHYLO_XML_INTENDATION_BASE; + if ( getPoints() != null ) { + for( final Point point : getPoints() ) { + point.toPhyloXML( writer, level, ind ); + } + } + if ( getPolygons() != null ) 
{ + for( final Polygon polygon : getPolygons() ) { + polygon.toPhyloXML( writer, level, ind ); + } + } + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendClose( writer, PhyloXmlMapping.DISTRIBUTION ); + } + + @Override + public String toString() { + return asSimpleText().toString(); + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/DomainArchitecture.java b/forester/java/src/org/forester/phylogeny/data/DomainArchitecture.java new file mode 100644 index 0000000..e19c2ef --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/DomainArchitecture.java @@ -0,0 +1,221 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.List; +import java.util.SortedMap; +import java.util.StringTokenizer; +import java.util.TreeMap; + +import org.forester.io.parsers.nhx.NHXtags; +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.util.ForesterUtil; + +public class DomainArchitecture implements PhylogenyData { + + public final static String NHX_SEPARATOR = ">"; + private static final double INCREASE_KEY = 0.0001; + private SortedMap _domains; + private int _total_length; + + public DomainArchitecture() { + init(); + } + + public DomainArchitecture( final List domains, final int total_length ) { + init(); + for( final PhylogenyData phylogenyData : domains ) { + final ProteinDomain pd = ( ProteinDomain ) phylogenyData; + addDomain( pd ); + } + _total_length = total_length; + } + + public DomainArchitecture( final String da_str ) { + init(); + int total_length = 0; + int to = -1; + try { + final StringTokenizer st = new StringTokenizer( da_str, DomainArchitecture.NHX_SEPARATOR ); + final String length_str = ( String ) st.nextElement(); + total_length = new Integer( length_str ).intValue(); + while ( st.hasMoreElements() ) { + final String from_str = ( String ) st.nextElement(); + final String to_str = ( String ) st.nextElement(); + final String support_str = ( String ) st.nextElement(); + final String name = ( String ) st.nextElement(); + to = new Integer( to_str ).intValue(); + final int from = new Integer( from_str ).intValue(); + final double support = new Double( support_str ).doubleValue(); + final ProteinDomain pd = new ProteinDomain( name, from, to, support ); + addDomain( pd ); + } + } + catch ( 
final Exception e ) { + throw new IllegalArgumentException( "Malformed format for domain structure \"" + da_str + "\": " + + e.getMessage() ); + } + if ( to > total_length ) { + throw new IllegalArgumentException( "total length of domain structure is too short" ); + } + _total_length = total_length; + } + + public void addDomain( final ProteinDomain pd ) { + Double key = new Double( pd.getFrom() ); + while ( _domains.containsKey( key ) ) { + key = new Double( key.doubleValue() + DomainArchitecture.INCREASE_KEY ); + } + _domains.put( key, pd ); + } + + public StringBuffer asSimpleText() { + final StringBuffer sb = new StringBuffer(); + for( int i = 0; i < getDomains().size(); ++i ) { + if ( i > 0 ) { + sb.append( "~" ); + } + sb.append( getDomain( i ).asSimpleText() ); + } + return sb; + } + + public StringBuffer asText() { + final StringBuffer sb = new StringBuffer(); + for( int i = 0; i < getDomains().size(); ++i ) { + if ( i > 0 ) { + sb.append( "~" ); + } + sb.append( getDomain( i ).asText() ); + } + return sb; + } + + public PhylogenyData copy() { + final List domains = new ArrayList( getDomains().size() ); + for( int i = 0; i < getDomains().size(); ++i ) { + domains.add( getDomain( i ).copy() ); + } + return new DomainArchitecture( domains, getTotalLength() ); + } + + public ProteinDomain getDomain( final int i ) { + return ( ProteinDomain ) _domains.values().toArray()[ i ]; + } + + public SortedMap getDomains() { + return _domains; + } + + public int getNumberOfDomains() { + return _domains.size(); + } + + public int getTotalLength() { + return _total_length; + } + + private void init() { + _domains = new TreeMap(); + _total_length = 0; + } + + /** + * Returns true if the names and the order of the domains match (domain and + * linker lengths are ignored). 
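+ * For example (illustrative values only), isEqual() returns true for the
+ * architectures parsed from "300>10>90>0.9>A>120>200>0.9>B" and
+ * "250>5>80>0.5>A>100>180>0.5>B", because both list the domains A and B in
+ * the same order, even though their total lengths, coordinates, and
+ * confidences differ.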
+ * + * + */ + public boolean isEqual( final PhylogenyData domain_architecture ) { + if ( domain_architecture == null ) { + return false; + } + if ( !( domain_architecture instanceof DomainArchitecture ) ) { + return false; + } + final DomainArchitecture d = ( DomainArchitecture ) domain_architecture; + if ( getDomains().size() != d.getDomains().size() ) { + return false; + } + for( int i = 0; i < getDomains().size(); ++i ) { + if ( !getDomain( i ).getName().equals( d.getDomain( i ).getName() ) ) { + return false; + } + } + return true; + } + + public void setTotalLength( final int total_length ) { + _total_length = total_length; + } + + public StringBuffer toNHX() { + final StringBuffer sb = new StringBuffer(); + sb.append( ":" ); + sb.append( NHXtags.DOMAIN_STRUCTURE ); + sb.append( getTotalLength() ); + if ( getDomains() != null ) { + for( int i = 0; i < getDomains().size(); ++i ) { + sb.append( DomainArchitecture.NHX_SEPARATOR ); + sb.append( getDomain( i ).getFrom() ); + sb.append( DomainArchitecture.NHX_SEPARATOR ); + sb.append( getDomain( i ).getTo() ); + sb.append( DomainArchitecture.NHX_SEPARATOR ); + sb.append( getDomain( i ).getConfidence() ); + sb.append( DomainArchitecture.NHX_SEPARATOR ); + sb.append( ForesterUtil.replaceIllegalNhxCharacters( getDomain( i ).getName() ) ); + } + } + return sb; + } + + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendOpen( writer, + PhyloXmlMapping.SEQUENCE_DOMAIN_ARCHITECURE, + PhyloXmlMapping.SEQUENCE_DOMAIN_ARCHITECTURE_LENGTH, + getTotalLength() + "" ); + if ( getDomains() != null ) { + final String ind = indentation + PhylogenyWriter.PHYLO_XML_INTENDATION_BASE; + for( int i = 0; i < getDomains().size(); ++i ) { + getDomain( i ).toPhyloXML( writer, level, ind ); + } + } + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendClose( writer, PhyloXmlMapping.SEQUENCE_DOMAIN_ARCHITECURE ); + } + + @Override + public String toString() { + return asText().toString(); + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/Event.java b/forester/java/src/org/forester/phylogeny/data/Event.java new file mode 100644 index 0000000..22dacc9 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/Event.java @@ -0,0 +1,376 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; +import java.util.StringTokenizer; + +import org.forester.io.parsers.nhx.NHXFormatException; +import org.forester.io.parsers.nhx.NHXtags; +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.util.ForesterUtil; + +public class Event implements PhylogenyData { + + public final static int DEFAULT_VALUE = -1; + private static final String NHX_SEPARATOR = ">"; + private int _duplications; + private int _speciations; + private int _gene_losses; + private EventType _event_type; + private Confidence _confidence; + + public Event() { + _duplications = DEFAULT_VALUE; + _speciations = DEFAULT_VALUE; + _gene_losses = DEFAULT_VALUE; + _event_type = EventType.unassigned; + } + + public Event( final EventType type ) { + _duplications = DEFAULT_VALUE; + _speciations = DEFAULT_VALUE; + _gene_losses = DEFAULT_VALUE; + _event_type = type; + } + + public Event( final int duplications, final int speciations, final int gene_losses ) { + _duplications = duplications; + _speciations = speciations; + _gene_losses = gene_losses; + _event_type = EventType.mixed; + } + + public Event( final int duplications, final int speciations, final int gene_losses, final String type ) { + _duplications = duplications; + _speciations = speciations; + _gene_losses = gene_losses; + _event_type = EventType.valueOf( type ); + } + + public Event( final String nhx ) throws NHXFormatException { + if ( ForesterUtil.isEmpty( nhx ) ) { + _duplications = DEFAULT_VALUE; + _speciations = DEFAULT_VALUE; + _gene_losses = DEFAULT_VALUE; + _event_type = EventType.unassigned; + } + else { + final StringTokenizer st = new StringTokenizer( nhx, NHX_SEPARATOR ); + if ( st.countTokens() != 4 ) { + throw new NHXFormatException( "malformed NHX format for event [" + nhx + "]" ); + } + final String duplications = ( String ) st.nextElement(); + final String speciations = ( String ) st.nextElement(); + final String losses = ( String ) st.nextElement(); + final String event_type = ( String ) st.nextElement(); + int d = 0; + int s = 0; + int l = 0; + try { + d = Integer.parseInt( duplications ); + s = Integer.parseInt( speciations ); + l = Integer.parseInt( losses ); + _duplications = d; + _speciations = s; + _gene_losses = l; + _event_type = EventType.valueOf( event_type ); + } + catch ( final Exception e ) { + throw new NHXFormatException( "malformed NHX format for event [" + nhx + "]:" + e.getMessage() ); + } + } + } + + public StringBuffer asSimpleText() { + final StringBuffer sb = new StringBuffer(); + if ( isUnassigned() ) { + } + else if ( isSpeciationOrDuplication() ) { + sb.append( "?" 
); + } + else if ( isOther() || isRoot() || isTransfer() || isFusion() ) { + sb.append( getEventType().toString() ); + } + else { + if ( getNumberOfDuplications() > 0 ) { + if ( getNumberOfDuplications() > 1 ) { + sb.append( getNumberOfDuplications() ); + } + sb.append( "D" ); + } + if ( getNumberOfSpeciations() > 0 ) { + if ( getNumberOfSpeciations() > 1 ) { + sb.append( getNumberOfSpeciations() ); + } + sb.append( "S" ); + } + if ( getNumberOfGeneLosses() > 0 ) { + if ( getNumberOfGeneLosses() > 1 ) { + sb.append( getNumberOfGeneLosses() ); + } + sb.append( "L" ); + } + } + return sb; + } + + public StringBuffer asText() { + final StringBuffer sb = new StringBuffer(); + if ( isUnassigned() || isSpeciationOrDuplication() || isOther() || isRoot() || isTransfer() || isFusion() ) { + sb.append( getEventType().toString() ); + } + else { + if ( isDuplication() ) { + if ( getNumberOfDuplications() == 1 ) { + sb.append( "duplication" ); + } + else { + sb.append( "duplications [" + getNumberOfDuplications() + "]" ); + } + } + else if ( isSpeciation() ) { + if ( getNumberOfSpeciations() == 1 ) { + sb.append( "speciation" ); + } + else { + sb.append( "speciations [" + getNumberOfSpeciations() + "]" ); + } + } + else if ( isGeneLoss() ) { + if ( getNumberOfGeneLosses() == 1 ) { + sb.append( "gene-loss" ); + } + else { + sb.append( "gene-losses [" + getNumberOfGeneLosses() + "]" ); + } + } + else { + sb.append( "duplications [" + getNumberOfDuplications() + "] " ); + sb.append( "speciations [" + getNumberOfSpeciations() + "] " ); + sb.append( "gene-losses [" + getNumberOfGeneLosses() + "]" ); + } + } + return sb; + } + + public PhylogenyData copy() { + if ( isUnassigned() ) { + return new Event(); + } + else if ( _event_type != EventType.mixed ) { + return new Event( _event_type ); + } + else { + return new Event( _duplications, _speciations, _gene_losses ); + } + } + + public Confidence getConfidence() { + return _confidence; + } + + public EventType getEventType() { + return _event_type; + } + + public int getNumberOfDuplications() { + return _duplications; + } + + public int getNumberOfGeneLosses() { + return _gene_losses; + } + + public int getNumberOfSpeciations() { + return _speciations; + } + + /** + * Returns true if this event contains one or more duplications events only + * + * @return true if this event contains one or more duplications events only + */ + public boolean isDuplication() { + return ( _duplications > 0 ) && ( _gene_losses < 1 ) && ( _speciations < 1 ); + } + + public boolean isEqual( final PhylogenyData event ) { + if ( ( event == null ) || !( event instanceof Event ) ) { + return false; + } + final Event e = ( Event ) event; + if ( getEventType().compareTo( e.getEventType() ) != 0 ) { + return false; + } + if ( getNumberOfDuplications() != e.getNumberOfDuplications() ) { + return false; + } + if ( getNumberOfSpeciations() != e.getNumberOfSpeciations() ) { + return false; + } + if ( getNumberOfGeneLosses() != e.getNumberOfGeneLosses() ) { + return false; + } + return true; + } + + public boolean isFusion() { + return _event_type == EventType.fusion; + } + + /** + * Returns true if this event contains one or more gene loss events only + * + * @return true if this event contains one or more gene loss events only + */ + public boolean isGeneLoss() { + return ( _duplications < 1 ) && ( _gene_losses > 0 ) && ( _speciations < 1 ); + } + + public boolean isOther() { + return _event_type == EventType.other; + } + + public boolean isRoot() { + return _event_type == EventType.root; + 
} + + /** + * Returns true if this event contains one or more speciation events only + * + * @return true if this event contains one or more speciation events only + */ + public boolean isSpeciation() { + return ( _duplications < 1 ) && ( _gene_losses < 1 ) && ( _speciations > 0 ); + } + + public boolean isSpeciationOrDuplication() { + return _event_type == EventType.speciation_or_duplication; + } + + public boolean isTransfer() { + return _event_type == EventType.transfer; + } + + public boolean isUnassigned() { + return ( _duplications == DEFAULT_VALUE ) && ( _event_type == EventType.unassigned ); + } + + public void setConfidence( final Confidence confidence ) { + _confidence = confidence; + } + + public void setDuplications( final int duplications ) { + _duplications = duplications; + _event_type = EventType.mixed; + } + + public void setGeneLosses( final int gene_losses ) { + _gene_losses = gene_losses; + _event_type = EventType.mixed; + } + + public void setSpeciations( final int speciations ) { + _speciations = speciations; + _event_type = EventType.mixed; + } + + public StringBuffer toNHX() { + final StringBuffer sb = new StringBuffer(); + if ( !isUnassigned() && ( isSpeciationOrDuplication() || isDuplication() || isSpeciation() ) ) { + sb.append( ":" ); + sb.append( NHXtags.IS_DUPLICATION ); + if ( isSpeciationOrDuplication() ) { + sb.append( "?" ); + } + else if ( isDuplication() ) { + sb.append( "Y" ); + } + else if ( isSpeciation() ) { + sb.append( "N" ); + } + } + return sb; + } + + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendOpen( writer, PhyloXmlMapping.EVENTS ); + if ( ( getEventType() != EventType.unassigned ) && ( getEventType() != EventType.mixed ) ) { + PhylogenyDataUtil + .appendElement( writer, PhyloXmlMapping.EVENT_TYPE, getEventType().toString(), indentation ); + } + if ( getNumberOfDuplications() > 0 ) { + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.EVENT_DUPLICATIONS, + getNumberOfDuplications() + "", + indentation ); + } + if ( getNumberOfSpeciations() > 0 ) { + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.EVENT_SPECIATIONS, + getNumberOfSpeciations() + "", + indentation ); + } + if ( getNumberOfGeneLosses() > 0 ) { + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.EVENT_LOSSES, + getNumberOfGeneLosses() + "", + indentation ); + } + if ( getConfidence() != null ) { + getConfidence().toPhyloXML( writer, level, indentation + PhylogenyWriter.PHYLO_XML_INTENDATION_BASE ); + } + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendClose( writer, PhyloXmlMapping.EVENTS ); + } + + @Override + public String toString() { + return asText().toString(); + } + + public static Event createSingleDuplicationEvent() { + return new Event( 1, 0, 0 ); + } + + public static Event createSingleSpeciationEvent() { + return new Event( 0, 1, 0 ); + } + + public static Event createSingleSpeciationOrDuplicationEvent() { + return new Event( EventType.speciation_or_duplication ); + } + + public static enum EventType { + transfer, fusion, root, speciation_or_duplication, other, mixed, unassigned + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/Identifier.java b/forester/java/src/org/forester/phylogeny/data/Identifier.java new file mode 100644 index 0000000..33c3a8b --- /dev/null +++ 
b/forester/java/src/org/forester/phylogeny/data/Identifier.java @@ -0,0 +1,147 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; + +import org.forester.io.parsers.nhx.NHXtags; +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.util.ForesterUtil; + +public class Identifier implements PhylogenyData { + + final String _value; + final String _provider; + + public Identifier() { + _value = ""; + _provider = ""; + } + + public Identifier( final String value ) { + _value = value; + _provider = ""; + } + + public Identifier( final String value, final String provider ) { + _value = value; + _provider = provider; + } + + public StringBuffer asSimpleText() { + return new StringBuffer( getValue() ); + } + + public StringBuffer asText() { + final StringBuffer sb = new StringBuffer(); + if ( !ForesterUtil.isEmpty( getProvider() ) ) { + sb.append( "[" ); + sb.append( getProvider() ); + sb.append( "] " ); + } + sb.append( getValue() ); + return sb; + } + + public PhylogenyData copy() { + return new Identifier( getValue(), getProvider() ); + } + + @Override + public boolean equals( final Object o ) { + if ( this == o ) { + return true; + } + else if ( o == null ) { + return false; + } + else if ( o.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to check [" + this.getClass() + "] equality to " + o + " [" + + o.getClass() + "]" ); + } + else { + return isEqual( ( Identifier ) o ); + } + } + + public String getProvider() { + return _provider; + } + + public String getValue() { + return _value; + } + + @Override + public int hashCode() { + if ( getProvider() != null ) { + return ( getProvider() + getValue() ).hashCode(); + } + return getValue().hashCode(); + } + + public boolean isEqual( final PhylogenyData data ) { + if ( this == data ) { + return true; + } + if ( ( data == null ) || ( getValue() == null ) ) { + return false; + } + final Identifier a = ( Identifier ) data; + if ( ( getProvider() != null ) && ( a.getProvider() != null ) ) { + return ( a.getValue().equals( getValue() ) && a.getProvider().equals( getProvider() ) ); + } + return ( a.getValue().equals( getValue() ) ); + } + + public StringBuffer toNHX() { + final StringBuffer sb = new StringBuffer(); + sb.append( ":" ); + sb.append( NHXtags.NODE_IDENTIFIER ); + sb.append( ForesterUtil.replaceIllegalNhxCharacters( getValue() ) ); + return sb; + } + + public void 
toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + if ( !org.forester.util.ForesterUtil.isEmpty( getProvider() ) ) { + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.IDENTIFIER, + getValue(), + PhyloXmlMapping.IDENTIFIER_PROVIDER_ATTR, + getProvider(), + indentation ); + } + else { + PhylogenyDataUtil.appendElement( writer, PhyloXmlMapping.IDENTIFIER, getValue(), indentation ); + } + } + + @Override + public String toString() { + return asText().toString(); + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/MultipleUris.java b/forester/java/src/org/forester/phylogeny/data/MultipleUris.java new file mode 100644 index 0000000..cce4be5 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/MultipleUris.java @@ -0,0 +1,39 @@ +// $Id: +// forester -- software libraries and applications +// for genomics and evolutionary biology research. +// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.util.List; + +public interface MultipleUris { + + public List getUris(); + + public void setUris( final List uris ); + + public Uri getUri( final int index ); + + public void addUri( final Uri uri ); +} diff --git a/forester/java/src/org/forester/phylogeny/data/NodeData.java b/forester/java/src/org/forester/phylogeny/data/NodeData.java new file mode 100644 index 0000000..d1c75e2 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/NodeData.java @@ -0,0 +1,524 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
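+//
+// (Illustrative usage sketch; the example values and the "writer" variable are
+// assumptions, not taken from this commit. NodeData, defined below, simply
+// aggregates the per-node annotation objects of this package.)
+//
+//     final NodeData data = new NodeData();
+//     data.setNodeIdentifier( new Identifier( "Q9BYF1", "UniProtKB" ) );
+//     data.setEvent( Event.createSingleDuplicationEvent() );
+//     data.toNHX();                         // NHX fragment built from identifier, taxonomy, sequence, event, properties
+//     data.toPhyloXML( writer, 0, "    " ); // assumes an open java.io.Writer named "writer"
+//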
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.List; + +import org.forester.io.parsers.phyloxml.PhyloXmlUtil; +import org.forester.phylogeny.data.Property.AppliesTo; +import org.forester.util.ForesterUtil; + +public class NodeData implements PhylogenyData { + + private String _node_name; + private Event _event; + private List _sequences; + private Identifier _node_identifier; + private List _taxonomies; + private List _distributions; + private Date _date; + private BinaryCharacters _binary_characters; + private PropertiesMap _properties; + private List _references; + private List _vector; + + public NodeData() { + init(); + } + + private void init() { + _node_name = ""; + } + + public void addDistribution( final Distribution distribution ) { + if ( _distributions == null ) { + _distributions = new ArrayList(); + } + _distributions.add( distribution ); + } + + public void addReference( final Reference reference ) { + if ( _references == null ) { + _references = new ArrayList(); + } + _references.add( reference ); + } + + public void addSequence( final Sequence sequence ) { + if ( _sequences == null ) { + _sequences = new ArrayList(); + } + _sequences.add( sequence ); + } + + public void addTaxonomy( final Taxonomy taxonomy ) { + if ( _taxonomies == null ) { + _taxonomies = new ArrayList(); + } + _taxonomies.add( taxonomy ); + } + + public StringBuffer asSimpleText() { + throw new UnsupportedOperationException(); + } + + public StringBuffer asText() { + throw new UnsupportedOperationException(); + } + + public PhylogenyData copy() { + final NodeData new_data = new NodeData(); + new_data.setNodeName( getNodeName() ); + if ( ( getSequences() != null ) && ( getSequences().size() > 0 ) ) { + new_data.setSequences( new ArrayList() ); + for( final Sequence s : getSequences() ) { + if ( s != null ) { + new_data.addSequence( ( Sequence ) s.copy() ); + } + } + } + if ( isHasEvent() ) { + new_data.setEvent( ( Event ) getEvent().copy() ); + } + if ( isHasNodeIdentifier() ) { + new_data.setNodeIdentifier( ( Identifier ) getNodeIdentifier().copy() ); + } + if ( ( getTaxonomies() != null ) && ( getTaxonomies().size() > 0 ) ) { + new_data.setTaxonomies( new ArrayList() ); + for( final Taxonomy t : getTaxonomies() ) { + if ( t != null ) { + new_data.addTaxonomy( ( Taxonomy ) t.copy() ); + } + } + } + if ( isHasBinaryCharacters() ) { + new_data.setBinaryCharacters( ( BinaryCharacters ) getBinaryCharacters().copy() ); + } + if ( ( getReferences() != null ) && ( getReferences().size() > 0 ) ) { + new_data.setReferences( new ArrayList() ); + for( final Reference r : getReferences() ) { + if ( r != null ) { + new_data.addReference( ( Reference ) r.copy() ); + } + } + } + if ( ( getDistributions() != null ) && ( getDistributions().size() > 0 ) ) { + new_data.setDistributions( new ArrayList() ); + for( final Distribution d : getDistributions() ) { + if ( d != null ) { + new_data.addDistribution( ( Distribution ) d.copy() ); + } + } + } + if ( isHasDate() ) { + new_data.setDate( ( Date ) getDate().copy() ); + } + if ( isHasProperties() ) { + new_data.setProperties( ( PropertiesMap ) getProperties().copy() ); + } + return 
new_data; + } + + public BinaryCharacters getBinaryCharacters() { + return _binary_characters; + } + + public Date getDate() { + return _date; + } + + /** + * Convenience method -- always returns the first Distribution. + * + * @return Distribution + */ + public Distribution getDistribution() { + return getDistribution( 0 ); + } + + public Distribution getDistribution( final int index ) { + return _distributions.get( index ); + } + + public List getDistributions() { + return _distributions; + } + + public Event getEvent() { + return _event; + } + + public Identifier getNodeIdentifier() { + return _node_identifier; + } + + public PropertiesMap getProperties() { + return _properties; + } + + /** + * Convenience method -- always returns the first Reference. + * + * @return Reference + * + */ + public Reference getReference() { + return getReference( 0 ); + } + + public Reference getReference( final int index ) { + return _references.get( index ); + } + + public List getReferences() { + return _references; + } + + /** + * Convenience method -- always returns the first Sequence. + * + * @return Sequence + */ + public Sequence getSequence() { + return getSequence( 0 ); + } + + public Sequence getSequence( final int index ) { + return _sequences.get( index ); + } + + public List getSequences() { + return _sequences; + } + + public List getTaxonomies() { + return _taxonomies; + } + + /** + * Convenience method -- always returns the first Taxonomy. + * + * @return Taxonomy + * + */ + public Taxonomy getTaxonomy() { + return getTaxonomy( 0 ); + } + + public Taxonomy getTaxonomy( final int index ) { + return _taxonomies.get( index ); + } + + public boolean isEqual( final PhylogenyData data ) { + throw new UnsupportedOperationException(); + } + + public boolean isHasBinaryCharacters() { + return getBinaryCharacters() != null; + } + + public boolean isHasDate() { + return ( getDate() != null ) + && ( !ForesterUtil.isEmpty( getDate().getDesc() ) || !ForesterUtil.isNull( getDate().getMax() ) + || !ForesterUtil.isNull( getDate().getMin() ) || !ForesterUtil.isNull( getDate().getValue() ) || !ForesterUtil + .isEmpty( getDate().getUnit() ) ); + } + + public boolean isHasDistribution() { + return ( ( ( getDistributions() != null ) && ( getDistributions().size() > 0 ) ) && ( ( !ForesterUtil + .isEmpty( getDistribution().getDesc() ) ) + || ( ( getDistribution().getPoints() != null ) && ( getDistribution().getPoints().size() > 0 ) ) || ( ( getDistribution() + .getPolygons() != null ) && ( getDistribution().getPolygons().size() > 0 ) ) ) ); + } + + public boolean isHasEvent() { + return getEvent() != null; + } + + public boolean isHasNodeIdentifier() { + return getNodeIdentifier() != null; + } + + public boolean isHasProperties() { + return getProperties() != null; + } + + public boolean isHasReference() { + return ( ( getReferences() != null ) && ( getReferences().size() > 0 ) ) + && ( !ForesterUtil.isEmpty( getReference().getDoi() ) || !ForesterUtil.isEmpty( getReference() + .getDescription() ) ); + } + + public boolean isHasSequence() { + return ( getSequences() != null ) && ( getSequences().size() > 0 ) && ( getSequences().get( 0 ) != null ); + } + + public boolean isHasTaxonomy() { + return ( getTaxonomies() != null ) && ( getTaxonomies().size() > 0 ) && ( getTaxonomies().get( 0 ) != null ); + } + + public void setBinaryCharacters( final BinaryCharacters binary_characters ) { + _binary_characters = binary_characters; + } + + public void setDate( final Date date ) { + _date = date; + } + + /** + * Convenience 
method -- always sets the first Distribution. + * + */ + public void setDistribution( final Distribution distribution ) { + if ( _distributions == null ) { + _distributions = new ArrayList(); + } + if ( _distributions.size() == 0 ) { + _distributions.add( distribution ); + } + else { + _distributions.set( 0, distribution ); + } + } + + public void setDistribution( final int index, final Distribution distribution ) { + if ( _distributions == null ) { + _distributions = new ArrayList(); + } + _distributions.set( index, distribution ); + } + + private void setDistributions( final List distributions ) { + _distributions = distributions; + } + + public void setEvent( final Event event ) { + _event = event; + } + + public void setNodeIdentifier( final Identifier node_identifier ) { + _node_identifier = node_identifier; + } + + public void setProperties( final PropertiesMap custom_data ) { + _properties = custom_data; + } + + public void setReference( final int index, final Reference reference ) { + if ( _references == null ) { + _references = new ArrayList(); + } + _references.set( index, reference ); + } + + /** + * Convenience method -- always sets the first Reference. + * + */ + public void setReference( final Reference reference ) { + if ( _references == null ) { + _references = new ArrayList(); + } + if ( _references.size() == 0 ) { + _references.add( reference ); + } + else { + _references.set( 0, reference ); + } + } + + private void setReferences( final List references ) { + _references = references; + } + + public void setSequence( final int index, final Sequence sequence ) { + if ( _sequences == null ) { + _sequences = new ArrayList(); + } + _sequences.set( index, sequence ); + } + + /** + * Convenience method -- always sets the first Sequence. + * + */ + public void setSequence( final Sequence sequence ) { + if ( _sequences == null ) { + _sequences = new ArrayList(); + } + if ( _sequences.size() == 0 ) { + _sequences.add( sequence ); + } + else { + _sequences.set( 0, sequence ); + } + } + + private void setSequences( final List sequences ) { + _sequences = sequences; + } + + private void setTaxonomies( final List taxonomies ) { + _taxonomies = taxonomies; + } + + public void setTaxonomy( final int index, final Taxonomy taxonomy ) { + if ( _taxonomies == null ) { + _taxonomies = new ArrayList(); + } + _taxonomies.set( index, taxonomy ); + } + + /** + * Convenience method -- always sets the first Taxonomy. 
+ * + */ + public void setTaxonomy( final Taxonomy taxonomy ) { + if ( _taxonomies == null ) { + _taxonomies = new ArrayList(); + } + if ( _taxonomies.size() == 0 ) { + _taxonomies.add( taxonomy ); + } + else { + _taxonomies.set( 0, taxonomy ); + } + } + + public StringBuffer toNHX() { + final StringBuffer sb = new StringBuffer(); + if ( isHasNodeIdentifier() ) { + sb.append( getNodeIdentifier().toNHX() ); + } + if ( isHasTaxonomy() ) { + sb.append( getTaxonomy().toNHX() ); + } + if ( isHasSequence() ) { + sb.append( getSequence().toNHX() ); + } + if ( isHasEvent() ) { + sb.append( getEvent().toNHX() ); + } + if ( isHasProperties() ) { + sb.append( getProperties().toNHX() ); + } + return sb; + } + + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + if ( isHasNodeIdentifier() ) { + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + // if ( !org.forester.util.ForesterUtil.isEmpty( getNodeIdentifier().getProvider() ) ) { + // PhylogenyDataUtil.appendElement( writer, PhyloXmlMapping.NODE_IDENTIFIER, getNodeIdentifier() + // .getValue(), PhyloXmlMapping.IDENTIFIER_PROVIDER_ATTR, getNodeIdentifier().getProvider() ); + // } + // else { + // PhylogenyDataUtil.appendElement( writer, PhyloXmlMapping.NODE_IDENTIFIER, getNodeIdentifier() + // .getValue() ); + // } + } + if ( isHasTaxonomy() ) { + for( final Taxonomy t : getTaxonomies() ) { + if ( !t.isEmpty() ) { + t.toPhyloXML( writer, level, indentation ); + } + } + } + if ( isHasSequence() ) { + for( final Sequence s : getSequences() ) { + if ( !s.isEmpty() ) { + s.toPhyloXML( writer, level, indentation ); + } + } + } + if ( isHasEvent() ) { + getEvent().toPhyloXML( writer, level, indentation ); + } + if ( isHasBinaryCharacters() ) { + getBinaryCharacters().toPhyloXML( writer, level, indentation ); + } + if ( isHasDistribution() ) { + for( final Distribution d : getDistributions() ) { + d.toPhyloXML( writer, level, indentation ); + } + } + if ( isHasDate() ) { + getDate().toPhyloXML( writer, level, indentation ); + } + if ( isHasReference() ) { + for( final Reference r : getReferences() ) { + r.toPhyloXML( writer, level, indentation ); + } + } + if ( isHasProperties() ) { + getProperties().toPhyloXML( writer, level, indentation.substring( 0, indentation.length() - 2 ) ); + } + if ( ( getVector() != null ) + && !getVector().isEmpty() + && ( ( getProperties() == null ) || getProperties() + .getPropertiesWithGivenReferencePrefix( PhyloXmlUtil.VECTOR_PROPERTY_REF ).isEmpty() ) ) { + final List ps = vectorToProperties( getVector() ); + final String my_indent = indentation.substring( 0, indentation.length() - 2 ); + for( final Property p : ps ) { + p.toPhyloXML( writer, level, my_indent ); + } + } + } + + private List vectorToProperties( final List vector ) { + final List properties = new ArrayList(); + for( int i = 0; i < vector.size(); ++i ) { + properties.add( new Property( PhyloXmlUtil.VECTOR_PROPERTY_REF + i, + String.valueOf( vector.get( i ) ), + "", + PhyloXmlUtil.VECTOR_PROPERTY_TYPE, + AppliesTo.NODE ) ); + } + return properties; + } + + public void setVector( final List vector ) { + _vector = vector; + } + + public List getVector() { + return _vector; + } + + public String getNodeName() { + return _node_name; + } + + public void setNodeName( final String node_name ) { + _node_name = node_name; + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/PhylogenyData.java b/forester/java/src/org/forester/phylogeny/data/PhylogenyData.java new file mode 
100644 index 0000000..d5be4d7 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/PhylogenyData.java @@ -0,0 +1,72 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; + +/* + * Interface for data for annotating a Phylogeny. + */ +public interface PhylogenyData { + + public StringBuffer asSimpleText(); + + public StringBuffer asText(); + + /** + * Creates a new PhylogenyData object with identical values as this + * PhylogenyData. + * This ~should~ return a deep copy, but not there yet. + * + * + * @return a ~deep~ copy of this PhylogenyData + */ + public PhylogenyData copy(); + + /** + * Compares this PhylogenyData to PhylogenyData data. In general, this + * should return true if and only if all fiels are exactly identical. + * + * @param PhylogenyData + * the PhylogenyData to compare to + * @return in general, true if and only if all fiels are exactly identical, + * false otherwise + */ + public boolean isEqual( final PhylogenyData data ); + + public StringBuffer toNHX(); + + /** + * Writes a phyloXML representation of this phylogeny data. + * + * @param writer + * @param level + * @param indentation + * @throws IOException + */ + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException; +} \ No newline at end of file diff --git a/forester/java/src/org/forester/phylogeny/data/PhylogenyDataUtil.java b/forester/java/src/org/forester/phylogeny/data/PhylogenyDataUtil.java new file mode 100644 index 0000000..a90181d --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/PhylogenyDataUtil.java @@ -0,0 +1,372 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
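+//
+// (Illustrative sketch of the helpers defined below; "w" stands for any open
+// java.io.Writer, and the element and attribute names are assumed example values.)
+//
+//     PhylogenyDataUtil.appendOpen( w, "clade" );   // writes <clade>
+//     PhylogenyDataUtil.appendClose( w, "clade" );  // writes </clade>
+//     PhylogenyDataUtil.appendElement( w, "confidence", "0.95", "type", "bootstrap" );
+//     // writes <confidence type="bootstrap">0.95</confidence>, with the value passed
+//     // through replaceIllegalXmlCharacters() and empty-valued attributes omitted.
+//     // The overloads taking an indentation argument additionally prepend a line
+//     // separator, the indentation, and PhylogenyWriter.PHYLO_XML_INTENDATION_BASE.
+//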
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.awt.Graphics; +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; + +import org.forester.io.writers.PhylogenyWriter; +import org.forester.util.ForesterUtil; + +public final class PhylogenyDataUtil { + + public static void appendClose( final Writer w, final String element_name ) throws IOException { + w.write( "</" ); + w.write( element_name ); + w.write( ">" ); + } + + public static void appendElement( final Writer w, final String element_name, final String value ) + throws IOException { + appendOpen( w, element_name ); + w.write( replaceIllegalXmlCharacters( value ) ); + appendClose( w, element_name ); + } + + public static void appendElement( final Writer w, + final String element_name, + final String value, + final String indentation ) throws IOException { + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( indentation ); + w.write( PhylogenyWriter.PHYLO_XML_INTENDATION_BASE ); + // Something like this replacement needs to be done in a more systematic manner. + appendElement( w, element_name, value ); + } + + public static void appendElement( final Writer w, + final String element_name, + final String value, + final String attribute_name, + final String attribute_value ) throws IOException { + appendOpen( w, element_name, attribute_name, attribute_value ); + w.write( replaceIllegalXmlCharacters( value ) ); + appendClose( w, element_name ); + } + + public static void appendElement( final Writer w, + final String element_name, + final String value, + final String attribute_name, + final String attribute_value, + final String indentation ) throws IOException { + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( indentation ); + w.write( PhylogenyWriter.PHYLO_XML_INTENDATION_BASE ); + appendOpen( w, element_name, attribute_name, attribute_value ); + w.write( replaceIllegalXmlCharacters( value ) ); + appendClose( w, element_name ); + } + + public static void appendElement( final Writer w, + final String element_name, + final String value, + final String attribute1_name, + final String attribute1_value, + final String attribute2_name, + final String attribute2_value, + final String indentation ) throws IOException { + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( indentation ); + w.write( PhylogenyWriter.PHYLO_XML_INTENDATION_BASE ); + appendOpen( w, element_name, attribute1_name, attribute1_value, attribute2_name, attribute2_value ); + w.write( replaceIllegalXmlCharacters( value ) ); + appendClose( w, element_name ); + } + + public static void appendElement( final Writer w, + final String element_name, + final String attribute1_name, + final String attribute1_value, + final String attribute2_name, + final String attribute2_value, + final String attribute3_name, + final String attribute3_value, + final String attribute4_name, + final String attribute4_value, + final String indentation ) throws IOException { + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( indentation ); + appendOpen( w, + element_name, + attribute1_name, + attribute1_value, + attribute2_name, + attribute2_value, + attribute3_name, + attribute3_value, + attribute4_name, + attribute4_value ); + appendClose( w, element_name ); + } + + public static void appendElement( final Writer 
w, + final String element_name, + final String value, + final String attribute1_name, + final String attribute1_value, + final String attribute2_name, + final String attribute2_value, + final String attribute3_name, + final String attribute3_value, + final String attribute4_name, + final String attribute4_value, + final String attribute5_name, + final String attribute5_value, + final String indentation ) throws IOException { + w.write( ForesterUtil.LINE_SEPARATOR ); + w.write( indentation ); + w.write( PhylogenyWriter.PHYLO_XML_INTENDATION_BASE ); + appendOpen( w, + element_name, + attribute1_name, + attribute1_value, + attribute2_name, + attribute2_value, + attribute3_name, + attribute3_value, + attribute4_name, + attribute4_value, + attribute5_name, + attribute5_value ); + w.write( replaceIllegalXmlCharacters( value ) ); + appendClose( w, element_name ); + } + + public static void appendOpen( final Writer w, final String element_name ) throws IOException { + w.write( "<" ); + w.write( element_name ); + w.write( ">" ); + } + + public static void appendOpen( final Writer w, + final String element_name, + final String attribute_name, + final String attribute_value ) throws IOException { + w.write( "<" ); + w.write( element_name ); + if ( !ForesterUtil.isEmpty( attribute_value ) ) { + w.write( " " ); + w.write( attribute_name ); + w.write( "=\"" ); + w.write( attribute_value ); + w.write( "\"" ); + } + w.write( ">" ); + } + + public static void appendOpen( final Writer w, + final String element_name, + final String attribute1_name, + final String attribute1_value, + final String attribute2_name, + final String attribute2_value ) throws IOException { + w.write( "<" ); + w.write( element_name ); + if ( !ForesterUtil.isEmpty( attribute1_value ) ) { + w.write( " " ); + w.write( attribute1_name ); + w.write( "=\"" ); + w.write( attribute1_value ); + w.write( "\"" ); + } + if ( !ForesterUtil.isEmpty( attribute2_value ) ) { + w.write( " " ); + w.write( attribute2_name ); + w.write( "=\"" ); + w.write( attribute2_value ); + w.write( "\"" ); + } + w.write( ">" ); + } + + public static void appendOpen( final Writer w, + final String element_name, + final String attribute1_name, + final String attribute1_value, + final String attribute2_name, + final String attribute2_value, + final String attribute3_name, + final String attribute3_value ) throws IOException { + w.write( "<" ); + w.write( element_name ); + if ( !ForesterUtil.isEmpty( attribute1_value ) ) { + w.write( " " ); + w.write( attribute1_name ); + w.write( "=\"" ); + w.write( attribute1_value ); + w.write( "\"" ); + } + if ( !ForesterUtil.isEmpty( attribute2_value ) ) { + w.write( " " ); + w.write( attribute2_name ); + w.write( "=\"" ); + w.write( attribute2_value ); + w.write( "\"" ); + } + if ( !ForesterUtil.isEmpty( attribute2_value ) ) { + w.write( " " ); + w.write( attribute3_name ); + w.write( "=\"" ); + w.write( attribute3_value ); + w.write( "\"" ); + } + w.write( ">" ); + } + + public static void appendOpen( final Writer w, + final String element_name, + final String attribute1_name, + final String attribute1_value, + final String attribute2_name, + final String attribute2_value, + final String attribute3_name, + final String attribute3_value, + final String attribute4_name, + final String attribute4_value ) throws IOException { + w.write( "<" ); + w.write( element_name ); + if ( !ForesterUtil.isEmpty( attribute1_value ) ) { + w.write( " " ); + w.write( attribute1_name ); + w.write( "=\"" ); + w.write( attribute1_value ); + w.write( 
"\"" ); + } + if ( !ForesterUtil.isEmpty( attribute2_value ) ) { + w.write( " " ); + w.write( attribute2_name ); + w.write( "=\"" ); + w.write( attribute2_value ); + w.write( "\"" ); + } + if ( !ForesterUtil.isEmpty( attribute3_value ) ) { + w.write( " " ); + w.write( attribute3_name ); + w.write( "=\"" ); + w.write( attribute3_value ); + w.write( "\"" ); + } + if ( !ForesterUtil.isEmpty( attribute4_value ) ) { + w.write( " " ); + w.write( attribute4_name ); + w.write( "=\"" ); + w.write( attribute4_value ); + w.write( "\"" ); + } + w.write( ">" ); + } + + public static void appendOpen( final Writer w, + final String element_name, + final String attribute1_name, + final String attribute1_value, + final String attribute2_name, + final String attribute2_value, + final String attribute3_name, + final String attribute3_value, + final String attribute4_name, + final String attribute4_value, + final String attribute5_name, + final String attribute5_value ) throws IOException { + w.write( "<" ); + w.write( element_name ); + if ( !ForesterUtil.isEmpty( attribute1_value ) ) { + w.write( " " ); + w.write( attribute1_name ); + w.write( "=\"" ); + w.write( attribute1_value ); + w.write( "\"" ); + } + if ( !ForesterUtil.isEmpty( attribute2_value ) ) { + w.write( " " ); + w.write( attribute2_name ); + w.write( "=\"" ); + w.write( attribute2_value ); + w.write( "\"" ); + } + if ( !ForesterUtil.isEmpty( attribute3_value ) ) { + w.write( " " ); + w.write( attribute3_name ); + w.write( "=\"" ); + w.write( attribute3_value ); + w.write( "\"" ); + } + if ( !ForesterUtil.isEmpty( attribute4_value ) ) { + w.write( " " ); + w.write( attribute4_name ); + w.write( "=\"" ); + w.write( attribute4_value ); + w.write( "\"" ); + } + if ( !ForesterUtil.isEmpty( attribute5_value ) ) { + w.write( " " ); + w.write( attribute5_name ); + w.write( "=\"" ); + w.write( attribute5_value ); + w.write( "\"" ); + } + w.write( ">" ); + } + + /** + * Creates a deep copy of ArrayList of PhylogenyData objects. 
+ * + * @param list + * an ArrayList of PhylogenyData objects + * @return a deep copy of ArrayList list + */ + public static ArrayList copy( final ArrayList list ) { + final ArrayList l = new ArrayList( list.size() ); + for( int i = 0; i < list.size(); ++i ) { + l.add( ( list.get( i ) ).copy() ); + } + return l; + } + + public static void drawLine( final double x1, final double y1, final double x2, final double y2, final Graphics g ) { + g.drawLine( org.forester.util.ForesterUtil.roundToInt( x1 ), + org.forester.util.ForesterUtil.roundToInt( y1 ), + org.forester.util.ForesterUtil.roundToInt( x2 ), + org.forester.util.ForesterUtil.roundToInt( y2 ) ); + } + + public static void drawString( final String str, final double x, final double y, final Graphics g ) { + g.drawString( str, org.forester.util.ForesterUtil.roundToInt( x ), org.forester.util.ForesterUtil + .roundToInt( y ) ); + } + + public static String replaceIllegalXmlCharacters( final String value ) { + String v = value.replaceAll( "&", "&amp;" ); + v = v.replaceAll( "<", "&lt;" ); + v = v.replaceAll( ">", "&gt;" ); + v = v.replaceAll( "'", "&apos;" ); + v = v.replaceAll( "\"", "&quot;" ); + return v; + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/Point.java b/forester/java/src/org/forester/phylogeny/data/Point.java new file mode 100644 index 0000000..a20b3d9 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/Point.java @@ -0,0 +1,152 @@ + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; +import java.math.BigDecimal; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.util.ForesterUtil; + +public class Point implements PhylogenyData { + + private final String _geodetic_datum; + private final BigDecimal _lat; + private final BigDecimal _long; + private final BigDecimal _alt; + private final String _alt_unit; + public static final String UNKNOWN_GEODETIC_DATUM = "?"; + + public Point() { + this( UNKNOWN_GEODETIC_DATUM, null, null, null, "" ); + } + + public Point( final String geodetic_datum, final BigDecimal lat, final BigDecimal longitude ) { + this( geodetic_datum, lat, longitude, null, "" ); + } + + public boolean isEmpty() { + return ( _lat == null ) && ( _long == null ) && ( _alt == null ); + } + + public Point( final String geodetic_datum, + final BigDecimal lat, + final BigDecimal longitude, + final BigDecimal alt, + final String alt_unit ) { + if ( ForesterUtil.isEmpty( geodetic_datum ) ) { + throw new IllegalArgumentException( "illegal attempt to use empty geodetic datum" ); + } + if ( ( alt != null ) && ForesterUtil.isEmpty( alt_unit ) ) { + throw new IllegalArgumentException( "altitude must have a unit" ); + } + _geodetic_datum = geodetic_datum; + _lat = lat; + _long = longitude; + _alt = alt; + _alt_unit = alt_unit; + } + + @Override + public StringBuffer asSimpleText() { + if ( isEmpty() ) { + return new StringBuffer(); + } + else if ( getAltitude() == null ) { + return new StringBuffer( "[" + getLatitude().toPlainString() + ", " + getLongitude() + "]" ); + } + else { + return new StringBuffer( "[" + getLatitude().toPlainString() + ", " + getLongitude() + ", " + getAltitude() + + getAltiudeUnit() + "]" ); + } + } + + @Override + public StringBuffer asText() { + return asSimpleText(); + } + + @Override + public PhylogenyData copy() { + return new Point( getGeodeticDatum(), + getLatitude() == null ? null : new BigDecimal( getLatitude().toPlainString() ), + getLongitude() == null ? 
null : new BigDecimal( getLongitude().toPlainString() ), + getAltitude() == null ? null : new BigDecimal( getAltitude().toPlainString() ), + getAltiudeUnit() ); + } + + public BigDecimal getAltitude() { + return _alt; + } + + public String getAltiudeUnit() { + return _alt_unit; + } + + public String getGeodeticDatum() { + return _geodetic_datum; + } + + public BigDecimal getLatitude() { + return _lat; + } + + public BigDecimal getLongitude() { + return _long; + } + + @Override + public boolean isEqual( final PhylogenyData point ) { + throw new UnsupportedOperationException(); + } + + @Override + public StringBuffer toNHX() { + throw new UnsupportedOperationException(); + } + + @Override + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + if ( isEmpty() ) { + return; + } + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + if ( getAltitude() != null ) { + PhylogenyDataUtil.appendOpen( writer, + PhyloXmlMapping.POINT, + PhyloXmlMapping.POINT_GEODETIC_DATUM, + getGeodeticDatum(), + PhyloXmlMapping.POINT_ALTITUDE_UNIT_ATTR, + getAltiudeUnit() ); + } + else { + PhylogenyDataUtil.appendOpen( writer, + PhyloXmlMapping.POINT, + PhyloXmlMapping.POINT_GEODETIC_DATUM, + getGeodeticDatum() ); + } + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.POINT_LATITUDE, + getLatitude().toPlainString(), + indentation ); + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.POINT_LONGITUDE, + getLongitude().toPlainString(), + indentation ); + if ( getAltitude() != null ) { + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.POINT_ALTITUDE, + getAltitude().toPlainString(), + indentation ); + } + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendClose( writer, PhyloXmlMapping.POINT ); + } + + @Override + public String toString() { + return asSimpleText().toString(); + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/Polygon.java b/forester/java/src/org/forester/phylogeny/data/Polygon.java new file mode 100644 index 0000000..427d294 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/Polygon.java @@ -0,0 +1,109 @@ +// $Id: +// forester -- software libraries and applications +// for genomics and evolutionary biology research. +// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.List; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.util.ForesterUtil; + +public class Polygon implements PhylogenyData { + + private final List _points; + + public Polygon( final List points ) { + _points = points; + } + + @Override + public StringBuffer asSimpleText() { + final StringBuffer sb = new StringBuffer(); + boolean first = true; + for( final Point point : getPoints() ) { + if ( first ) { + first = false; + } + else { + sb.append( ForesterUtil.LINE_SEPARATOR ); + } + sb.append( point.asSimpleText() ); + } + return sb; + } + + @Override + public StringBuffer asText() { + return asSimpleText(); + } + + @Override + public PhylogenyData copy() { + final List new_points = new ArrayList(); + for( final Point point : getPoints() ) { + new_points.add( ( Point ) point.copy() ); + } + return new Polygon( new_points ); + } + + public List getPoints() { + return _points; + } + + public boolean isEmpty() { + return ForesterUtil.isEmpty( _points ); + } + + @Override + public boolean isEqual( final PhylogenyData data ) { + throw new UnsupportedOperationException(); + } + + @Override + public StringBuffer toNHX() { + throw new UnsupportedOperationException(); + } + + @Override + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + if ( isEmpty() ) { + return; + } + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendOpen( writer, PhyloXmlMapping.POLYGON ); + for( final Point point : getPoints() ) { + point.toPhyloXML( writer, level, indentation + PhylogenyWriter.PHYLO_XML_INTENDATION_BASE ); + writer.write( indentation ); + } + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendClose( writer, PhyloXmlMapping.POLYGON ); + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/PropertiesMap.java b/forester/java/src/org/forester/phylogeny/data/PropertiesMap.java new file mode 100644 index 0000000..195590f --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/PropertiesMap.java @@ -0,0 +1,205 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.List; +import java.util.SortedMap; +import java.util.TreeMap; + +import org.forester.util.ForesterUtil; + +public class PropertiesMap implements PhylogenyData { + + private final SortedMap _properties; + + public PropertiesMap() { + _properties = new TreeMap(); + } + + public int size() { + return _properties.size(); + } + + public void addProperty( final Property property ) throws IllegalArgumentException { + if ( getProperties().containsKey( property.getRef() ) ) { + throw new IllegalArgumentException( "ref [" + property.getRef() + "] is already present" ); + } + getProperties().put( property.getRef(), property ); + } + + @Override + public StringBuffer asSimpleText() { + final StringBuffer sb = new StringBuffer(); + boolean first = true; + for( final String ref : getPropertyRefs() ) { + if ( first ) { + first = false; + } + else { + sb.append( " " ); + } + sb.append( getProperty( ref ).asText() ); + } + return sb; + } + + @Override + public StringBuffer asText() { + return asSimpleText(); + } + + @Override + public PhylogenyData copy() { + final PropertiesMap new_one = new PropertiesMap(); + for( final String r : getProperties().keySet() ) { + new_one.addProperty( getProperties().get( r ) ); + } + return new_one; + } + + public SortedMap getProperties() { + return _properties; + } + + public Property[] getPropertiesArray() { + final Property[] a = new Property[ getProperties().size() ]; + int i = 0; + for( final String ref : getProperties().keySet() ) { + a[ i++ ] = getProperties().get( ref ); + } + return a; + } + + public List getPropertiesWithGivenReferencePrefix( final String ref_prefix ) + throws IllegalArgumentException { + if ( ForesterUtil.isEmpty( ref_prefix ) ) { + throw new IllegalArgumentException( "reference prefix is null or empty" ); + } + final String my_ref_prefix = new String( ref_prefix.trim() ); + final List props = new ArrayList(); + for( final String ref : getProperties().keySet() ) { + if ( ref.startsWith( my_ref_prefix ) ) { + props.add( getProperty( ref ) ); + } + } + return props; + } + + public Property getProperty( final String ref ) throws IllegalArgumentException { + if ( getProperties().containsKey( ref ) ) { + return getProperties().get( ref ); + } + else { + throw new IllegalArgumentException( "reference [" + ref + "] is not present" ); + } + } + + /** + * Returns all property refs of this PhylogenyNode as String array. 
+ */ + public String[] getPropertyRefs() { + if ( getProperties() == null ) { + return new String[ 0 ]; + } + final Property[] properties = getPropertiesArray(); + final String[] refs = new String[ properties.length ]; + for( int i = 0; i < properties.length; ++i ) { + refs[ i ] = properties[ i ].getRef(); + } + return refs; + } + + @Override + public boolean isEqual( final PhylogenyData data ) { + throw new UnsupportedOperationException(); + } + + public boolean refExists( final String ref ) { + if ( getProperties() != null ) { + for( final String r : getProperties().keySet() ) { + if ( r.equalsIgnoreCase( ref ) ) { + return true; + } + } + } + return false; + } + + public Property removeProperty( final String ref ) throws IllegalArgumentException { + if ( getProperties().containsKey( ref ) ) { + return getProperties().remove( ref ); + } + else { + throw new IllegalArgumentException( "reference [" + ref + "] is not present" ); + } + } + + public List removePropertiesWithGivenReferencePrefix( final String ref_prefix ) + throws IllegalArgumentException { + if ( ForesterUtil.isEmpty( ref_prefix ) ) { + throw new IllegalArgumentException( "reference prefix is null or empty" ); + } + final String my_ref_prefix = new String( ref_prefix.trim() ); + final List to_remove = new ArrayList(); + for( final String ref : getProperties().keySet() ) { + if ( ref.startsWith( my_ref_prefix ) ) { + to_remove.add( ref ); + } + } + for( final String ref : to_remove ) { + getProperties().remove( ref ); + } + return to_remove; + } + + @Override + public StringBuffer toNHX() { + final StringBuffer sb = new StringBuffer(); + if ( getProperties() != null ) { + for( final String ref : getProperties().keySet() ) { + sb.append( getProperties().get( ref ).toNHX() ); + } + } + return sb; + } + + @Override + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + if ( getProperties() != null ) { + for( final String ref : getProperties().keySet() ) { + getProperties().get( ref ).toPhyloXML( writer, level, indentation ); + } + } + } + + @Override + public String toString() { + return asSimpleText().toString(); + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/Property.java b/forester/java/src/org/forester/phylogeny/data/Property.java new file mode 100644 index 0000000..9fd9904 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/Property.java @@ -0,0 +1,332 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
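+//
+// (Illustrative sketch; the property below is an assumed example. Ref, unit, and
+// datatype must each contain a ":" when non-empty, otherwise the public
+// constructors throw IllegalArgumentException.)
+//
+//     final Property depth = new Property( "NOAA:depth", "100", "METRIC:m",
+//                                          "xsd:decimal", Property.AppliesTo.CLADE );
+//     depth.toNHX().toString();                                        // ":XC=D=NOAA:depth=100=METRIC:m"
+//     Property.createFromNhxString( "XC=D=NOAA:depth=100=METRIC:m" );  // parses it back into an equivalent Property
+//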
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; +import java.util.StringTokenizer; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.util.ForesterUtil; + +public class Property implements PhylogenyData { + + private String _value; + private final String _ref; + private final String _unit; + private final String _datatype; + private final AppliesTo _applies_to; + private final String _id_ref; + + public Property( final String ref, + final String value, + final String unit, + final String datatype, + final AppliesTo applies_to ) { + this( ref, value, unit, datatype, applies_to, "" ); + } + + // Only used by method createFromNhxString. + private Property( final String ref, + final String value, + final String unit, + final String datatype, + final AppliesTo applies_to, + final boolean dummy ) { + _ref = ref; + _unit = unit; + _datatype = datatype; + _applies_to = applies_to; + _id_ref = ""; + setValue( value ); + } + + public Property( final String ref, + final String value, + final String unit, + final String datatype, + final AppliesTo applies_to, + final String id_ref ) { + if ( !ForesterUtil.isEmpty( ref ) && ( ref.indexOf( ":" ) < 1 ) ) { + throw new IllegalArgumentException( "property reference [" + ref + + "] is not in the expected format (missing a \":\")" ); + } + if ( !ForesterUtil.isEmpty( unit ) && ( unit.indexOf( ":" ) < 1 ) ) { + throw new IllegalArgumentException( "property unit [" + unit + + "] is not in the expected format (missing a \":\")" ); + } + if ( !ForesterUtil.isEmpty( datatype ) && ( datatype.indexOf( ":" ) < 1 ) ) { + throw new IllegalArgumentException( "property datatype [" + unit + + "] is not in the expected format (missing a \":\")" ); + } + _ref = ref; + _unit = unit; + _datatype = datatype; + _applies_to = applies_to; + _id_ref = id_ref; + setValue( value ); + } + + @Override + public StringBuffer asSimpleText() { + return new StringBuffer( getValue() ); + } + + @Override + public StringBuffer asText() { + final StringBuffer sb = new StringBuffer(); + sb.append( getRef() ); + sb.append( ": " ); + sb.append( getValue() ); + if ( !ForesterUtil.isEmpty( getUnit() ) ) { + sb.append( getUnit() ); + } + return sb; + } + + @Override + public PhylogenyData copy() { + return new Property( getRef(), getValue(), getUnit(), getDataType(), getAppliesTo(), getIdRef() ); + } + + public AppliesTo getAppliesTo() { + return _applies_to; + } + + public String getDataType() { + return _datatype; + } + + public String getIdRef() { + return _id_ref; + } + + public String getRef() { + return _ref; + } + + public String getUnit() { + return _unit; + } + + public String getValue() { + return _value; + } + + @Override + public boolean isEqual( final PhylogenyData data ) { + if ( data == null ) { + return false; + } + return ( ( Property ) data ).getValue().equals( getValue() ) + && ( ( Property ) data ).getUnit().equals( getUnit() ) + && ( ( Property ) data ).getRef().equals( getRef() ); + } + + public void setValue( final String value ) { + _value = value; + } + + @Override + public StringBuffer toNHX() { + final StringBuffer nhx = new StringBuffer(); + nhx.append( ":X" ); + switch ( getAppliesTo() ) { + case 
CLADE: + nhx.append( "C=" ); + break; + case NODE: + nhx.append( "N=" ); + break; + case PARENT_BRANCH: + nhx.append( "B=" ); + break; + case PHYLOGENY: + nhx.append( "P=" ); + break; + case ANNOTATION: + nhx.append( "S=" ); + break; + default: + nhx.append( "O=" ); + break; + } + if ( !getDataType().equals( "" ) ) { + if ( getDataType().equals( "xsd:string" ) ) { + nhx.append( "S=" ); + } + else if ( getDataType().equals( "xsd:long" ) ) { + nhx.append( "L=" ); + } + else if ( getDataType().equals( "xsd:decimal" ) ) { + nhx.append( "D=" ); + } + else if ( getDataType().equals( "xsd:boolean" ) ) { + nhx.append( "B=" ); + } + else if ( getDataType().equals( "xsd:anyUR" ) ) { + nhx.append( "U=" ); + } + } + nhx.append( getRef() ); + nhx.append( "=" ); + nhx.append( getValue() ); + if ( !getUnit().equals( "" ) ) { + nhx.append( "=" ); + nhx.append( getUnit() ); + } + return nhx; + } + + @Override + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.PROPERTY, + getValue(), + PhyloXmlMapping.PROPERTY_REF, + getRef(), + PhyloXmlMapping.PROPERTY_UNIT, + getUnit(), + PhyloXmlMapping.PROPERTY_DATATYPE, + getDataType(), + PhyloXmlMapping.PROPERTY_APPLIES_TO, + getAppliesTo().toString(), + PhyloXmlMapping.ID_REF, + getIdRef(), + indentation ); + } + + @Override + public String toString() { + return asText().toString(); + } + + public static Property createFromNhxString( final String nhx ) throws IllegalArgumentException { + final StringTokenizer st = new StringTokenizer( nhx, "=" ); + final int tokens = st.countTokens(); + final String error = "error in NHX property tag format: " + + "expected: X[N|B|C|S|T|P|O]===[=], got: \"" + nhx + "\" instead"; + if ( ( tokens != 4 ) && ( tokens != 5 ) ) { + throw new IllegalArgumentException( error ); + } + final String first = st.nextToken(); + AppliesTo applies_to = null; + if ( first.equals( "XN" ) ) { + applies_to = AppliesTo.NODE; + } + else if ( first.equals( "XB" ) ) { + applies_to = AppliesTo.PARENT_BRANCH; + } + else if ( first.equals( "XC" ) ) { + applies_to = AppliesTo.CLADE; + } + else if ( first.equals( "XS" ) ) { + applies_to = AppliesTo.ANNOTATION; + } + else if ( first.equals( "XT" ) ) { + applies_to = AppliesTo.OTHER; + } + else if ( first.equals( "XP" ) ) { + applies_to = AppliesTo.PHYLOGENY; + } + else if ( first.equals( "XO" ) ) { + applies_to = AppliesTo.OTHER; + } + else { + throw new IllegalArgumentException( error ); + } + String datatype = st.nextToken(); + if ( datatype.equals( "S" ) ) { + datatype = "xsd:string"; + } + else if ( datatype.equals( "L" ) ) { + datatype = "xsd:long"; + } + else if ( datatype.equals( "D" ) ) { + datatype = "xsd:decimal"; + } + else if ( datatype.equals( "B" ) ) { + datatype = "xsd:boolean"; + } + else if ( datatype.equals( "U" ) ) { + datatype = "xsd:anyURI"; + } + final String ref = st.nextToken(); + final String value = st.nextToken(); + String unit = ""; + if ( tokens == 5 ) { + unit = st.nextToken(); + } + return new Property( ref, value, unit, datatype, applies_to, true ); + } + + public static enum AppliesTo { + PHYLOGENY { + + @Override + public String toString() { + return "phylogeny"; + } + }, + CLADE { + + @Override + public String toString() { + return "clade"; + } + }, + NODE { + + @Override + public String toString() { + return "node"; + } + }, + ANNOTATION { + + @Override + public String toString() { + return "annotation"; + } + }, + PARENT_BRANCH { + + @Override + public 
String toString() { + return "parent_branch"; + } + }, + OTHER { + + @Override + public String toString() { + return "other"; + } + } + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/ProteinDomain.java b/forester/java/src/org/forester/phylogeny/data/ProteinDomain.java new file mode 100644 index 0000000..81155ff --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/ProteinDomain.java @@ -0,0 +1,171 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.util.ForesterUtil; + +public class ProteinDomain implements PhylogenyData { + + final public static double CONFIDENCE_DEFAULT = 0.0; + final public static String IDENTIFIER_DEFAULT = ""; + final private String _name; + final private int _from; + final private int _to; + final private String _id; + final private double _confidence; + + public ProteinDomain( final String name, final int from, final int to ) { + this( name, from, to, ProteinDomain.IDENTIFIER_DEFAULT, ProteinDomain.CONFIDENCE_DEFAULT ); + } + + public ProteinDomain( final String name, final int from, final int to, final double confidence ) { + this( name, from, to, ProteinDomain.IDENTIFIER_DEFAULT, confidence ); + } + + public ProteinDomain( final String name, final int from, final int to, final String id ) { + this( name, from, to, id, ProteinDomain.CONFIDENCE_DEFAULT ); + } + + public ProteinDomain( final String name, final int from, final int to, final String id, final double confidence ) { + if ( ( from >= to ) || ( to < 0 ) ) { + throw new IllegalArgumentException( "attempt to create protein domain from " + from + " to " + to ); + } + _name = name; + _from = from; + _to = to; + _id = id; + _confidence = confidence; + } + + public StringBuffer asSimpleText() { + return new StringBuffer( getName() ); + } + + public StringBuffer asText() { + final StringBuffer sb = new StringBuffer( getName() ); + sb.append( " [" ); + sb.append( getLength() ); + if ( !ForesterUtil.isEmpty( getId() ) ) { + sb.append( " " ); + sb.append( getId() ); + } + if ( getConfidence() != CONFIDENCE_DEFAULT ) { + sb.append( " " ); + sb.append( getConfidence() ); + } + sb.append( "]" ); + return sb; + } + + public PhylogenyData copy() { + if ( getId() == null ) { + return new ProteinDomain( getName(), getFrom(), getTo(), getConfidence() ); + } + return new ProteinDomain( getName(), 
getFrom(), getTo(), getId(), getConfidence() ); + } + + public double getConfidence() { + return _confidence; + } + + public int getFrom() { + return _from; + } + + public String getId() { + return _id; + } + + public int getLength() { + return ( getTo() - getFrom() + 1 ); + } + + public String getName() { + return _name; + } + + public int getTo() { + return _to; + } + + public boolean isEqual( final PhylogenyData protein_domain ) { + if ( protein_domain == null ) { + return false; + } + if ( !( protein_domain instanceof ProteinDomain ) ) { + return false; + } + else if ( ( ( ProteinDomain ) protein_domain ).getLength() != getLength() ) { + return false; + } + else if ( !( ( ProteinDomain ) protein_domain ).getName().equals( getName() ) ) { + return false; + } + return true; + } + + public StringBuffer toNHX() { + throw new UnsupportedOperationException(); + } + + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + if ( getId() != null ) { + PhylogenyDataUtil.appendOpen( writer, + PhyloXmlMapping.SEQUENCE_DOMAIN_ARCHITECTURE_DOMAIN, + PhyloXmlMapping.SEQUENCE_DOMAIN_ARCHITECTURE_PROT_DOMAIN_FROM, + getFrom() + "", + PhyloXmlMapping.SEQUENCE_DOMAIN_ARCHITECTURE_PROT_DOMAIN_TO, + getTo() + "", + PhyloXmlMapping.SEQUENCE_DOMAIN_ARCHITECTURE_PROT_DOMAIN_CONFIDENCE, + getConfidence() + "", + PhyloXmlMapping.IDENTIFIER, + getId() ); + } + else { + PhylogenyDataUtil.appendOpen( writer, + PhyloXmlMapping.SEQUENCE_DOMAIN_ARCHITECTURE_DOMAIN, + PhyloXmlMapping.SEQUENCE_DOMAIN_ARCHITECTURE_PROT_DOMAIN_FROM, + getFrom() + "", + PhyloXmlMapping.SEQUENCE_DOMAIN_ARCHITECTURE_PROT_DOMAIN_TO, + getTo() + "", + PhyloXmlMapping.SEQUENCE_DOMAIN_ARCHITECTURE_PROT_DOMAIN_CONFIDENCE, + getConfidence() + "" ); + } + writer.write( getName() ); + PhylogenyDataUtil.appendClose( writer, PhyloXmlMapping.SEQUENCE_DOMAIN_ARCHITECTURE_DOMAIN ); + } + + @Override + public String toString() { + return asText().toString(); + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/Reference.java b/forester/java/src/org/forester/phylogeny/data/Reference.java new file mode 100644 index 0000000..01118a5 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/Reference.java @@ -0,0 +1,117 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
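// ---------------------------------------------------------------------------
// Illustrative sketch of the ProteinDomain value object above (hedged example;
// the domain name, coordinates, accession, and confidence are invented).
// Coordinates are treated as inclusive (getLength() is to - from + 1), and the
// constructor rejects from >= to.
import org.forester.phylogeny.data.ProteinDomain;

public class ProteinDomainExample {

    public static void main( final String[] args ) {
        final ProteinDomain kinase = new ProteinDomain( "Pkinase", 10, 270, "PF00069", 1.2E-30 );
        System.out.println( kinase.getLength() ); // 261
        System.out.println( kinase.asText() );    // Pkinase [261 PF00069 1.2E-30]
    }
}
// ---------------------------------------------------------------------------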
com +// WWW: www.phylosoft.org + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; + +import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.PhyloXmlUtil; +import org.forester.util.ForesterUtil; + +public class Reference implements PhylogenyData { + + String _desc; + String _doi; + + public Reference( final String desc ) { + _desc = desc; + _doi = ""; + } + + public Reference( final String desc, final String doi ) { + _desc = desc; + _doi = doi; + } + + public StringBuffer asSimpleText() { + return new StringBuffer( getDescription() ); + } + + public StringBuffer asText() { + final StringBuffer sb = new StringBuffer(); + if ( !ForesterUtil.isEmpty( getDoi() ) ) { + sb.append( "[doi:" ); + sb.append( getDoi() ); + sb.append( "] " ); + } + sb.append( getDescription() ); + return sb; + } + + public PhylogenyData copy() { + return new Reference( getDescription(), getDoi() ); + } + + public String getDoi() { + return _doi; + } + + public String getDescription() { + return _desc; + } + + public boolean isEqual( final PhylogenyData data ) { + if ( ( data == null ) || ( getDescription() == null ) ) { + return false; + } + return ( ( Reference ) data ).getDescription().equals( getDescription() ) + && ( ( Reference ) data ).getDoi().equals( getDoi() ); + } + + public void setDoi( final String doi ) { + if ( !ForesterUtil.isEmpty( doi ) && !PhyloXmlUtil.LIT_REF_DOI_PATTERN.matcher( doi ).matches() ) { + throw new PhyloXmlDataFormatException( "illegal doi: [" + doi + "]" ); + } + _doi = doi; + } + + public void setValue( final String value ) { + _desc = value; + } + + public StringBuffer toNHX() { + throw new UnsupportedOperationException(); + } + + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendOpen( writer, PhyloXmlMapping.REFERENCE, PhyloXmlMapping.REFERENCE_DOI_ATTR, getDoi() ); + if ( !ForesterUtil.isEmpty( getDescription() ) ) { + PhylogenyDataUtil.appendElement( writer, PhyloXmlMapping.REFERENCE_DESC, getDescription(), indentation ); + } + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendClose( writer, PhyloXmlMapping.REFERENCE ); + } + + @Override + public String toString() { + return asText().toString(); + } +} \ No newline at end of file diff --git a/forester/java/src/org/forester/phylogeny/data/Sequence.java b/forester/java/src/org/forester/phylogeny/data/Sequence.java new file mode 100644 index 0000000..27b3b0c --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/Sequence.java @@ -0,0 +1,388 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.List; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.forester.io.parsers.nhx.NHXtags; +import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.PhyloXmlUtil; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.util.ForesterUtil; + +public class Sequence implements PhylogenyData, MultipleUris { + + private String _mol_sequence; + private boolean _mol_sequence_is_aligned; + private String _name; + private String _source_id; + private Accession _accession; + private String _symbol; + private String _location; + private String _type; + private SortedSet _annotations; + private DomainArchitecture _da; + private List _uris; + private List _seq_relations; + + public Sequence() { + init(); + } + + public boolean isEmpty() { + return ( getAccession() == null ) && ForesterUtil.isEmpty( getName() ) && ForesterUtil.isEmpty( getSymbol() ) + && ForesterUtil.isEmpty( getType() ) && ForesterUtil.isEmpty( getLocation() ) + && ForesterUtil.isEmpty( getSourceId() ) && ForesterUtil.isEmpty( getMolecularSequence() ) + && ( getDomainArchitecture() == null ) && ForesterUtil.isEmpty( _annotations ) + && ForesterUtil.isEmpty( _uris ) && ForesterUtil.isEmpty( _seq_relations ); + } + + public void addAnnotation( final Annotation annotation ) { + getAnnotations().add( annotation ); + } + + public void addUri( final Uri uri ) { + if ( getUris() == null ) { + setUris( new ArrayList() ); + } + getUris().add( uri ); + } + + public void addSequenceRelation( final SequenceRelation sr ) { + _seq_relations.add( sr ); + } + + public StringBuffer asSimpleText() { + final StringBuffer sb = new StringBuffer(); + if ( getAccession() != null ) { + sb.append( "[" ); + sb.append( getAccession() ); + sb.append( "] " ); + } + if ( !ForesterUtil.isEmpty( getName() ) ) { + sb.append( getName() ); + sb.append( " " ); + } + if ( !ForesterUtil.isEmpty( getLocation() ) ) { + sb.append( getLocation() ); + } + return sb; + } + + public StringBuffer asText() { + return asSimpleText(); + } + + /** + * Not a deep copy. 
+ * + */ + public PhylogenyData copy() { + final Sequence seq = new Sequence(); + seq.setAnnotations( getAnnotations() ); + seq.setName( getName() ); + seq.setSymbol( getSymbol() ); + seq.setMolecularSequence( getMolecularSequence() ); + seq.setMolecularSequenceAligned( isMolecularSequenceAligned() ); + seq.setLocation( getLocation() ); + if ( getAccession() != null ) { + seq.setAccession( ( Accession ) getAccession().copy() ); + } + else { + seq.setAccession( null ); + } + seq.setType( getType() ); + if ( getUris() != null ) { + seq.setUris( new ArrayList() ); + for( final Uri uri : getUris() ) { + if ( uri != null ) { + seq.getUris().add( uri ); + } + } + } + if ( getDomainArchitecture() != null ) { + seq.setDomainArchitecture( ( DomainArchitecture ) getDomainArchitecture().copy() ); + } + else { + seq.setDomainArchitecture( null ); + } + return seq; + } + + @Override + public boolean equals( final Object o ) { + if ( this == o ) { + return true; + } + else if ( o == null ) { + return false; + } + else if ( o.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to check [" + this.getClass() + "] equality to " + o + " [" + + o.getClass() + "]" ); + } + else { + return isEqual( ( Sequence ) o ); + } + } + + public Accession getAccession() { + return _accession; + } + + public Annotation getAnnotation( final int i ) { + return ( Annotation ) getAnnotations().toArray()[ i ]; + } + + public SortedSet getAnnotations() { + if ( _annotations == null ) { + _annotations = new TreeSet(); + } + return _annotations; + } + + public DomainArchitecture getDomainArchitecture() { + return _da; + } + + public String getLocation() { + return _location; + } + + public String getMolecularSequence() { + return _mol_sequence; + } + + public boolean isMolecularSequenceAligned() { + return _mol_sequence_is_aligned; + } + + public String getName() { + return _name; + } + + public List getSequenceRelations() { + if ( _seq_relations == null ) { + _seq_relations = new ArrayList(); + } + return _seq_relations; + } + + private void setSequenceRelations( final List seq_relations ) { + _seq_relations = seq_relations; + } + + public String getSourceId() { + return _source_id; + } + + public String getSymbol() { + return _symbol; + } + + public String getType() { + return _type; + } + + public List getUris() { + return _uris; + } + + public Uri getUri( final int index ) { + return getUris().get( index ); + } + + @Override + public int hashCode() { + if ( getAccession() != null ) { + return getAccession().hashCode(); + } + int result = getSymbol().hashCode(); + if ( getName().length() > 0 ) { + result ^= getName().hashCode(); + } + if ( getMolecularSequence().length() > 0 ) { + result ^= getMolecularSequence().hashCode(); + } + return result; + } + + public boolean hasSequenceRelations() { + return _seq_relations.size() > 0; + } + + public void init() { + setAnnotations( null ); + setName( "" ); + setMolecularSequence( "" ); + setMolecularSequenceAligned( false ); + setLocation( "" ); + setAccession( null ); + setSymbol( "" ); + setType( "" ); + setDomainArchitecture( null ); + setUris( null ); + setSequenceRelations( null ); + setSourceId( null ); + } + + public boolean isEqual( final PhylogenyData data ) { + if ( this == data ) { + return true; + } + final Sequence s = ( Sequence ) data; + if ( ( getAccession() != null ) && ( s.getAccession() != null ) ) { + return getAccession().isEqual( s.getAccession() ); + } + return s.getMolecularSequence().equals( getMolecularSequence() ) && 
s.getName().equals( getName() ) + && s.getSymbol().equals( getSymbol() ); + } + + public void setAccession( final Accession accession ) { + _accession = accession; + } + + private void setAnnotations( final SortedSet annotations ) { + _annotations = annotations; + } + + public void setDomainArchitecture( final DomainArchitecture ds ) { + _da = ds; + } + + public void setLocation( final String description ) { + _location = description; + } + + public void setMolecularSequence( final String mol_sequence ) { + _mol_sequence = mol_sequence; + } + + public void setMolecularSequenceAligned( final boolean aligned ) { + _mol_sequence_is_aligned = aligned; + } + + public void setName( final String name ) { + _name = name; + } + + public void setSourceId( final String source_id ) { + _source_id = source_id; + } + + public void setSymbol( final String symbol ) { + if ( !ForesterUtil.isEmpty( symbol ) && !PhyloXmlUtil.SEQUENCE_SYMBOL_PATTERN.matcher( symbol ).matches() ) { + throw new PhyloXmlDataFormatException( "illegal sequence symbol: [" + symbol + "]" ); + } + _symbol = symbol; + } + + public void setType( final String type ) { + if ( !ForesterUtil.isEmpty( type ) && !PhyloXmlUtil.SEQUENCE_TYPES.contains( type ) ) { + throw new PhyloXmlDataFormatException( "illegal sequence type: [" + type + "]" ); + } + _type = type; + } + + public void setUris( final List uris ) { + _uris = uris; + } + + public StringBuffer toNHX() { + final StringBuffer sb = new StringBuffer(); + if ( getName().length() > 0 ) { + sb.append( ":" ); + sb.append( NHXtags.GENE_NAME ); + sb.append( ForesterUtil.replaceIllegalNhxCharacters( getName() ) ); + } + if ( getAccession() != null ) { + getAccession().toNHX(); + } + if ( getDomainArchitecture() != null ) { + sb.append( getDomainArchitecture().toNHX() ); + } + return sb; + } + + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + if ( isEmpty() ) { + return; + } + final String my_ind = indentation + PhylogenyWriter.PHYLO_XML_INTENDATION_BASE; + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendOpen( writer, PhyloXmlMapping.SEQUENCE, PhyloXmlMapping.SEQUENCE_TYPE, getType() ); + if ( !ForesterUtil.isEmpty( getSymbol() ) ) { + PhylogenyDataUtil.appendElement( writer, PhyloXmlMapping.SEQUENCE_SYMBOL, getSymbol(), indentation ); + } + if ( ( getAccession() != null ) && !ForesterUtil.isEmpty( getAccession().getValue() ) ) { + getAccession().toPhyloXML( writer, level, indentation ); + } + if ( !ForesterUtil.isEmpty( getName() ) ) { + PhylogenyDataUtil.appendElement( writer, PhyloXmlMapping.SEQUENCE_NAME, getName(), indentation ); + } + if ( !ForesterUtil.isEmpty( getLocation() ) ) { + PhylogenyDataUtil.appendElement( writer, PhyloXmlMapping.SEQUENCE_LOCATION, getLocation(), indentation ); + } + if ( !ForesterUtil.isEmpty( getMolecularSequence() ) ) { + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.SEQUENCE_MOL_SEQ, + getMolecularSequence(), + PhyloXmlMapping.SEQUENCE_MOL_SEQ_ALIGNED_ATTR, + String.valueOf( isMolecularSequenceAligned() ), + indentation ); + } + if ( getUris() != null ) { + for( final Uri uri : getUris() ) { + if ( uri != null ) { + uri.toPhyloXML( writer, level, indentation ); + } + } + } + if ( _annotations != null ) { + for( final PhylogenyData annotation : getAnnotations() ) { + annotation.toPhyloXML( writer, level, my_ind ); + } + } + if ( getDomainArchitecture() != null ) { + getDomainArchitecture().toPhyloXML( writer, level, my_ind ); + 
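// ---------------------------------------------------------------------------
// Illustrative sketch of building a Sequence and serializing it with the
// toPhyloXML() method above (hedged example; the name, residues, and URL are
// invented placeholders).
import java.io.StringWriter;

import org.forester.phylogeny.data.Sequence;
import org.forester.phylogeny.data.Uri;

public class SequenceExample {

    public static void main( final String[] args ) throws Exception {
        final Sequence seq = new Sequence();
        seq.setName( "putative kinase" );
        seq.setMolecularSequence( "MKAILVVLLYTFATANAD" );
        seq.setMolecularSequenceAligned( false );
        seq.addUri( new Uri( "http://www.example.org/records/1", "source record", "text/html" ) );
        final StringWriter writer = new StringWriter();
        seq.toPhyloXML( writer, 0, "" ); // writes a phyloXML <sequence> element
        System.out.println( writer );
    }
}
// ---------------------------------------------------------------------------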
} + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendClose( writer, PhyloXmlMapping.SEQUENCE ); + } + + @Override + public String toString() { + return asText().toString(); + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/SequenceRelation.java b/forester/java/src/org/forester/phylogeny/data/SequenceRelation.java new file mode 100644 index 0000000..de50d59 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/SequenceRelation.java @@ -0,0 +1,149 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; +import java.util.LinkedHashMap; +import java.util.Map; + +public class SequenceRelation implements PhylogenyData { + + //public final static Map typesToNames = new LinkedHashMap(); + public final static Map typesToNames = new LinkedHashMap(); + public final static String SEQUENCE_RELATION_TYPE_ORTHOLOGY = "orthology"; + public final static String SEQUENCE_RELATION_TYPE_ONE_TO_ONE_ORTHOLOGY = "one_to_one_orthology"; + public final static String SEQUENCE_RELATION_TYPE_SUPER_ORTHOLOGY = "super_orthology"; + public final static String SEQUENCE_RELATION_TYPE_PARALOGY = "paralogy"; + public final static String SEQUENCE_RELATION_TYPE_ULTRA_PARALOGY = "ultra_paralogy"; + public final static String SEQUENCE_RELATION_TYPE_XENOLOGY = "xenology"; + public final static String SEQUENCE_RELATION_TYPE_UNKNOWN = "unknown"; + public final static String SEQUENCE_RELATION_TYPE_OTHER = "other"; + private Sequence ref0; + private Sequence ref1; + private SEQUENCE_RELATION_TYPE type; + private Double distance; + private Confidence confidence; + static { + typesToNames.put( SEQUENCE_RELATION_TYPE.orthology, SEQUENCE_RELATION_TYPE_ORTHOLOGY ); + typesToNames.put( SEQUENCE_RELATION_TYPE.one_to_one_orthology, SEQUENCE_RELATION_TYPE_ONE_TO_ONE_ORTHOLOGY ); + typesToNames.put( SEQUENCE_RELATION_TYPE.super_orthology, SEQUENCE_RELATION_TYPE_SUPER_ORTHOLOGY ); + typesToNames.put( SEQUENCE_RELATION_TYPE.paralogy, SEQUENCE_RELATION_TYPE_PARALOGY ); + typesToNames.put( SEQUENCE_RELATION_TYPE.ultra_paralogy, SEQUENCE_RELATION_TYPE_ULTRA_PARALOGY ); + typesToNames.put( SEQUENCE_RELATION_TYPE.xenology, SEQUENCE_RELATION_TYPE_XENOLOGY ); + typesToNames.put( SEQUENCE_RELATION_TYPE.unknown, SEQUENCE_RELATION_TYPE_UNKNOWN ); + typesToNames.put( SEQUENCE_RELATION_TYPE.other, SEQUENCE_RELATION_TYPE_OTHER ); + } + + @Override + public 
StringBuffer asSimpleText() { + // TODO Auto-generated method stub + return null; + } + + @Override + public StringBuffer asText() { + // TODO Auto-generated method stub + return null; + } + + @Override + public PhylogenyData copy() { + // TODO Auto-generated method stub + return null; + } + + public Confidence getConfidence() { + return confidence; + } + + public Double getDistance() { + return distance; + } + + public Sequence getRef0() { + return ref0; + } + + public Sequence getRef1() { + return ref1; + } + + public SEQUENCE_RELATION_TYPE getType() { + return type; + } + + @Override + public boolean isEqual( final PhylogenyData data ) { + // TODO Auto-generated method stub + return false; + } + + public void setConfidence( final Confidence confidence ) { + this.confidence = confidence; + } + + public void setDistance( final Double distance ) { + this.distance = distance; + } + + public void setRef0( final Sequence ref0 ) { + this.ref0 = ref0; + } + + public void setRef1( final Sequence ref1 ) { + this.ref1 = ref1; + } + + public void setType( final SEQUENCE_RELATION_TYPE type ) { + this.type = type; + } + + @Override + public StringBuffer toNHX() { + throw new UnsupportedOperationException(); + } + + @Override + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + // TODO Auto-generated method stub + } + + public static String getPrintableNameByType( final SEQUENCE_RELATION_TYPE type ) { + String s = typesToNames.get( type ); + if ( s != null ) { + s = s.replace( '_', ' ' ); + if ( ( s.length() > 15 ) && s.toLowerCase().endsWith( "ology" ) ) { + s = s.substring( 0, s.length() - 5 ) + "."; + } + } + return s; + } + + public static enum SEQUENCE_RELATION_TYPE { + orthology, one_to_one_orthology, super_orthology, paralogy, ultra_paralogy, xenology, unknown, other; + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/Taxonomy.java b/forester/java/src/org/forester/phylogeny/data/Taxonomy.java new file mode 100644 index 0000000..026d68b --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/Taxonomy.java @@ -0,0 +1,394 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
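// ---------------------------------------------------------------------------
// Illustrative sketch of the SequenceRelation container above (hedged example;
// the sequence names and distance are invented). Note how long relation names
// are abbreviated by getPrintableNameByType().
import org.forester.phylogeny.data.Sequence;
import org.forester.phylogeny.data.SequenceRelation;

public class SequenceRelationExample {

    public static void main( final String[] args ) {
        final Sequence a = new Sequence();
        a.setName( "geneA" );
        final Sequence b = new Sequence();
        b.setName( "geneB" );
        final SequenceRelation rel = new SequenceRelation();
        rel.setRef0( a );
        rel.setRef1( b );
        rel.setType( SequenceRelation.SEQUENCE_RELATION_TYPE.one_to_one_orthology );
        rel.setDistance( 0.42 );
        System.out.println( SequenceRelation.getPrintableNameByType( rel.getType() ) ); // one to one orth.
    }
}
// ---------------------------------------------------------------------------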
com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.List; + +import org.forester.io.parsers.nhx.NHXtags; +import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; +import org.forester.io.parsers.phyloxml.PhyloXmlUtil; +import org.forester.util.ForesterUtil; + +public class Taxonomy implements PhylogenyData, MultipleUris, Comparable { + + private String _scientific_name; + private String _common_name; + private List _synonyms; + private String _authority; + private Identifier _identifier; + private String _taxonomy_code; + private String _rank; + private List _uris; + + public Taxonomy() { + init(); + } + + public StringBuffer asSimpleText() { + return asText(); + } + + public Uri getUri( final int index ) { + return getUris().get( index ); + } + + public void addUri( final Uri uri ) { + if ( getUris() == null ) { + setUris( new ArrayList() ); + } + getUris().add( uri ); + } + + public StringBuffer asText() { + final StringBuffer sb = new StringBuffer(); + if ( getIdentifier() != null ) { + sb.append( "[" ); + sb.append( getIdentifier().asSimpleText() ); + sb.append( "]" ); + } + if ( !ForesterUtil.isEmpty( getTaxonomyCode() ) ) { + if ( sb.length() > 0 ) { + sb.append( " " ); + } + sb.append( "[" ); + sb.append( getTaxonomyCode() ); + sb.append( "]" ); + } + if ( !ForesterUtil.isEmpty( getScientificName() ) ) { + if ( sb.length() > 0 ) { + sb.append( " " ); + } + sb.append( getScientificName() ); + if ( !ForesterUtil.isEmpty( getAuthority() ) ) { + sb.append( " (" ); + sb.append( getAuthority() ); + sb.append( ")" ); + } + } + if ( !ForesterUtil.isEmpty( getCommonName() ) ) { + if ( sb.length() > 0 ) { + sb.append( " " ); + } + sb.append( getCommonName() ); + } + return sb; + } + + public PhylogenyData copy() { + final Taxonomy t = new Taxonomy(); + t.setTaxonomyCode( getTaxonomyCode() ); + t.setScientificName( getScientificName() ); + t.setCommonName( getCommonName() ); + t.setAuthority( getAuthority() ); + for( final String syn : getSynonyms() ) { + t.getSynonyms().add( syn ); + } + if ( getIdentifier() != null ) { + t.setIdentifier( ( Identifier ) getIdentifier().copy() ); + } + else { + t.setIdentifier( null ); + } + t.setRank( new String( getRank() ) ); + if ( getUris() != null ) { + t.setUris( new ArrayList() ); + for( final Uri uri : getUris() ) { + if ( uri != null ) { + t.getUris().add( uri ); + } + } + } + return t; + } + + @Override + public boolean equals( final Object o ) { + if ( this == o ) { + return true; + } + else if ( o == null ) { + return false; + } + else if ( o.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to check [" + this.getClass() + "] equality to " + o + " [" + + o.getClass() + "]" ); + } + else { + return isEqual( ( Taxonomy ) o ); + } + } + + public String getAuthority() { + return _authority; + } + + public String getCommonName() { + return _common_name; + } + + public Identifier getIdentifier() { + return _identifier; + } + + public String getRank() { + return _rank; + } + + public String getScientificName() { + return _scientific_name; + } + + public List getSynonyms() { + if ( _synonyms == null ) { + _synonyms = new ArrayList(); + } + return _synonyms; + } + + public String getTaxonomyCode() { + return _taxonomy_code; + } + + public List getUris() { + return _uris; + } + + @Override + public int hashCode() { + if ( 
getIdentifier() != null ) { + return getIdentifier().hashCode(); + } + else if ( !ForesterUtil.isEmpty( getTaxonomyCode() ) ) { + return getTaxonomyCode().hashCode(); + } + else if ( !ForesterUtil.isEmpty( getScientificName() ) ) { + if ( !ForesterUtil.isEmpty( getAuthority() ) ) { + return ( getScientificName().toLowerCase() + getAuthority().toLowerCase() ).hashCode(); + } + return getScientificName().toLowerCase().hashCode(); + } + else { + return getCommonName().toLowerCase().hashCode(); + } + } + + public void init() { + setScientificName( "" ); + setCommonName( "" ); + setIdentifier( null ); + setRank( "" ); + setTaxonomyCode( "" ); + setAuthority( "" ); + setSynonyms( null ); + setUris( null ); + } + + public boolean isEmpty() { + return ( ( getIdentifier() == null ) && ForesterUtil.isEmpty( getTaxonomyCode() ) + && ForesterUtil.isEmpty( getCommonName() ) && ForesterUtil.isEmpty( getScientificName() ) + && ForesterUtil.isEmpty( getRank() ) && ForesterUtil.isEmpty( _uris ) + && ForesterUtil.isEmpty( getAuthority() ) && ForesterUtil.isEmpty( _synonyms ) ); + } + + /** + * + * If this and taxonomy 'data' has an identifier, comparison will be based on that. + * Otherwise, if this and taxonomy 'data' has a code, comparison will be based on that. + * Otherwise, if Taxonomy 'data' has a scientific name, comparison will be + * based on that (case insensitive!). + * Otherwise, if Taxonomy 'data' has a common name, comparison will be + * based on that (case insensitive!). + * (Note. This is important and should not be change without a very good reason.) + * + */ + public boolean isEqual( final PhylogenyData data ) { + if ( this == data ) { + return true; + } + final Taxonomy tax = ( Taxonomy ) data; + if ( ( getIdentifier() != null ) && ( tax.getIdentifier() != null ) ) { + return getIdentifier().isEqual( tax.getIdentifier() ); + } + else if ( !ForesterUtil.isEmpty( getTaxonomyCode() ) && !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) { + return getTaxonomyCode().equals( tax.getTaxonomyCode() ); + } + else if ( !ForesterUtil.isEmpty( getScientificName() ) && !ForesterUtil.isEmpty( tax.getScientificName() ) ) { + if ( !ForesterUtil.isEmpty( getAuthority() ) && !ForesterUtil.isEmpty( tax.getAuthority() ) ) { + return ( getScientificName().equalsIgnoreCase( tax.getScientificName() ) ) + && ( getAuthority().equalsIgnoreCase( tax.getAuthority() ) ); + } + return getScientificName().equalsIgnoreCase( tax.getScientificName() ); + } + else if ( !ForesterUtil.isEmpty( getCommonName() ) && !ForesterUtil.isEmpty( tax.getCommonName() ) ) { + return getCommonName().equalsIgnoreCase( tax.getCommonName() ); + } + else if ( !ForesterUtil.isEmpty( getScientificName() ) && !ForesterUtil.isEmpty( tax.getCommonName() ) ) { + return getScientificName().equalsIgnoreCase( tax.getCommonName() ); + } + else if ( !ForesterUtil.isEmpty( getCommonName() ) && !ForesterUtil.isEmpty( tax.getScientificName() ) ) { + return getCommonName().equalsIgnoreCase( tax.getScientificName() ); + } + throw new RuntimeException( "comparison not possible with empty fields" ); + } + + public void setAuthority( final String authority ) { + _authority = authority; + } + + public void setCommonName( final String common_name ) { + _common_name = common_name; + } + + public void setIdentifier( final Identifier identifier ) { + _identifier = identifier; + } + + public void setRank( final String rank ) { + if ( !ForesterUtil.isEmpty( rank ) && !PhyloXmlUtil.TAXONOMY_RANKS.contains( rank ) ) { + throw new PhyloXmlDataFormatException( "illegal 
rank: [" + rank + "]" ); + } + _rank = rank; + } + + public void setScientificName( final String scientific_name ) { + _scientific_name = scientific_name; + } + + private void setSynonyms( final List synonyms ) { + _synonyms = synonyms; + } + + public void setTaxonomyCode( final String taxonomy_code ) { + if ( !ForesterUtil.isEmpty( taxonomy_code ) + && !PhyloXmlUtil.TAXOMONY_CODE_PATTERN.matcher( taxonomy_code ).matches() ) { + throw new PhyloXmlDataFormatException( "illegal taxonomy code: [" + taxonomy_code + "]" ); + } + _taxonomy_code = taxonomy_code; + } + + public void setUris( final List uris ) { + _uris = uris; + } + + public StringBuffer toNHX() { + final StringBuffer sb = new StringBuffer(); + if ( getIdentifier() != null ) { + sb.append( ':' + NHXtags.TAXONOMY_ID ); + sb.append( ForesterUtil.replaceIllegalNhxCharacters( getIdentifier().getValue() ) ); + } + final StringBuffer species = new StringBuffer(); + if ( !ForesterUtil.isEmpty( getTaxonomyCode() ) ) { + species.append( ForesterUtil.replaceIllegalNhxCharacters( getTaxonomyCode() ) ); + } + if ( !ForesterUtil.isEmpty( getScientificName() ) ) { + ForesterUtil.appendSeparatorIfNotEmpty( species, '|' ); + species.append( ForesterUtil.replaceIllegalNhxCharacters( getScientificName() ) ); + } + if ( !ForesterUtil.isEmpty( getCommonName() ) ) { + ForesterUtil.appendSeparatorIfNotEmpty( species, '|' ); + species.append( ForesterUtil.replaceIllegalNhxCharacters( getCommonName() ) ); + } + if ( species.length() > 0 ) { + sb.append( ':' + NHXtags.SPECIES_NAME ); + sb.append( species ); + } + return sb; + } + + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + if ( isEmpty() ) { + return; + } + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendOpen( writer, PhyloXmlMapping.TAXONOMY ); + if ( ( getIdentifier() != null ) && !ForesterUtil.isEmpty( getIdentifier().getValue() ) ) { + getIdentifier().toPhyloXML( writer, level, indentation ); + } + if ( !ForesterUtil.isEmpty( getTaxonomyCode() ) ) { + PhylogenyDataUtil.appendElement( writer, PhyloXmlMapping.TAXONOMY_CODE, getTaxonomyCode(), indentation ); + } + if ( !ForesterUtil.isEmpty( getScientificName() ) ) { + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.TAXONOMY_SCIENTIFIC_NAME, + getScientificName(), + indentation ); + } + if ( !ForesterUtil.isEmpty( getAuthority() ) ) { + PhylogenyDataUtil.appendElement( writer, PhyloXmlMapping.TAXONOMY_AUTHORITY, getAuthority(), indentation ); + } + if ( !ForesterUtil.isEmpty( getCommonName() ) ) { + PhylogenyDataUtil + .appendElement( writer, PhyloXmlMapping.TAXONOMY_COMMON_NAME, getCommonName(), indentation ); + } + if ( _synonyms != null ) { + for( final String syn : getSynonyms() ) { + if ( !ForesterUtil.isEmpty( syn ) ) { + PhylogenyDataUtil.appendElement( writer, PhyloXmlMapping.TAXONOMY_SYNONYM, syn, indentation ); + } + } + } + if ( !ForesterUtil.isEmpty( getRank() ) ) { + PhylogenyDataUtil.appendElement( writer, PhyloXmlMapping.TAXONOMY_RANK, getRank(), indentation ); + } + if ( getUris() != null ) { + for( final Uri uri : getUris() ) { + if ( uri != null ) { + uri.toPhyloXML( writer, level, indentation ); + } + } + } + writer.write( ForesterUtil.LINE_SEPARATOR ); + writer.write( indentation ); + PhylogenyDataUtil.appendClose( writer, PhyloXmlMapping.TAXONOMY ); + } + + @Override + public String toString() { + return asText().toString(); + } + + @Override + public int compareTo( final Taxonomy o ) { + if ( equals( 
o ) ) { + return 0; + } + else if ( !ForesterUtil.isEmpty( getScientificName() ) && !ForesterUtil.isEmpty( o.getScientificName() ) ) { + return getScientificName().compareToIgnoreCase( o.getScientificName() ); + } + else if ( !ForesterUtil.isEmpty( getCommonName() ) && !ForesterUtil.isEmpty( o.getCommonName() ) ) { + return getCommonName().compareToIgnoreCase( o.getCommonName() ); + } + else if ( !ForesterUtil.isEmpty( getTaxonomyCode() ) && !ForesterUtil.isEmpty( o.getTaxonomyCode() ) ) { + return getTaxonomyCode().compareToIgnoreCase( o.getTaxonomyCode() ); + } + return 0; + } +} diff --git a/forester/java/src/org/forester/phylogeny/data/Uri.java b/forester/java/src/org/forester/phylogeny/data/Uri.java new file mode 100644 index 0000000..06559fa --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/data/Uri.java @@ -0,0 +1,127 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.data; + +import java.io.IOException; +import java.io.Writer; +import java.net.URI; + +import org.forester.io.parsers.phyloxml.PhyloXmlMapping; + +public class Uri implements PhylogenyData { + + final private URI _uri; + final private String _description; + final private String _type; + + public Uri( final String uri_str, final String description, final String type ) { + if ( uri_str == null ) { + throw new IllegalArgumentException( "attempt to create Uri from null" ); + } + _uri = URI.create( uri_str ); + _description = description; + _type = type; + } + + public Uri( final URI uri ) { + if ( uri == null ) { + throw new IllegalArgumentException( "attempt to create Uri from null URI" ); + } + _uri = uri; + _description = ""; + _type = ""; + } + + public Uri( final URI uri, final String description, final String type ) { + if ( uri == null ) { + throw new IllegalArgumentException( "attempt to create Uri from null URI" ); + } + _uri = uri; + _description = description; + _type = type; + } + + @Override + public StringBuffer asSimpleText() { + return new StringBuffer( getValue().toString() ); + } + + @Override + public StringBuffer asText() { + final StringBuffer sb = new StringBuffer(); + sb.append( "[" ); + sb.append( getDescription() ); + sb.append( " " ); + sb.append( getType() ); + sb.append( "] " ); + sb.append( getValue().toString() ); + return sb; + } + + @Override + public PhylogenyData copy() { + return new Uri( getValue().toString(), new String( getDescription() ), new String( getType() ) ); + } + + public String getDescription() { + return _description; + } + + public String getType() { + return _type; + } + + public URI getValue() { + return _uri; + } + + @Override + public boolean isEqual( final PhylogenyData data ) { + throw new UnsupportedOperationException(); + } + + @Override + public StringBuffer toNHX() { + throw new UnsupportedOperationException(); + } + + @Override + public void toPhyloXML( final Writer writer, final int level, final String indentation ) throws IOException { + PhylogenyDataUtil.appendElement( writer, + PhyloXmlMapping.URI, + getValue().toString(), + PhyloXmlMapping.TYPE_ATTR, + getType(), + PhyloXmlMapping.URI_DESC_ATTR, + getDescription(), + indentation ); + } + + @Override + public String toString() { + return asSimpleText().toString(); + } +} diff --git a/forester/java/src/org/forester/phylogeny/factories/BasicPhylogenyFactory.java b/forester/java/src/org/forester/phylogeny/factories/BasicPhylogenyFactory.java new file mode 100644 index 0000000..0d37507 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/factories/BasicPhylogenyFactory.java @@ -0,0 +1,46 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
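// ---------------------------------------------------------------------------
// Illustrative sketch of Taxonomy (with an attached Uri) from above. This is a
// hedged example: it assumes "HUMAN" satisfies PhyloXmlUtil.TAXOMONY_CODE_PATTERN
// and that "species" is contained in PhyloXmlUtil.TAXONOMY_RANKS; the URL is an
// invented placeholder.
import org.forester.phylogeny.data.Taxonomy;
import org.forester.phylogeny.data.Uri;

public class TaxonomyExample {

    public static void main( final String[] args ) {
        final Taxonomy human = new Taxonomy();
        human.setScientificName( "Homo sapiens" );
        human.setCommonName( "human" );
        human.setTaxonomyCode( "HUMAN" );
        human.setRank( "species" );
        human.addUri( new Uri( "http://www.example.org/taxonomy/9606", "taxonomy browser", "text/html" ) );
        final Taxonomy other = new Taxonomy();
        other.setScientificName( "HOMO SAPIENS" );
        // isEqual() falls back from identifier to code to (case-insensitive) names:
        System.out.println( human.isEqual( other ) ); // true
        System.out.println( human.asText() );         // [HUMAN] Homo sapiens human
    }
}
// ---------------------------------------------------------------------------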
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.factories; + +import java.io.IOException; + +import org.forester.phylogeny.Phylogeny; + +/* + * Convinience class for PhylogenyFactories not using parameters. + * + * @author Christian M. Zmasek + */ +public abstract class BasicPhylogenyFactory implements PhylogenyFactory { + + public Phylogeny create() { + return new Phylogeny(); + } + + public Phylogeny[] create( final Object source, final Object creator ) throws IOException { + return create( source, creator, null ); + } +} diff --git a/forester/java/src/org/forester/phylogeny/factories/ParserBasedPhylogenyFactory.java b/forester/java/src/org/forester/phylogeny/factories/ParserBasedPhylogenyFactory.java new file mode 100644 index 0000000..eab4efa --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/factories/ParserBasedPhylogenyFactory.java @@ -0,0 +1,89 @@ +// $Id: +// $ +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.factories; + +import java.io.IOException; +import java.util.List; + +import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.parsers.phyloxml.PhyloXmlParser; +import org.forester.phylogeny.Phylogeny; +import org.forester.util.ForesterUtil; + +public class ParserBasedPhylogenyFactory extends BasicPhylogenyFactory { + + private final static PhylogenyFactory _instance; + static { + try { + _instance = new ParserBasedPhylogenyFactory(); + } + catch ( final Throwable e ) { + throw new RuntimeException( e.getMessage() ); + } + } + + private ParserBasedPhylogenyFactory() { + // Private constructor. 
+ } + + @Override + public Object clone() throws CloneNotSupportedException { + throw new CloneNotSupportedException(); + } + + public synchronized Phylogeny[] create( final Object source, final Object parser, final List parameters ) + throws IOException { + if ( !( parser instanceof PhylogenyParser ) ) { + throw new IllegalArgumentException( "attempt to use object of type other than PhylogenyParser as creator for ParserBasedPhylogenyFactory" ); + } + final PhylogenyParser my_parser = ( PhylogenyParser ) parser; + my_parser.setSource( source ); + return my_parser.parse(); + } + + public synchronized Phylogeny[] create( final Object source, + final Object parser, + final String schema_location, + final List parameters ) throws IOException { + if ( !( parser instanceof PhylogenyParser ) ) { + throw new IllegalArgumentException( "attempt to use object of type other than PhylogenyParser as creator for ParserBasedPhylogenyFactory." ); + } + if ( !( parser instanceof PhyloXmlParser ) ) { + throw new IllegalArgumentException( "attempt to use schema location with other than phyloXML parser" ); + } + final PhyloXmlParser xml_parser = ( PhyloXmlParser ) parser; + if ( !ForesterUtil.isEmpty( schema_location ) ) { + xml_parser.setValidateAgainstSchema( schema_location ); + } + xml_parser.setSource( source ); + return xml_parser.parse(); + } + + public static PhylogenyFactory getInstance() { + return _instance; + } +} diff --git a/forester/java/src/org/forester/phylogeny/factories/PhylogenyFactory.java b/forester/java/src/org/forester/phylogeny/factories/PhylogenyFactory.java new file mode 100644 index 0000000..ce90f03 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/factories/PhylogenyFactory.java @@ -0,0 +1,77 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.factories; + +import java.io.IOException; +import java.util.List; + +import org.forester.phylogeny.Phylogeny; + +/* + * Interface for Phylogeny factories. + * + * @author Christian M. Zmasek + */ +public interface PhylogenyFactory { + + /** + * This must be implemented in such a way that it returns an empty + * Phylogeny. + * + * @return an empty Phylogeny + */ + public Phylogeny create(); + + /** + * This must create a Phylogeny from source (e.g. an XML file, an alignment, + * pairwise distances) by using creator (e.g. an XML file parser, an + * algorithm implementation). 
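// ---------------------------------------------------------------------------
// Illustrative sketch of reading phylogenies through ParserBasedPhylogenyFactory
// above. Hedged example: it assumes PhyloXmlParser offers a default constructor
// and accepts a java.io.File as source; "trees.xml" is an invented placeholder
// path.
import java.io.File;

import org.forester.io.parsers.phyloxml.PhyloXmlParser;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
import org.forester.phylogeny.factories.PhylogenyFactory;

public class FactoryExample {

    public static void main( final String[] args ) throws Exception {
        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
        final Phylogeny[] phylogenies = factory.create( new File( "trees.xml" ), new PhyloXmlParser() );
        System.out.println( phylogenies.length + " phylogenies read" );
    }
}
// ---------------------------------------------------------------------------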
+ * + * @param source + * a source to create a Phylogeny from + * @param creator + * a means to create a Phylogeny + * @return a Phylogeny[] based on argument source + * @throws IOException + */ + public Phylogeny[] create( Object source, Object creator ) throws IOException; + + /** + * This must create a Phylogeny from source (e.g. an XML file, an alignment, + * pairwise distances) by using creator (e.g. an XML file parser, an + * algorithm implementation) with parameters listed in parameters. + * + * @param source + * a source to create a Phylogeny from + * @param creator + * a means to create a Phylogeny + * @param parameters + * a List of parameters for Phylogeny creation + * @return a Phylogeny[] based on argument source + * @throws IOException + */ + public Phylogeny[] create( Object source, Object creator, List parameters ) throws IOException; +} // PhylogenyFactory diff --git a/forester/java/src/org/forester/phylogeny/iterators/ChildNodeIteratorForward.java b/forester/java/src/org/forester/phylogeny/iterators/ChildNodeIteratorForward.java new file mode 100644 index 0000000..98bbf59 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/iterators/ChildNodeIteratorForward.java @@ -0,0 +1,141 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.iterators; + +import java.util.NoSuchElementException; + +import org.forester.phylogeny.PhylogenyNode; + +/* + * An iterator to forward iterate over child nodes of a PhylogenyNode. Created: + * 10/23/2005 by Christian M. Zmasek. Last modified: 12/28/2006 by Christian M. + * Zmasek. + * + * @author Christian M. Zmasek + * + * @version 1.000 + */ +public class ChildNodeIteratorForward implements PhylogenyNodeIterator { + + // Instance variables + // ------------------ + private int _i; + final private PhylogenyNode _node; + + // Constructor + // ----------- + /** + * Creates a new ChildNodeIteratorForward. + * + * @param node + * the parent of the PhylogenyNodes to iterate over. + * @throws IllegalArgumentException + * if node has no child nodes + */ + public ChildNodeIteratorForward( final PhylogenyNode node ) throws IllegalArgumentException { + if ( node.getNumberOfDescendants() < 1 ) { + throw new IllegalArgumentException( "Attempt to use ChildNodeIteratorForward on node with no child nodes." ); + } + _node = node; + reset(); + } + + // Private methods + // --------------- + /** + * Returns the counter. 
+ */ + private int getI() { + return _i; + } + + /** + * Returns the parent of the nodes to iterate over. + * + * @return the parent of the nodes to iterate over. + */ + private PhylogenyNode getNode() { + return _node; + } + + // Public methods + // -------------- + /** + * Returns true is this iterator has at least one more element, false + * otherwise. + * + * @return true is this iterator has at least one more element, false + * otherwise + */ + public boolean hasNext() { + return ( getI() < getNode().getNumberOfDescendants() ); + } + + /** + * Increases the counter by one. + */ + private void increaseI() { + ++_i; + } + + /** + * Returns the next PhylogenyNode. + * + * @return the next PhylogenyNode + * @throws NoSuchElementException + * if iteration is complete + */ + public PhylogenyNode next() throws NoSuchElementException { + if ( !hasNext() ) { + throw new NoSuchElementException( "Attempt to call \"next()\" on iterator which has no more next elements." ); + } + final PhylogenyNode n = getNode().getChildNode( getI() ); + increaseI(); + return n; + } + + /** + * Not supported. + * + */ + public void remove() { + throw new UnsupportedOperationException(); + } + + /** + * Resets the iterator. + */ + public void reset() { + setI( 0 ); + } + + /** + * Sets the counter. + */ + private void setI( final int i ) { + _i = i; + } +} // end of class ChildNodeIteratorForward. diff --git a/forester/java/src/org/forester/phylogeny/iterators/ExternalForwardIterator.java b/forester/java/src/org/forester/phylogeny/iterators/ExternalForwardIterator.java new file mode 100644 index 0000000..f75e6f7 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/iterators/ExternalForwardIterator.java @@ -0,0 +1,119 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.iterators; + +import java.util.NoSuchElementException; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; + +/* + * @author Christian Zmasek + */ +public class ExternalForwardIterator implements PhylogenyNodeIterator { + + private PhylogenyNode _current_node; + private final PhylogenyNode _last_ext_node; + private final PhylogenyNode _first_ext_node; + + /** + * Constructor for ExternalForwardIterator. + * + * @param tree + * the tree on which to iterate over all external nodes. 
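// ---------------------------------------------------------------------------
// Illustrative sketch of ChildNodeIteratorForward above. Hedged example: it
// assumes NHXParser accepts a java.io.File as source; "tree.nh" is an invented
// placeholder path. The constructor throws if the node has no children.
import java.io.File;

import org.forester.io.parsers.nhx.NHXParser;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
import org.forester.phylogeny.iterators.ChildNodeIteratorForward;

public class ChildIteratorExample {

    public static void main( final String[] args ) throws Exception {
        final Phylogeny phy = ParserBasedPhylogenyFactory.getInstance()
                .create( new File( "tree.nh" ), new NHXParser() )[ 0 ];
        final PhylogenyNode root = phy.getRoot();
        for( final ChildNodeIteratorForward it = new ChildNodeIteratorForward( root ); it.hasNext(); ) {
            System.out.println( it.next().getName() );
        }
    }
}
// ---------------------------------------------------------------------------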
+ */ + public ExternalForwardIterator( final Phylogeny phylogeny ) throws IllegalArgumentException { + if ( phylogeny.isEmpty() ) { + throw new IllegalArgumentException( "Attempt to use ExternalForwardIterator on an empty phylogeny." ); + } + PhylogenyNode n = phylogeny.getRoot(); + while ( !n.isExternal() ) { + n = n.getLastChildNode(); + } + _last_ext_node = n; + _first_ext_node = phylogeny.getFirstExternalNode(); + reset(); + } + + private PhylogenyNode getCurrentNode() { + return _current_node; + } + + private PhylogenyNode getFirstExtNode() { + return _first_ext_node; + } + + private PhylogenyNode getLastExtNode() { + return _last_ext_node; + } + + /* + * (non-Javadoc) + * + * @see java.util.Iterator#hasNext() + */ + public boolean hasNext() { + return getCurrentNode() != null; + } + + /* + * (non-Javadoc) + * + * @see java.util.Iterator#next() + */ + public PhylogenyNode next() throws NoSuchElementException { + if ( !hasNext() ) { + throw new NoSuchElementException( "Attempt to call \"next()\" on iterator which has no more next elements." ); + } + final PhylogenyNode n = getCurrentNode(); + if ( n == getLastExtNode() ) { + setCurrentNode( null ); + } + else { + setCurrentNode( n.getNextExternalNode() ); + } + return n; + } + + /** + * Not supported. + * + */ + public void remove() { + throw new UnsupportedOperationException(); + } + + /** + * DOCUMENT ME! + */ + public void reset() { + setCurrentNode( getFirstExtNode() ); + } + + private void setCurrentNode( final PhylogenyNode current_node ) { + _current_node = current_node; + } +} // end of class ExternalForwardIterator diff --git a/forester/java/src/org/forester/phylogeny/iterators/LevelOrderTreeIterator.java b/forester/java/src/org/forester/phylogeny/iterators/LevelOrderTreeIterator.java new file mode 100644 index 0000000..1b1450f --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/iterators/LevelOrderTreeIterator.java @@ -0,0 +1,147 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.iterators; + +import java.util.NoSuchElementException; + +import org.forester.datastructures.Queue; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; + +/* + * An iterator to iterate a Phylogeny in level order. + * + * Created: 10/23/2005 by Christian M. Zmasek. Last modified: 10/23/2005 by + * Christian M. Zmasek. + * + * @author Christian M. 
Zmasek + * + * @version 1.000 + */ +public class LevelOrderTreeIterator implements PhylogenyNodeIterator { + + // Instance variables + // ------------------ + private final Queue _queue; + private final PhylogenyNode _root; + + // Constructors + // ------------ + /** + * Creates a new LevelOrderTreeIterator for iterating over all the nodes of + * Phylogeny phylogeny + * + * @param phylogeny + * the Phylogeny to iterate over + * @throws IllegalArgumentException + * if phylogeny is empty + */ + public LevelOrderTreeIterator( final Phylogeny phylogeny ) throws IllegalArgumentException { + this( phylogeny.getRoot() ); + if ( phylogeny.isEmpty() ) { + throw new IllegalArgumentException( "Attempt to use LevelOrderTreeIterator on an empty phylogeny." ); + } + } + + /** + * Creates a new LevelOrderTreeIterator for iterating over all the child + * nodes of PhylogenyNode node (including node itself). + * + * @param node + * the parent of the nodes to iterate over + */ + public LevelOrderTreeIterator( final PhylogenyNode node ) { + _queue = new Queue(); + _root = node; + reset(); + } + + // Private methods + // --------------- + /** + * Returns the queue upon which this iterator is based. + * + */ + private Queue getQueue() { + return _queue; + } + + /** + * Returns the root of the phylogeny this iterators parses over. + * + * @return the root of the phylogeny this iterators parses over. + */ + private PhylogenyNode getRoot() { + return _root; + } + + // Public methods + // -------------- + /** + * Returns true is this iterator has at least one more element, false + * otherwise. + * + * @return true is this iterator has at least one more element, false + * otherwise + */ + public boolean hasNext() { + return !getQueue().isEmpty(); + } + + /** + * Returns the next PhylogenyNode. + * + * @return the next PhylogenyNode + * @throws NoSuchElementException + * if iteration is complete + */ + public PhylogenyNode next() throws NoSuchElementException { + if ( !hasNext() ) { + throw new NoSuchElementException( "Attempt to call \"next()\" on iterator which has no more next elements." ); + } + final PhylogenyNode node = ( PhylogenyNode ) getQueue().dequeue(); + for( int i = 0; i < node.getNumberOfDescendants(); ++i ) { + getQueue().enqueue( node.getChildNode( i ) ); + } + return node; + } + + /** + * Not supported. + * + */ + public void remove() { + throw new UnsupportedOperationException(); + } + + /** + * Resets the iterator. + */ + public void reset() { + getQueue().clear(); + getQueue().enqueue( getRoot() ); + } +} // enod of class LevelOrderTreeIterator diff --git a/forester/java/src/org/forester/phylogeny/iterators/PhylogenyNodeIterator.java b/forester/java/src/org/forester/phylogeny/iterators/PhylogenyNodeIterator.java new file mode 100644 index 0000000..587ab51 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/iterators/PhylogenyNodeIterator.java @@ -0,0 +1,46 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. 
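// ---------------------------------------------------------------------------
// Illustrative sketch of the two traversal iterators above. Hedged example: it
// assumes NHXParser accepts a java.io.File as source; "tree.nh" is an invented
// placeholder path.
import java.io.File;

import org.forester.io.parsers.nhx.NHXParser;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
import org.forester.phylogeny.iterators.ExternalForwardIterator;
import org.forester.phylogeny.iterators.LevelOrderTreeIterator;

public class TraversalExample {

    public static void main( final String[] args ) throws Exception {
        final Phylogeny phy = ParserBasedPhylogenyFactory.getInstance()
                .create( new File( "tree.nh" ), new NHXParser() )[ 0 ];
        // Breadth-first over every node, starting at the root:
        for( final LevelOrderTreeIterator it = new LevelOrderTreeIterator( phy ); it.hasNext(); ) {
            System.out.println( it.next().getName() );
        }
        // External (tip) nodes only, in forward order:
        for( final ExternalForwardIterator it = new ExternalForwardIterator( phy ); it.hasNext(); ) {
            System.out.println( it.next().getName() );
        }
    }
}
// ---------------------------------------------------------------------------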
+// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.iterators; + +import java.util.Iterator; +import java.util.NoSuchElementException; + +import org.forester.phylogeny.PhylogenyNode; + +/* + * @author Christian Zmasek + * + * TODO To change the template for this generated type comment go to Window - + * Preferences - Java - Code Style - Code Templates + */ +public interface PhylogenyNodeIterator extends Iterator { + + public boolean hasNext(); + + public PhylogenyNode next() throws NoSuchElementException; + + public void reset(); +} diff --git a/forester/java/src/org/forester/phylogeny/iterators/PostOrderStackObject.java b/forester/java/src/org/forester/phylogeny/iterators/PostOrderStackObject.java new file mode 100644 index 0000000..9f06e50 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/iterators/PostOrderStackObject.java @@ -0,0 +1,70 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.iterators; + +import org.forester.phylogeny.PhylogenyNode; + +/* + * @author Christian M. Zmasek + * + * @version 1.00 -- last modified: 06/15/00 + */ +public class PostOrderStackObject { + + final private PhylogenyNode _node; + final private int _phase; + + /** + * Creates a new PostOrderStackObject object. + * + * @param n + * DOCUMENT ME! + * @param i + * DOCUMENT ME! + */ + public PostOrderStackObject( final PhylogenyNode n, final int i ) { + _node = n; + _phase = i; + } + + /** + * DOCUMENT ME! + * + * @return DOCUMENT ME! + */ + public PhylogenyNode getNode() { + return _node; + } + + /** + * DOCUMENT ME! + * + * @return DOCUMENT ME! 
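+ *         (left undocumented in the original; judging from its use in
+ *         PostorderTreeIterator, this appears to be the 1-based index of the
+ *         next child to descend into, with a value greater than the number of
+ *         descendants meaning the node itself is ready to be returned)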
+ */ + public int getPhase() { + return _phase; + } +} diff --git a/forester/java/src/org/forester/phylogeny/iterators/PostorderTreeIterator.java b/forester/java/src/org/forester/phylogeny/iterators/PostorderTreeIterator.java new file mode 100644 index 0000000..1405476 --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/iterators/PostorderTreeIterator.java @@ -0,0 +1,128 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.iterators; + +import java.util.NoSuchElementException; +import java.util.Stack; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; + +/* + * * + */ +public class PostorderTreeIterator implements PhylogenyNodeIterator { + + final private Phylogeny _tree; + final private PhylogenyNode _root; + private boolean _has_next; + final private Stack _stack; + + /** + * @param t + * Phylogeny for which a Iterator is to be constructed. + */ + public PostorderTreeIterator( final Phylogeny tree ) throws IllegalArgumentException { + if ( tree.isEmpty() ) { + throw new IllegalArgumentException( "Attempt to use PostorderTreeIterator on an empty phylogeny." ); + } + _tree = tree; + _root = getTree().getRoot(); + _stack = new Stack(); + reset(); + } + + private PhylogenyNode getRoot() { + return _root; + } + + private Stack getStack() { + return _stack; + } + + private Phylogeny getTree() { + return _tree; + } + + /** + * DOCUMENT ME! + * + * @return DOCUMENT ME! + */ + public boolean hasNext() { + return _has_next; + } + + /** + * Advances the Iterator by one. + */ + public PhylogenyNode next() throws NoSuchElementException { + if ( !hasNext() ) { + throw new NoSuchElementException( "Attempt to call \"next()\" on iterator which has no more next elements." ); + } + while ( true ) { + final PostOrderStackObject si = getStack().pop(); + final PhylogenyNode node = si.getNode(); + final int phase = si.getPhase(); + // if ( node != null ) { + if ( phase > node.getNumberOfDescendants() ) { + setHasNext( node != getRoot() ); + return node; + } + else { + getStack().push( new PostOrderStackObject( node, ( phase + 1 ) ) ); + if ( node.isInternal() ) { + getStack().push( new PostOrderStackObject( node.getChildNode( phase - 1 ), 1 ) ); + } + // else { + // getStack().push( new PostOrderStackObject( null, 1 ) ); + // } + } + // } + } + } + + /** + * Not supported. + * + */ + public void remove() { + throw new UnsupportedOperationException(); + } + + /** + * DOCUMENT ME! 
+ */ + public void reset() { + setHasNext( true ); + getStack().clear(); + getStack().push( new PostOrderStackObject( getTree().getRoot(), 1 ) ); + } + + private void setHasNext( final boolean has_next ) { + _has_next = has_next; + } +} // End of class PostorderTreeIterator. diff --git a/forester/java/src/org/forester/phylogeny/iterators/PreorderTreeIterator.java b/forester/java/src/org/forester/phylogeny/iterators/PreorderTreeIterator.java new file mode 100644 index 0000000..18928db --- /dev/null +++ b/forester/java/src/org/forester/phylogeny/iterators/PreorderTreeIterator.java @@ -0,0 +1,115 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.phylogeny.iterators; + +import java.util.NoSuchElementException; +import java.util.Stack; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; + +// import java.util.Iterator; TODO should implement this, not some iterator of +// this package. +/* + * @author Christian M. Zmasek + * + * @version 1.020 -- last modified: 10/10/05 + */ +public class PreorderTreeIterator implements PhylogenyNodeIterator { + + final private Phylogeny _tree; + final private Stack _stack; + + /** + * @param tree + * Phylogeny for which a Iterator is to be constructed. + */ + public PreorderTreeIterator( final Phylogeny tree ) throws IllegalArgumentException { + if ( tree.isEmpty() ) { + throw new IllegalArgumentException( "Attempt to use PreorderTreeIterator on empty tree." ); + } + _stack = new Stack(); + _tree = tree; + reset(); + } + + public PreorderTreeIterator( final PhylogenyNode node ) throws IllegalArgumentException { + _stack = new Stack(); + _tree = null; + reset( node ); + } + + private Stack getStack() { + return _stack; + } + + private Phylogeny getTree() { + return _tree; + } + + /* + * (non-Javadoc) + * + * @see java.util.Iterator#hasNext() + */ + public boolean hasNext() { + return !getStack().isEmpty(); + } + + /** + * Advances the Iterator by one. + */ + public PhylogenyNode next() throws NoSuchElementException { + if ( !hasNext() ) { + throw new NoSuchElementException( "Attempt to call \"next()\" on iterator which has no more next elements." ); + } + final PhylogenyNode node = getStack().pop(); + if ( !node.isExternal() ) { + for( int i = node.getNumberOfDescendants() - 1; i >= 0; --i ) { + getStack().push( node.getChildNode( i ) ); + } + } + return node; + } // next() + + /** + * Not supported. 
+ * + */ + public void remove() { + throw new UnsupportedOperationException(); + } + + public void reset() { + getStack().clear(); + getStack().push( getTree().getRoot() ); + } + + private void reset( final PhylogenyNode node ) { + getStack().clear(); + getStack().push( node ); + } +} // End of class PreorderTreeIterator. diff --git a/forester/java/src/org/forester/sdi/DistanceCalculator.java b/forester/java/src/org/forester/sdi/DistanceCalculator.java new file mode 100644 index 0000000..51b4201 --- /dev/null +++ b/forester/java/src/org/forester/sdi/DistanceCalculator.java @@ -0,0 +1,500 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.sdi; + +import java.io.File; +import java.util.ArrayList; +import java.util.ListIterator; +import java.util.Vector; + +import org.forester.io.parsers.PhylogenyParser; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.util.ForesterUtil; + +/* + * @author Christian M. Zmasek + * + * @version 1.001 -- last modified: 12/04/00 + */ +public class DistanceCalculator { + + public final static double DEFAULT = -1.0; + private Phylogeny tree_; + private ArrayList nodes_; + private int n_; + private double mean_, variance_, stand_dev_; + private PhylogenyNode lca_; // The LCA of the + + // Nodes in nodes_ + /** + * Default constructor. (Last modified: 11/30/00) + */ + public DistanceCalculator() { + tree_ = null; + nodes_ = null; + n_ = 0; + mean_ = DistanceCalculator.DEFAULT; + variance_ = DistanceCalculator.DEFAULT; + stand_dev_ = DistanceCalculator.DEFAULT; + lca_ = null; + } + + /** + * Constructor. Sets the rooted Phylogeny t for which the mean distance to + * the root and its variance and standard deviation are calculated. (Last + * modified: 12/01/00) + * + * @param t + * the rooted Phylogeny for which the mean distance to the root + * and its variance and standard deviation are calculated + */ + public DistanceCalculator( final Phylogeny t ) { + setTree( t ); + } + + /** + * Constructor. Sets the rooted Phylogeny t and the external Nodes ext_nodes + * for which the mean distance to their lowest common ancestor and its + * variance and standard deviation are calculated. 
(Last modified: 12/01/00) + * + * @param t + * the rooted Phylogeny containing Nodes in Vector ext_nodes + * @param ext_nodes + * a Vector of Nodes of t, the mean distance to their lowest + * common ancestor and its variance and standard deviation are + * calculated + */ + public DistanceCalculator( final Phylogeny t, final Vector ext_nodes ) { + setTreeAndExtNodes( t, ext_nodes ); + } + + // (Last modified: 12/01/00) + private PhylogenyNode calculateLCA( final ArrayList nodes ) { + if ( ( nodes == null ) || nodes.isEmpty() ) { + return null; + } + PhylogenyNode node = nodes.get( 0 ); + int c = node.getNumberOfExternalNodes(); + final int v = nodes.size(); + while ( !node.isRoot() && ( c < v ) ) { + node = node.getParent(); + c = node.getNumberOfExternalNodes(); + } + ArrayList current_nodes = new ArrayList( node.getAllExternalDescendants() ); + while ( !node.isRoot() && !current_nodes.containsAll( nodes ) ) { + node = node.getParent(); + current_nodes = new ArrayList( node.getAllExternalDescendants() ); + } + return node; + } + + // (Last modified: 11/31/00) + private void calculateMean() { + if ( ( nodes_ == null ) || nodes_.isEmpty() || ( tree_ == null ) || tree_.isEmpty() ) { + return; + } + double sum = 0.0; + final ListIterator li = nodes_.listIterator(); + n_ = 0; + try { + while ( li.hasNext() ) { + n_++; + sum += getDistanceToNode( li.next(), lca_ ); + } + } + catch ( final Exception e ) { + System.err.println( "calculateMean(): " + "Exception: " + e ); + System.exit( -1 ); + } + setMean( sum / n_ ); + } + + // (Last modified: 11/30/00) + private void calculateMeanDistToRoot() { + if ( ( tree_ == null ) || tree_.isEmpty() ) { + return; + } + double sum = 0.0; + PhylogenyNode node = tree_.getFirstExternalNode(); + n_ = 0; + while ( node != null ) { + n_++; + sum += getDistanceToRoot( node ); + node = node.getNextExternalNode(); + } + setMean( sum / n_ ); + } + + // (Last modified: 11/31/00) + private void calculateStandardDeviation() { + if ( ( getVariance() == DistanceCalculator.DEFAULT ) || ( getVariance() < 0.0 ) ) { + return; + } + setStandardDeviation( java.lang.Math.sqrt( getVariance() ) ); + } + + // (Last modified: 11/31/00) + private void calculateVariance() { + if ( ( getMean() == DistanceCalculator.DEFAULT ) || ( nodes_ == null ) || nodes_.isEmpty() || ( tree_ == null ) + || tree_.isEmpty() || ( n_ <= 1.0 ) ) { + return; + } + double x = 0.0, sum = 0.0; + final ListIterator li = nodes_.listIterator(); + try { + while ( li.hasNext() ) { + x = getDistanceToNode( li.next(), lca_ ) - getMean(); + sum += ( x * x ); + } + } + catch ( final Exception e ) { + System.err.println( "calculateVariance(): " + "Exception: " + e ); + System.exit( -1 ); + } + setVariance( sum / ( n_ - 1 ) ); + } + + // (Last modified: 11/31/00) + private void calculateVarianceDistToRoot() { + if ( ( getMean() == DistanceCalculator.DEFAULT ) || ( tree_ == null ) || tree_.isEmpty() || ( n_ <= 1.0 ) ) { + return; + } + double x = 0.0, sum = 0.0; + PhylogenyNode node = tree_.getFirstExternalNode(); + while ( node != null ) { + x = getDistanceToRoot( node ) - getMean(); + sum += ( x * x ); + node = node.getNextExternalNode(); + } + setVariance( sum / ( n_ - 1 ) ); + } + + /** + * Calculates the distance of the PhylogenyNode with seq name seq_name to + * the LCA of ext_nodes, which has been set either with constructor + * DistanceCalculator(Phylogeny,Vector) or method + * setTreeAndExtNodes(Phylogeny,Vector). 
Throws an exception if no + * PhylogenyNode with seq name_seq name is found or if seq_name is not + * unique. (Last modified: 12/03/00) + * + * @param seq_name + * the seq name for the PhylogenyNode for which the distance to + * the LCA is to be calculated + * @return distance of PhylogenyNode with seq name seq_name to the LCA of + * Nodes in ext_nodes + * @see #DistanceCalculator(Phylogeny,Vector) + * @see #setTreeAndExtNodes(Phylogeny,Vector) + * @see #setTreeAndExtNodes(Phylogeny,ArrayList) + */ + public double getDistanceToLCA( final String seq_name ) { + if ( ( tree_ == null ) || tree_.isEmpty() || ( lca_ == null ) ) { + return 0.0; + } + return getDistanceToNode( seq_name, lca_ ); + } + + /** + * Calculates the distance of PhylogenyNode outer to PhylogenyNode inner. + * PhylogenyNode inner must be closer to the root than PhylogenyNode outer + * and on the same "path". (Last modified: 12/01/00) + * + * @param outer + * a PhylogenyNode + * @param inner + * a PhylogenyNode closer to the root than outer + * @return distance of PhylogenyNode outer to PhylogenyNode inner + */ + public double getDistanceToNode( PhylogenyNode outer, final PhylogenyNode inner ) { + double d = 0.0, dist = 0.0; + while ( ( inner != outer ) && !outer.isRoot() ) { + d = outer.getDistanceToParent(); + if ( d > 0.0 ) { + dist += d; + } + outer = outer.getParent(); + } + if ( !inner.isRoot() && outer.isRoot() ) { + throw new IllegalArgumentException( "getDistanceToNode(PhylogenyNode outer,PhylogenyNode inner): " + + "PhylogenyNode inner is not closer to the root than PhylogenyNode outer " + + "or is not on the same \"subtree\"" ); + } + return dist; + } + + /** + * Calculates the distance of the PhylogenyNode with seq name seq_name to + * PhylogenyNode inner. PhylogenyNode inner must be closer to the root than + * the PhylogenyNode with seq name seq_name and on the same "path". Throws + * an exception if no PhylogenyNode with seq name_seq name is found or if + * seq_name is not unique. (Last modified: 12/01/00) + * + * @param seq_name + * the seq name of a PhylogenyNode further from the root than + * PhylogenyNode inner + * @param inner + * a PhylogenyNode + * @return distance of PhylogenyNode with seq name seq_nam to PhylogenyNode + * inner + */ + public double getDistanceToNode( final String seq_name, final PhylogenyNode inner ) { + if ( ( tree_ == null ) || tree_.isEmpty() ) { + return 0.0; + } + return getDistanceToNode( tree_.getNodeViaSequenceName( seq_name ), inner ); + } + + /** + * Calculates the distance of PhylogenyNode n to the root of Phylogeny t + * which has been set either with a constructor, setTree(Phylogeny), or + * setTreeAndExtNodes(Phylogeny,Vector). 
(Last modified: 12/01/00) + * + * @param n + * the PhylogenyNode for which the distance to the root is to be + * calculated + * @return distance of PhylogenyNode n to the root + * @see #DistanceCalculator(Phylogeny) + * @see #DistanceCalculator(Phylogeny,Vector) + * @see #setTree(Phylogeny) + * @see #setTreeAndExtNodes(Phylogeny,Vector) + */ + public double getDistanceToRoot( final PhylogenyNode n ) { + if ( ( tree_ == null ) || tree_.isEmpty() ) { + return 0.0; + } + double d = 0.0; + try { + d = getDistanceToNode( n, tree_.getRoot() ); + } + catch ( final Exception e ) { + System.err.println( "getDistanceToRoot(PhylogenyNode): Unexpected " + "exception: " + e ); + System.exit( -1 ); + } + return d; + } + + /** + * Calculates the distance of the PhylogenyNode with seq name seq_name to + * the root of Phylogeny t, which has been set either with a constructor, + * setTree(Phylogeny), or setTreeAndExtNodes(Phylogeny,Vector). Throws an + * exception if no PhylogenyNode with seq name_seq name is found or if + * seq_name is not unique. (Last modified: 12/01/00) + * + * @param seq_name + * the seq name for the PhylogenyNode for which the distance to + * the root is to be calculated + * @return distance of PhylogenyNode with seq name seq_name to the root + * @see #DistanceCalculator(Phylogeny) + * @see #DistanceCalculator(Phylogeny,Vector) + * @see #setTree(Phylogeny) + * @see #setTreeAndExtNodes(Phylogeny,Vector) + * @see #setTreeAndExtNodes(Phylogeny,ArrayList) + */ + public double getDistanceToRoot( final String seq_name ) { + if ( ( tree_ == null ) || tree_.isEmpty() ) { + return 0.0; + } + return getDistanceToNode( seq_name, tree_.getRoot() ); + } + + /** + * Returns the mean distance. If constructor DistanceCalculator(Phylogeny) + * or method setTree(Phylogeny) have been used, it is the mean of the + * distances from the root to all external Nodes. If constructor + * DistanceCalculator(Phylogeny,Vector) or method + * setTreeAndExtNodes(Phylogeny,Vector) have been used, it is the mean of + * the distances from the external nodes ext_nodes to their lowest common + * ancestor. (Last modified: 11/30/00) + * + * @return mean distance + * @see #DistanceCalculator(Phylogeny) + * @see #DistanceCalculator(Phylogeny,Vector) + * @see #setTree(Phylogeny) + * @see #setTreeAndExtNodes(Phylogeny,Vector) + * @see #setTreeAndExtNodes(Phylogeny,ArrayList) + */ + public double getMean() { + return mean_; + } + + /** + * Returns the sum of all Nodes used to calculate the mean. (Last modified: + * 12/01/00) + * + * @return n + */ + public int getN() { + return n_; + } + + /** + * Returns the standard deviation. If constructor + * DistanceCalculator(Phylogeny) or method setTree(Phylogeny) have been + * used, it is the standard deviation of the distances from the root to all + * external Nodes. If constructor DistanceCalculator(Phylogeny,Vector) or + * method setTreeAndExtNodes(Phylogeny,Vector) have been used, it is the + * standard deviation of the distances from the external nodes ext_nodes to + * their lowest common ancestor. (Last modified: 11/30/00) + * + * @return standard deviation + * @see #DistanceCalculator(Phylogeny) + * @see #DistanceCalculator(Phylogeny,Vector) + * @see #setTree(Phylogeny) + * @see #setTreeAndExtNodes(Phylogeny,Vector) + * @see #setTreeAndExtNodes(Phylogeny,ArrayList) + */ + public double getStandardDeviation() { + return stand_dev_; + } + + /** + * Returns the variance. 
( 1/(N - 1) * Sum((x-mean)^2) ) If constructor + * DistanceCalculator(Phylogeny) or method setTree(Phylogeny) have been + * used, it is the variance of the distances from the root to all external + * Nodes. If constructor DistanceCalculator(Phylogeny,Vector) or method + * setTreeAndExtNodes(Phylogeny,Vector) have been used, it is the variance + * of the distances from the external nodes ext_nodes to their lowest common + * ancestor. (Last modified: 11/30/00) + * + * @return variance + * @see #DistanceCalculator(Phylogeny) + * @see #DistanceCalculator(Phylogeny,Vector) + * @see #setTree(Phylogeny) + * @see #setTreeAndExtNodes(Phylogeny,Vector) + * @see #setTreeAndExtNodes(Phylogeny,ArrayList) + */ + public double getVariance() { + return variance_; + } + + // (Last modified: 11/30/00) + private void setMean( final double d ) { + mean_ = d; + } + + // (Last modified: 11/30/00) + private void setStandardDeviation( final double d ) { + stand_dev_ = d; + } + + /** + * Sets the rooted Phylogeny t for which the mean distance to the root and + * its variance and standard deviation are calculated. (Last modified: + * 12/01/00) + * + * @param t + * the rooted Phylogeny for which the mean distance to the root + * and its variance and standard deviation are calculated + */ + public void setTree( final Phylogeny t ) { + tree_ = t; + nodes_ = null; + n_ = 0; + mean_ = DistanceCalculator.DEFAULT; + variance_ = DistanceCalculator.DEFAULT; + stand_dev_ = DistanceCalculator.DEFAULT; + lca_ = null; + calculateMeanDistToRoot(); + calculateVarianceDistToRoot(); + calculateStandardDeviation(); + } + + /** + * Sets the rooted Phylogeny t and the external Nodes ext_nodes for which + * the mean distance to their lowest common ancestor and its variance and + * standard deviation are calculated. (Last modified: 12/03/00) + * + * @param t + * the rooted Phylogeny containing Nodes in Vector ext_nodes + * @param ext_nodes + * a ArrayList of Nodes of t, the mean distance to their lowest + * common ancestor and its variance and standard deviation are + * calculated + */ + public void setTreeAndExtNodes( final Phylogeny t, final ArrayList ext_nodes ) { + tree_ = t; + nodes_ = ext_nodes; + n_ = 0; + mean_ = DistanceCalculator.DEFAULT; + variance_ = DistanceCalculator.DEFAULT; + stand_dev_ = DistanceCalculator.DEFAULT; + lca_ = calculateLCA( nodes_ ); + calculateMean(); + calculateVariance(); + calculateStandardDeviation(); + } + + /** + * Sets the rooted Phylogeny t and the external Nodes ext_nodes for which + * the mean distance to their lowest common ancestor and its variance and + * standard deviation are calculated. (Last modified: 12/03/00) + * + * @param t + * the rooted Phylogeny containing Nodes in Vector ext_nodes + * @param ext_nodes + * a Vector of Nodes of t, the mean distance to their lowest + * common ancestor and its variance and standard deviation are + * calculated + */ + public void setTreeAndExtNodes( final Phylogeny t, final Vector ext_nodes ) { + setTreeAndExtNodes( t, new ArrayList( ext_nodes ) ); + } + + // (Last modified: 11/30/00) + private void setVariance( final double d ) { + variance_ = d; + } + + // Main for testing. 
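+ // Expected invocation, inferred from the code below (the original gives no usage text):
+ //   java org.forester.sdi.DistanceCalculator <tree file>
+ // Reads the first phylogeny from the given file and prints n, the mean, variance,
+ // and standard deviation of the external-node-to-root distances, plus the run time in ms.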
+ public static void main( final String args[] ) { + File tree_file = null; + Phylogeny tree = null; + DistanceCalculator dc = null; + tree_file = new File( args[ 0 ] ); + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final PhylogenyParser pp = ForesterUtil.createParserDependingOnFileType( tree_file, true ); + tree = factory.create( tree_file, pp )[ 0 ]; + } + catch ( final Exception e ) { + System.out.println( e.toString() ); + System.exit( -1 ); + } + double time = System.currentTimeMillis(); + dc = new DistanceCalculator( tree ); + final double m = dc.getMean(), var = dc.getVariance(), sd = dc.getStandardDeviation(); + time = ( System.currentTimeMillis() - time ); + System.out.println( "\nn = " + dc.getN() ); + System.out.println( "mea = " + m ); + System.out.println( "var = " + var ); + System.out.println( "sd = " + sd + "\n" ); + System.out.println( "t=" + time + "\n" ); + } +} diff --git a/forester/java/src/org/forester/sdi/GSDI.java b/forester/java/src/org/forester/sdi/GSDI.java new file mode 100644 index 0000000..b6641bb --- /dev/null +++ b/forester/java/src/org/forester/sdi/GSDI.java @@ -0,0 +1,389 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.sdi; + +import java.util.HashMap; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Event; +import org.forester.phylogeny.data.Taxonomy; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.ForesterUtil; + +/* + * Implements our algorithm for speciation - duplication inference (SDI).

    + * The initialization is accomplished by:

    • method + * "linkExtNodesOfG()" of class SDI: setting the links for the external nodes of + * the gene tree
    • "preorderReID(int)" from class Phylogeny: numbering of + * nodes of the species tree in preorder
    • the optional stripping of the + * species tree is accomplished by method "stripTree(Phylogeny,Phylogeny)" of + * class Phylogeny

    The recursion part is accomplished by this class' + * method "geneTreePostOrderTraversal(PhylogenyNode)".

    Requires JDK 1.5 or + * greater. + * + * @see SDI#linkNodesOfG() + * + * @see Phylogeny#preorderReID(int) + * + * @see + * PhylogenyMethods#taxonomyBasedDeletionOfExternalNodes(Phylogeny,Phylogeny) + * + * @see #geneTreePostOrderTraversal(PhylogenyNode) + * + * @author Christian M. Zmasek + */ +public class GSDI extends SDI { + + private final HashMap _transversal_counts; + private final boolean _most_parsimonious_duplication_model; + private int _speciation_or_duplication_events_sum; + private int _speciations_sum; + + /** + * Constructor which sets the gene tree and the species tree to be compared. + * species_tree is the species tree to which the gene tree gene_tree will be + * compared to - with method "infer(boolean)". Both Trees must be completely + * binary and rooted. The actual inference is accomplished with method + * "infer(boolean)". The mapping cost L can then be calculated with method + * "computeMappingCost()". + *

    + * + * @see #infer(boolean) + * @see SDI#computeMappingCostL() + * @param gene_tree + * reference to a rooted gene tree to which assign duplication vs + * speciation, must have species names in the species name fields + * for all external nodes + * @param species_tree + * reference to a rooted binary species tree which might get + * stripped in the process, must have species names in the + * species name fields for all external nodes + * + * @param most_parsimonious_duplication_model + * set to true to assign nodes as speciations which would + * otherwise be assiged as unknown because of polytomies in the + * species tree. + * + */ + public GSDI( final Phylogeny gene_tree, + final Phylogeny species_tree, + final boolean most_parsimonious_duplication_model ) { + super( gene_tree, species_tree ); + _speciation_or_duplication_events_sum = 0; + _speciations_sum = 0; + _most_parsimonious_duplication_model = most_parsimonious_duplication_model; + _transversal_counts = new HashMap(); + _duplications_sum = 0; + getSpeciesTree().preOrderReId(); + linkNodesOfG(); + geneTreePostOrderTraversal( getGeneTree().getRoot() ); + } + + private Event createDuplicationEvent() { + final Event event = Event.createSingleDuplicationEvent(); + ++_duplications_sum; + return event; + } + + private Event createSingleSpeciationOrDuplicationEvent() { + final Event event = Event.createSingleSpeciationOrDuplicationEvent(); + ++_speciation_or_duplication_events_sum; + return event; + } + + private Event createSpeciationEvent() { + final Event event = Event.createSingleSpeciationEvent(); + ++_speciations_sum; + return event; + } + + // s is the node on the species tree g maps to. + private void determineEvent( final PhylogenyNode s, final PhylogenyNode g ) { + Event event = null; + // Determine how many children map to same node as parent. + int sum_g_childs_mapping_to_s = 0; + for( final PhylogenyNodeIterator iter = g.iterateChildNodesForward(); iter.hasNext(); ) { + if ( iter.next().getLink() == s ) { + ++sum_g_childs_mapping_to_s; + } + } + // Determine the sum of traversals. + int traversals_sum = 0; + int max_traversals = 0; + PhylogenyNode max_traversals_node = null; + if ( !s.isExternal() ) { + for( final PhylogenyNodeIterator iter = s.iterateChildNodesForward(); iter.hasNext(); ) { + final PhylogenyNode current_node = iter.next(); + final int traversals = getTraversalCount( current_node ); + traversals_sum += traversals; + if ( traversals > max_traversals ) { + max_traversals = traversals; + max_traversals_node = current_node; + } + } + } + // System.out.println( " sum=" + traversals_sum ); + // System.out.println( " max=" + max_traversals ); + // System.out.println( " m=" + sum_g_childs_mapping_to_s ); + if ( sum_g_childs_mapping_to_s > 0 ) { + if ( traversals_sum == 2 ) { + event = createDuplicationEvent(); + } + else if ( traversals_sum > 2 ) { + if ( max_traversals <= 1 ) { + if ( _most_parsimonious_duplication_model ) { + event = createSpeciationEvent(); + } + else { + event = createSingleSpeciationOrDuplicationEvent(); + } + } + else { + event = createDuplicationEvent(); + _transversal_counts.put( max_traversals_node, 1 ); + } + } + else { + event = createDuplicationEvent(); + } + } + else { + event = createSpeciationEvent(); + } + g.getNodeData().setEvent( event ); + } + + /** + * Traverses the subtree of PhylogenyNode g in postorder, calculating the + * mapping function M, and determines which nodes represent speciation + * events and which ones duplication events. + *

    + * Preconditions: Mapping M for external nodes must have been calculated and + * the species tree must be labeled in preorder. + *

    + * (Last modified: ) + * + * @param g + * starting node of a gene tree - normally the root + */ + void geneTreePostOrderTraversal( final PhylogenyNode g ) { + if ( !g.isExternal() ) { + for( final PhylogenyNodeIterator iter = g.iterateChildNodesForward(); iter.hasNext(); ) { + geneTreePostOrderTraversal( iter.next() ); + } + final PhylogenyNode[] linked_nodes = new PhylogenyNode[ g.getNumberOfDescendants() ]; + for( int i = 0; i < linked_nodes.length; ++i ) { + linked_nodes[ i ] = g.getChildNode( i ).getLink(); + } + final int[] min_max = obtainMinMaxIdIndices( linked_nodes ); + int min_i = min_max[ 0 ]; + int max_i = min_max[ 1 ]; + // initTransversalCounts(); + while ( linked_nodes[ min_i ] != linked_nodes[ max_i ] ) { + increaseTraversalCount( linked_nodes[ max_i ] ); + linked_nodes[ max_i ] = linked_nodes[ max_i ].getParent(); + final int[] min_max_ = obtainMinMaxIdIndices( linked_nodes ); + min_i = min_max_[ 0 ]; + max_i = min_max_[ 1 ]; + } + final PhylogenyNode s = linked_nodes[ max_i ]; + g.setLink( s ); + // Determines whether dup. or spec. + determineEvent( s, g ); + // _transversal_counts.clear(); + } + } + + public int getSpeciationOrDuplicationEventsSum() { + return _speciation_or_duplication_events_sum; + } + + public int getSpeciationsSum() { + return _speciations_sum; + } + + private int getTraversalCount( final PhylogenyNode node ) { + if ( _transversal_counts.containsKey( node ) ) { + return _transversal_counts.get( node ); + } + return 0; + } + + private void increaseTraversalCount( final PhylogenyNode node ) { + if ( _transversal_counts.containsKey( node ) ) { + _transversal_counts.put( node, _transversal_counts.get( node ) + 1 ); + } + else { + _transversal_counts.put( node, 1 ); + } + // System.out.println( "count for node " + node.getID() + " is now " + // + getTraversalCount( node ) ); + } + + /** + * This allows for linking of internal nodes of the species tree (as opposed + * to just external nodes, as in the method it overrides. + * + */ + @Override + void linkNodesOfG() { + final HashMap speciestree_ext_nodes = new HashMap(); + for( final PhylogenyNodeIterator iter = _species_tree.iteratorLevelOrder(); iter.hasNext(); ) { + final PhylogenyNode n = iter.next(); + if ( n.getNodeData().isHasTaxonomy() ) { + if ( speciestree_ext_nodes.containsKey( n.getNodeData().getTaxonomy() ) ) { + throw new IllegalArgumentException( "taxonomy [" + n.getNodeData().getTaxonomy() + + "] is not unique in species phylogeny" ); + } + speciestree_ext_nodes.put( n.getNodeData().getTaxonomy(), n ); + } + } + // Retrieve the reference to the PhylogenyNode with a matching species + // name. + for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode g = iter.next(); + if ( !g.getNodeData().isHasTaxonomy() ) { + throw new IllegalArgumentException( "gene tree node " + g + " has no taxonomic data" ); + } + final PhylogenyNode s = speciestree_ext_nodes.get( g.getNodeData().getTaxonomy() ); + if ( s == null ) { + throw new IllegalArgumentException( "species " + g.getNodeData().getTaxonomy() + + " not present in species tree." 
); + } + g.setLink( s ); + } + } + + @Override + public String toString() { + final StringBuffer sb = new StringBuffer(); + sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model ); + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "Speciations sum : " + getSpeciationsSum() ); + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "Duplications sum : " + getDuplicationsSum() ); + sb.append( ForesterUtil.getLineSeparator() ); + if ( !_most_parsimonious_duplication_model ) { + sb.append( "Speciation or duplications sum : " + getSpeciationOrDuplicationEventsSum() ); + sb.append( ForesterUtil.getLineSeparator() ); + } + sb.append( "mapping cost L : " + computeMappingCostL() ); + return sb.toString(); + } + + static int[] obtainMinMaxIdIndices( final PhylogenyNode[] linked_nodes ) { + int max_i = 0; + int min_i = 0; + int max_i_id = -Integer.MAX_VALUE; + int min_i_id = Integer.MAX_VALUE; + for( int i = 0; i < linked_nodes.length; ++i ) { + final int id_i = linked_nodes[ i ].getId(); + if ( id_i > max_i_id ) { + max_i = i; + max_i_id = linked_nodes[ max_i ].getId(); + } + if ( id_i < min_i_id ) { + min_i = i; + min_i_id = linked_nodes[ min_i ].getId(); + } + } + return new int[] { min_i, max_i }; + } + /** + * Updates the mapping function M after the root of the gene tree has been + * moved by one branch. It calculates M for the root of the gene tree and + * one of its two children. + *

    + * To be used ONLY by method "SDIunrooted.fastInfer(Phylogeny,Phylogeny)". + *

    + * (Last modfied: ) + * + * @param prev_root_was_dup + * true if the previous root was a duplication, false otherwise + * @param prev_root_c1 + * child 1 of the previous root + * @param prev_root_c2 + * child 2 of the previous root + * @return number of duplications which have been assigned in gene tree + */ + // int updateM( final boolean prev_root_was_dup, + // final PhylogenyNode prev_root_c1, final PhylogenyNode prev_root_c2 ) { + // final PhylogenyNode root = getGeneTree().getRoot(); + // if ( ( root.getChildNode1() == prev_root_c1 ) + // || ( root.getChildNode2() == prev_root_c1 ) ) { + // calculateMforNode( prev_root_c1 ); + // } + // else { + // calculateMforNode( prev_root_c2 ); + // } + // Event event = null; + // if ( prev_root_was_dup ) { + // event = Event.createSingleDuplicationEvent(); + // } + // else { + // event = Event.createSingleSpeciationEvent(); + // } + // root.getPhylogenyNodeData().setEvent( event ); + // calculateMforNode( root ); + // return getDuplications(); + // } // updateM( boolean, PhylogenyNode, PhylogenyNode ) + // Helper method for updateM( boolean, PhylogenyNode, PhylogenyNode ) + // Calculates M for PhylogenyNode n, given that M for the two children + // of n has been calculated. + // (Last modified: 10/02/01) + // private void calculateMforNode( final PhylogenyNode n ) { + // if ( !n.isExternal() ) { + // boolean was_duplication = n.isDuplication(); + // PhylogenyNode a = n.getChildNode1().getLink(), b = n + // .getChildNode2().getLink(); + // while ( a != b ) { + // if ( a.getID() > b.getID() ) { + // a = a.getParent(); + // } + // else { + // b = b.getParent(); + // } + // } + // n.setLink( a ); + // Event event = null; + // if ( ( a == n.getChildNode1().getLink() ) + // || ( a == n.getChildNode2().getLink() ) ) { + // event = Event.createSingleDuplicationEvent(); + // if ( !was_duplication ) { + // increaseDuplications(); + // } + // } + // else { + // event = Event.createSingleSpeciationEvent(); + // if ( was_duplication ) { + // decreaseDuplications(); + // } + // } + // n.getPhylogenyNodeData().setEvent( event ); + // } + // } // calculateMforNode( PhylogenyNode ) +} // End of class GSDI. diff --git a/forester/java/src/org/forester/sdi/ORcount.java b/forester/java/src/org/forester/sdi/ORcount.java new file mode 100644 index 0000000..4271890 --- /dev/null +++ b/forester/java/src/org/forester/sdi/ORcount.java @@ -0,0 +1,382 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.sdi; + +import java.io.File; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +import org.forester.io.parsers.PhylogenyParser; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.ForesterUtil; + +/* + * Allows to

    + * + * @see SDIse + * + * @see SDI + * + * @author Christian M. Zmasek + * + * @version 1.400 -- last modified: 10/29/2005 + */ +public class ORcount { + + private static final String[] group_1 = { "ANOGA", "DROME", "CAEBR", "CAEEL" }; + private static final String[] group_2 = { "CIOIN", "FUGRU", "MOUSE", "RAT", + "HUMAN" }; + private static final String[] all_species = { "ANOGA", "DROME", "CAEBR", "CAEEL", + "CIOIN", "FUGRU", "MOUSE", "RAT", "HUMAN" }; + private final Phylogeny[] _trees; + private HashMap> _species = null; + private ArrayList _names = null; + private int _group1_vs_2_counter = 0; + + /** + * Default contructor which + */ + public ORcount( final Phylogeny[] trees ) { + _trees = trees; + } // ORcount( final Phylogeny[] trees ) + + private void count( final PhylogenyNode node ) { + final List external_nodes = node.getAllExternalDescendants(); + for( int i = 1; i < external_nodes.size(); ++i ) { + for( int j = 0; j < i; ++j ) { + final PhylogenyNode node_i = external_nodes.get( i ); + final PhylogenyNode node_j = external_nodes.get( j ); + final String si = PhylogenyMethods.getSpecies( node_i ); + final String sj = PhylogenyMethods.getSpecies( node_j ); + count( si, sj, node_i.getName(), node_j.getName() ); + } + } + } // count( PhylogenyNode ) + + private void count( final String a, final String b, final String seq_name_a, final String seq_name_b ) { + HashMap h1 = _species.get( a ); + if ( h1 == null ) { + throw new RuntimeException( "Unexpected error: Species \"" + a + "\" not present in species matrix." ); + } + Object h2 = h1.get( b ); + String species_in_h1 = b; + // We only look at the half matrix, and we do not know/care about the + // order + // of the keys (species). + if ( h2 == null ) { + h1 = _species.get( b ); + if ( h1 == null ) { + throw new RuntimeException( "Unexpected error: Species \"" + b + "\" not present in species matrix." ); + } + h2 = h1.get( a ); + species_in_h1 = a; + } + if ( h2 == null ) { + throw new RuntimeException( "Unexpected error: Species \"" + a + "\" not present in species matrix." ); + } + h1.put( species_in_h1, new Integer( ( ( Integer ) h2 ).intValue() + 1 ) ); + _names.add( a + "-" + seq_name_a + " = " + b + "-" + seq_name_b ); + } // count( String, String ) + + public void countSharedAncestralClades( final Phylogeny tree, + final int bootstrap_threshold, + final String[] group_1, + final String[] group_2 ) { + if ( ( group_1 == null ) || ( group_2 == null ) ) { + throw new IllegalArgumentException( "String[](s) in arguments to method \"ORcount.countSharedAncestralClades\" is (are) null." ); + } + if ( !tree.isRooted() ) { + throw new IllegalArgumentException( "Phylogeny must be rooted in order to count shared ancestral clades." ); + } + final PhylogenyNodeIterator it = tree.iteratorPostorder(); + tree.setIndicatorsToZero(); + while ( it.hasNext() ) { + final PhylogenyNode current_node = it.next(); + if ( current_node.getNumberOfDescendants() != 2 ) { + throw new IllegalArgumentException( "Phylogeny can not contain multifurcations in order to count shared ancestral clades." 
); + } + if ( !current_node.isExternal() ) { + final PhylogenyNode child1 = current_node.getChildNode1(); + final PhylogenyNode child2 = current_node.getChildNode2(); + if ( ( child1.getIndicator() == 1 ) || ( child2.getIndicator() == 1 ) ) { + current_node.setIndicator( ( byte ) 1 ); + } + else { + final List external_nodes = current_node.getAllExternalDescendants(); + final String[] external_species = new String[ external_nodes.size() ]; + for( int i = 0; i < external_nodes.size(); ++i ) { + final PhylogenyNode n = external_nodes.get( i ); + external_species[ i ] = PhylogenyMethods.getSpecies( n ).trim().toUpperCase(); + } + if ( ForesterUtil.isIntersecting( external_species, group_1 ) + && ForesterUtil.isIntersecting( external_species, group_2 ) ) { + current_node.setIndicator( ( byte ) 1 ); + if ( ( group_1.length == 1 ) && ( group_2.length == 1 ) ) { + count( group_1[ 0 ], group_2[ 0 ], "name a", "name b" ); + } + else { + increaseGroup1Vs2Counter(); + } + } + } + } + } // while + } // countSharedAncestralClades( Phylogeny, int ) + + public void countSharedAncestralClades( final Phylogeny[] trees, final int bootstrap_threshold ) { + for( int i = 1; i < ORcount.all_species.length; ++i ) { + for( int j = 0; j < i; ++j ) { + final String all_i = ORcount.all_species[ i ].trim().toUpperCase(); + final String all_j = ORcount.all_species[ j ].trim().toUpperCase(); + final String[] a = { all_i }; + final String[] b = { all_j }; + for( int k = 0; k < trees.length; ++k ) { + countSharedAncestralClades( trees[ k ], bootstrap_threshold, a, b ); + } + } + } + // print(); + if ( ( ORcount.group_1 != null ) && ( ORcount.group_2 != null ) && ( ORcount.group_1.length > 0 ) + && ( ORcount.group_2.length > 0 ) ) { + setGroup1Vs2Counter( 0 ); + for( int k = 0; k < trees.length; ++k ) { + countSharedAncestralClades( trees[ k ], bootstrap_threshold, ORcount.group_1, ORcount.group_2 ); + } + System.out.println( "\nCount [(" + ForesterUtil.stringArrayToString( ORcount.group_1 ) + ") vs (" + + ForesterUtil.stringArrayToString( ORcount.group_2 ) + ")] = " + getGroup1Vs2Counter() ); + } + } + + public void countSuperOrthologousRelations( final int bootstrap_threshold ) { + reset(); + for( int i = 0; i < _trees.length; ++i ) { + countSuperOrthologousRelations( _trees[ i ], bootstrap_threshold ); + } + } + + private void countSuperOrthologousRelations( final Phylogeny tree, final int bootstrap_threshold ) { + final PhylogenyNodeIterator it = tree.iteratorPostorder(); + if ( !tree.isRooted() ) { + throw new IllegalArgumentException( "Phylogeny must be rooted in order to count 1:1 orthologous relationships." ); + } + // The purpose of this is to find all substrees + // which contain only speciation events on all their nodes. + // All nodes in these subtrees are "painted" with 0's, wheres + // the rest od the nodes in painted with 1's. + tree.setIndicatorsToZero(); + it.reset(); + while ( it.hasNext() ) { + final PhylogenyNode current_node = it.next(); + if ( current_node.getNumberOfDescendants() != 2 ) { + throw new IllegalArgumentException( "Phylogeny can not contain multifurcations in order to count 1:1 orthologous relationships." ); + } + if ( !current_node.isExternal() && !current_node.isHasAssignedEvent() ) { + throw new IllegalArgumentException( "All nodes must have duplication or speciation assigned in order to count 1:1 orthologous relationships." 
); + } + if ( !current_node.isExternal() + && ( current_node.isDuplication() || ( current_node.getChildNode1().getIndicator() == 1 ) || ( current_node + .getChildNode2().getIndicator() == 1 ) ) ) { + current_node.setIndicator( ( byte ) 1 ); + } + } + // These find the largest subtrees containing only speciations + // and uses their largest nodes to count all possible species + // combinations + // in their extant external nodes. + // ~~~ this could possibly be combined with the first iteration ~~ + // <<<<<<<<<<<~~~~~~~~~~~~~~~<<<<<<<<<<<<<<< + it.reset(); + while ( it.hasNext() ) { + final PhylogenyNode current_node = it.next(); + if ( !current_node.isExternal() + && ( current_node.getIndicator() == 0 ) + && ( current_node.isRoot() || ( current_node.getParent().getIndicator() == 1 ) ) + && ( ( bootstrap_threshold < 1 ) || ( ( PhylogenyMethods.getConfidenceValue( current_node ) >= bootstrap_threshold ) + && ( PhylogenyMethods.getConfidenceValue( current_node.getChildNode1() ) >= bootstrap_threshold ) && ( PhylogenyMethods + .getConfidenceValue( current_node.getChildNode2() ) >= bootstrap_threshold ) ) ) ) { + count( current_node ); + } + } + } // countOneToOneOrthologs( Phylogeny, int ) + + // This puts all the species found in Phylogeny array _trees into + // species HashMap. + private void getAllSpecies() { + if ( ( getTrees() == null ) || ( getTrees().length < 1 ) ) { + throw new RuntimeException( "Phylogeny array in method \"getAllSpecies( HashMap hash )\" is null or empty." ); + } + setSpecies( new HashMap>() ); + for( int i = 0; i < getTrees().length; ++i ) { + PhylogenyNode node = getTrees()[ i ].getFirstExternalNode(); + while ( node != null ) { + getSpecies().put( PhylogenyMethods.getSpecies( node ), null ); + node = node.getNextExternalNode(); + } + } + } // void getAllSpecies( HashMap hash ) + + private int getGroup1Vs2Counter() { + return _group1_vs_2_counter; + } + + private HashMap> getSpecies() { + return _species; + } + + private Phylogeny[] getTrees() { + return _trees; + } + + private void increaseGroup1Vs2Counter() { + _group1_vs_2_counter++; + } + + private void printCount() { + if ( ( _species == null ) || ( _species.size() < 2 ) ) { + throw new RuntimeException( "Species HashMap in method \"setUpCountingMatrix()\" is null or contains less than two species." ); + } + final Object[] species_array = _species.keySet().toArray(); + final int s = species_array.length; + for( int i = 0; i < s - 1; ++i ) { + final String species = ( String ) species_array[ i ]; + System.out.println(); + System.out.println( species + ":" ); + final HashMap h = _species.get( species ); + // Setting up HashMaps linked to by hash (=_species) + // Diagonals are ignored, only half the matrix is needed. 
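+ // Prints the count for each species pair from the upper triangle ( j > i ) only,
+ // mirroring how setUpCountingMatrix() fills the half matrix.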
+ for( int j = 1 + i; j < s; ++j ) { + final String sp = ( String ) species_array[ j ]; + final int c = ( ( Integer ) h.get( sp ) ).intValue(); + System.out.println( species + "-" + sp + ": " + c ); + } + } + } + + private void printNames() { + for( int i = 0; i < _names.size(); ++i ) { + System.out.println( i + ": " + _names.get( i ) ); + } + } + + public void reset() { + getAllSpecies(); + setUpCountingMatrix(); + setGroup1Vs2Counter( 0 ); + _names = new ArrayList(); + } + + private void setGroup1Vs2Counter( final int group1_vs_2_counter ) { + _group1_vs_2_counter = group1_vs_2_counter; + } + + private void setSpecies( final HashMap> species ) { + _species = species; + } + + private void setUpCountingMatrix() { + if ( ( getSpecies() == null ) || ( getSpecies().size() < 2 ) ) { + throw new RuntimeException( "Species HashMap in method \"setUpCountingMatrix()\" is null or contains less than two species." ); + } + final Object[] species_array = getSpecies().keySet().toArray(); + final int s = species_array.length; + for( int i = 0; i < s; ++i ) { + final String species = ( String ) species_array[ i ]; + final HashMap h = new HashMap(); + // Setting up HashMaps linked to by hash (=_species) + // Diagonals are ignored, only half the matrix is needed. + for( int j = 1 + i; j < s; ++j ) { + h.put( species_array[ j ], new Integer( 0 ) ); + } + getSpecies().put( species, h ); + } + } + + private static void errorInCommandLine() { + System.out.println( "\nORcount: Error in command line.\n" ); + System.out.println( "Usage: \"\"" ); + System.out.println( "\nOptions:" ); + System.out.println( " -" ); + System.out.println( "" ); + System.exit( -1 ); + } // errorInCommandLine() + + /** + * Main method for this class. + *

    + * (Last modified: 11/26/03) + * + * @param args[1or2] + * gene tree file name (in NHX format with species names in + * species name fields and sequence names in sequence name + * fields; unless -n option is used) + */ + public static void main( final String args[] ) { + if ( args.length == 0 ) { + ORcount.errorInCommandLine(); + } + final Phylogeny[] trees = new Phylogeny[ args.length ]; + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + for( int i = 0; i < trees.length; ++i ) { + try { + System.out.println( "Reading tree #" + i + " [" + args[ i ] + "]" ); + final PhylogenyParser pp = ForesterUtil.createParserDependingOnFileType( new File( args[ i ] ), true ); + trees[ i ] = factory.create( new File( args[ i ] ), pp )[ 0 ]; + } + catch ( final Exception e ) { + System.out.println( "\nFailed to read \"" + args[ i ] + "\". Terminating.\n" ); + System.exit( -1 ); + } + } + System.out.println( "Finished reading in trees.\n\n" ); + final ORcount or_count = new ORcount( trees ); + try { + System.out.println( "\n\n\n\"1:1 ORTHOLOGOUS GENE PAIRS\":\n" ); + System.out.println( "\n\n\n\"SUPER ORTHOLOGOUS GENE PAIRS\":\n" ); + or_count.countSuperOrthologousRelations( 0 ); + or_count.printNames(); + or_count.printCount(); + // System.out.println( "\n\n\n\"SHARED ANCESTRAL CLADES\":\n"); + // or_count.reset(); + // or_count.countSharedAncestralClades( trees, 0 ); + } + catch ( final Exception e ) { + System.out.println( "\nException. Terminating.\n" ); + System.out.println( "\nException is: " + e + "\n" ); + e.printStackTrace(); + System.exit( -1 ); + } + System.out.println( "\nDone." ); + System.exit( 0 ); + } // main ( String ) +} // End of class ORcount. diff --git a/forester/java/src/org/forester/sdi/RIO.java b/forester/java/src/org/forester/sdi/RIO.java new file mode 100644 index 0000000..d2c79ee --- /dev/null +++ b/forester/java/src/org/forester/sdi/RIO.java @@ -0,0 +1,1126 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.sdi; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; + +import org.forester.evoinference.matrix.distance.DistanceMatrix; +import org.forester.io.parsers.SymmetricalDistanceMatrixParser; +import org.forester.io.parsers.phyloxml.PhyloXmlParser; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.ForesterUtil; + +/* + * @author Christian M. Zmasek + */ +public final class RIO { + + private final static boolean ROOT_BY_MINIMIZING_MAPPING_COST = false; + private final static boolean ROOT_BY_MINIMIZING_SUM_OF_DUPS = true; + private final static boolean ROOT_BY_MINIMIZING_TREE_HEIGHT = true; + private final static boolean TIME = false; + private HashMap> _o_hash_maps; + private HashMap> _so_hash_maps; + private HashMap> _up_hash_maps; + private HashMap> _sn_hash_maps; // HashMap of HashMaps + private DistanceMatrix _m; + private HashMap _l; + private String[] _seq_names; + private int _bootstraps; + private int _ext_nodes_; + private long _time; + + /** + * Default constructor. + */ + public RIO() { + reset(); + } + + /** + * Returns the numbers of trees analyzed. + * + * @return the numbers of trees analyzed + */ + public final int getBootstraps() { + return _bootstraps; + } + + // Helper method for inferredOrthologsToString. + // inferredOrthologsToArrayList, + // and inferredUltraParalogsToString. + private final double getBootstrapValueFromHash( final HashMap h, final String name ) { + if ( !h.containsKey( name ) ) { + return 0.0; + } + final int i = h.get( name ); + return ( i * 100.0 / getBootstraps() ); + } + + /** + * Returns the distance to a sequences/taxa after a distance list file has + * been read in with readDistanceList(File). Throws an exception if name is + * not found or if no list has been read in. + * + * @param name + * a sequence name + */ + public final double getDistance( String name ) { + double distance = 0.0; + name = name.trim(); + if ( _l == null ) { + throw new RuntimeException( "Distance list has probably not been read in (successfully)." ); + } + if ( _l.get( name ) == null ) { + throw new IllegalArgumentException( name + " not found." ); + } + distance = ( _l.get( name ) ).doubleValue(); + return distance; + } + + public final double getDistance( final String name1, final String name2 ) { + try { + return _m.getValue( _m.getIndex( name1 ), _m.getIndex( name2 ) ); + } + catch ( final Exception e ) { + return 1; + } + } + + /** + * Returns the numbers of number of ext nodes in gene trees analyzed (after + * stripping). + * + * @return number of ext nodes in gene trees analyzed (after stripping) + */ + public final int getExtNodesOfAnalyzedGeneTrees() { + return _ext_nodes_; + } + + /** + * Returns a HashMap containing the inferred orthologs of the external gene + * tree node with the sequence name seq_name. Sequence names are the keys + * (String), numbers of observations are the values (Int). Orthologs are to + * be inferred by method "inferOrthologs". Throws an exception if seq_name + * is not found. 
+ * + * @param seq_name + * sequence name of a external node of the gene trees + * @return HashMap containing the inferred orthologs + * (name(String)->value(Int)) + */ + public final HashMap getInferredOrthologs( final String seq_name ) { + if ( _o_hash_maps == null ) { + return null; + } + return _o_hash_maps.get( seq_name ); + } + + private final HashMap getInferredSubtreeNeighbors( final String seq_name ) { + if ( _sn_hash_maps == null ) { + return null; + } + return _sn_hash_maps.get( seq_name ); + } + + /** + * Returns a HashMap containing the inferred "super orthologs" of the + * external gene tree node with the sequence name seq_name. Sequence names + * are the keys (String), numbers of observations are the values (Int). + * Super orthologs are to be inferred by method "inferOrthologs". Throws an + * exception if seq_name is not found. + * + * @param seq_name + * sequence name of a external node of the gene trees + * @return HashMap containing the inferred super orthologs + * (name(String)->value(Int)) + */ + public final HashMap getInferredSuperOrthologs( final String seq_name ) { + if ( _so_hash_maps == null ) { + return null; + } + return _so_hash_maps.get( seq_name ); + } + + /** + * Returns a HashMap containing the inferred "ultra paralogs" of the + * external gene tree node with the sequence name seq_name. Sequence names + * are the keys (String), numbers of observations are the values (Int). + * "ultra paralogs" are to be inferred by method "inferOrthologs". Throws an + * exception if seq_name is not found. + * + * @param seq_name + * sequence name of a external node of the gene trees + * @return HashMap containing the inferred ultra paralogs + * (name(String)->value(Int)) + */ + public final HashMap getInferredUltraParalogs( final String seq_name ) { + if ( _up_hash_maps == null ) { + return null; + } + return _up_hash_maps.get( seq_name ); + } + + /** + * Returns the time (in ms) needed to run "inferOrthologs". Final variable + * TIME needs to be set to true. + * + * @return time (in ms) needed to run method "inferOrthologs" + */ + public long getTime() { + return _time; + } + + /** + * Infers the orthologs (as well the "super orthologs", the "subtree + * neighbors", and the "ultra paralogs") for each external node of the gene + * Trees in multiple tree File gene_trees_file (=output of PHYLIP NEIGHBOR, + * for example). Tallies how many times each sequence is (super-) + * orthologous towards the query. Tallies how many times each sequence is + * ultra paralogous towards the query. Tallies how many times each sequence + * is a subtree neighbor of the query. Gene duplications are inferred using + * SDI. Modifies its argument species_tree. Is a little faster than + * "inferOrthologs(File,Phylogeny)" since orthologs are only inferred for + * query. + *

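A minimal, self-contained usage sketch for the method described above (illustrative only: the input file names and the query name "QUERY_SEQ" are placeholders, and the species tree is assumed to be available as phyloXML):

    import java.io.File;

    import org.forester.io.parsers.phyloxml.PhyloXmlParser;
    import org.forester.phylogeny.Phylogeny;
    import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
    import org.forester.phylogeny.factories.PhylogenyFactory;
    import org.forester.sdi.RIO;

    public final class RioUsageSketch {

        public static void main( final String[] args ) throws Exception {
            final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
            // Placeholder inputs: a phyloXML species tree and a file holding
            // multiple (e.g. bootstrapped) gene trees.
            final Phylogeny species_tree = factory.create( new File( "species.xml" ),
                                                           new PhyloXmlParser() )[ 0 ];
            final RIO rio = new RIO();
            rio.inferOrthologs( new File( "gene_trees.xml" ), species_tree, "QUERY_SEQ" );
            // Maps each sequence name to the number of gene trees in which it was
            // inferred to be orthologous to the query.
            System.out.println( rio.getInferredOrthologs( "QUERY_SEQ" ) );
        }
    }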
    + * To obtain the results use the methods listed below. + * + * @param gene_trees_file + * a File containing gene Trees in NH format, which is the result + * of performing a bootstrap analysis in PHYLIP + * @param species_tree + * a species Phylogeny, which has species names in its species + * fields + * @param query + * the sequence name of the squence whose orthologs are to be + * inferred + */ + public void inferOrthologs( final File gene_trees_file, final Phylogeny species_tree, final String query ) + throws IOException { + int bs = 0; + if ( RIO.TIME ) { + _time = System.currentTimeMillis(); + } + if ( !gene_trees_file.exists() ) { + throw new IllegalArgumentException( gene_trees_file.getAbsolutePath() + " does not exist." ); + } + else if ( !gene_trees_file.isFile() ) { + throw new IllegalArgumentException( gene_trees_file.getAbsolutePath() + " is not a file." ); + } + // Read in first tree to get its sequence names + // and strip species_tree. + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny gene_tree = factory.create( gene_trees_file, new PhyloXmlParser() )[ 0 ]; + // Removes from species_tree all species not found in gene_tree. + PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gene_tree, species_tree ); + // Removes from gene_tree all species not found in species_tree. + PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gene_tree ); + _seq_names = getAllExternalSequenceNames( gene_tree ); + if ( ( _seq_names == null ) || ( _seq_names.length < 1 ) ) { + return; + } + _o_hash_maps = new HashMap>(); + _so_hash_maps = new HashMap>(); + _up_hash_maps = new HashMap>(); + _sn_hash_maps = new HashMap>(); + _o_hash_maps.put( query, new HashMap( _seq_names.length ) ); + _so_hash_maps.put( query, new HashMap( _seq_names.length ) ); + _up_hash_maps.put( query, new HashMap( _seq_names.length ) ); + _sn_hash_maps.put( query, new HashMap( _seq_names.length ) ); + // Go through all gene trees in the file. + final Phylogeny[] gene_trees = factory.create( gene_trees_file, new PhyloXmlParser() ); + for( final Phylogeny gt : gene_trees ) { + bs++; + // Removes from gene_tree all species not found in species_tree. + PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt ); + inferOrthologsHelper( gt, species_tree, query ); + // System.out.println( bs ); + } + setBootstraps( bs ); + if ( RIO.TIME ) { + _time = ( System.currentTimeMillis() - _time ); + } + } + + // Helper method which performs the actual ortholog inference for + // the external node with seqname query. 
+ private void inferOrthologsHelper( final Phylogeny gene_tree, final Phylogeny species_tree, final String query ) { + Phylogeny assigned_tree = null; + List nodes = null; + final SDIR sdiunrooted = new SDIR(); + List orthologs = null; + List super_orthologs = null; + List ultra_paralogs = null; + List subtree_neighbors = null; + assigned_tree = sdiunrooted.infer( gene_tree, + species_tree, + RIO.ROOT_BY_MINIMIZING_MAPPING_COST, + RIO.ROOT_BY_MINIMIZING_SUM_OF_DUPS, + RIO.ROOT_BY_MINIMIZING_TREE_HEIGHT, + true, + 1 )[ 0 ]; + setExtNodesOfAnalyzedGeneTrees( assigned_tree.getNumberOfExternalNodes() ); + nodes = assigned_tree.getNodesViaSequenceName( query ); + if ( nodes.size() > 1 ) { + throw new IllegalArgumentException( "node named [" + query + "] not unique" ); + } + else if ( nodes.isEmpty() ) { + throw new IllegalArgumentException( "no node containing a sequence named [" + query + "] found" ); + } + final PhylogenyNode query_node = nodes.get( 0 ); + final PhylogenyMethods methods = PhylogenyMethods.getInstance(); + orthologs = methods.getOrthologousNodes( assigned_tree, query_node ); + updateHash( _o_hash_maps, query, orthologs ); + super_orthologs = PhylogenyMethods.getSuperOrthologousNodes( query_node ); + updateHash( _so_hash_maps, query, super_orthologs ); + subtree_neighbors = getSubtreeNeighbors( query_node, 2 ); + updateHash( _sn_hash_maps, query, subtree_neighbors ); + ultra_paralogs = PhylogenyMethods.getUltraParalogousNodes( query_node ); + updateHash( _up_hash_maps, query, ultra_paralogs ); + } + + /** + * Returns an ArrayList containg the names of orthologs of the PhylogenyNode + * with seq name seq_name. + * + * @param seq_name + * sequence name of a external node of the gene trees + * @param threshold_orthologs + * the minimal number of observations for a a sequence to be + * reported as orthologous as percentage (0.0-100.0%) + * @return ArrayList containg the names of orthologs of the PhylogenyNode + * with seq name seq_name + */ + public ArrayList inferredOrthologsToArrayList( final String seq_name, double threshold_orthologs ) { + HashMap o_hashmap = null; + String name = null; + double o = 0.0; + final ArrayList arraylist = new ArrayList(); + if ( _o_hash_maps == null ) { + throw new RuntimeException( "Orthologs have not been calculated (successfully)." ); + } + if ( threshold_orthologs < 0.0 ) { + threshold_orthologs = 0.0; + } + else if ( threshold_orthologs > 100.0 ) { + threshold_orthologs = 100.0; + } + o_hashmap = getInferredOrthologs( seq_name ); + if ( o_hashmap == null ) { + throw new RuntimeException( "Orthologs for " + seq_name + " were not established." ); + } + if ( _seq_names.length > 0 ) { + I: for( int i = 0; i < _seq_names.length; ++i ) { + name = _seq_names[ i ]; + if ( name.equals( seq_name ) ) { + continue I; + } + o = getBootstrapValueFromHash( o_hashmap, name ); + if ( o < threshold_orthologs ) { + continue I; + } + arraylist.add( name ); + } + } + return arraylist; + } + + /** + * Returns a String containg the names of orthologs of the PhylogenyNode + * with seq name query_name. The String also contains how many times a + * particular ortholog has been observed. + *

    + *

      + * The output order is (per line): Name, Ortholog, Subtree neighbor, Super + * ortholog, Distance + *
    + *

    + * The sort priority is determined by sort in the following manner (an example call is shown after the list): + *

      + *
    • 0 : Ortholog + *
    • 1 : Ortholog, Super ortholog + *
    • 2 : Super ortholog, Ortholog + *
    • 3 : Ortholog, Distance + *
    • 4 : Distance, Ortholog + *
    • 5 : Ortholog, Super ortholog, Distance + *
    • 6 : Ortholog, Distance, Super ortholog + *
    • 7 : Super ortholog, Ortholog, Distance + *
    • 8 : Super ortholog, Distance, Ortholog + *
    • 9 : Distance, Ortholog, Super ortholog + *
    • 10 : Distance, Super ortholog, Ortholog + *
    • 11 : Ortholog, Subtree neighbor, Distance + *
    • 12 : Ortholog, Subtree neighbor, Super ortholog, Distance (default) + *
    • 13 : Ortholog, Super ortholog, Subtree neighbor, Distance + *
    • 14 : Subtree neighbor, Ortholog, Super ortholog, Distance + *
    • 15 : Subtree neighbor, Distance, Ortholog, Super ortholog + *
    • 16 : Ortholog, Distance, Subtree neighbor, Super ortholog + *
    • 17 : Ortholog, Subtree neighbor, Distance, Super ortholog + *
    + *

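Continuing the sketch from the inferOrthologs(File, Phylogeny, String) javadoc above, a call using the default ordering might look as follows (the matrix file name and the 50% thresholds are arbitrary placeholders; any sort mode above 2 includes distances, so a distance matrix or list must be read in first):

    rio.readDistanceMatrix( new File( "distances.pwd" ) );
    final StringBuffer report = rio.inferredOrthologsToString( "QUERY_SEQ", 12, 50.0, 50.0 );
    System.out.println( report );
    // Rows are tab separated, under the header:
    // [seq name]  [ortho]  [st-n]  [sup-o]  [dist]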
    + * Returns "-" if no putative orthologs have been found (given + * threshold_orthologs). + *

    + * Orthologs are to be inferred by method "inferOrthologs". + *

    + * (Last modified: 05/08/01) + * + * @param query_name + * sequence name of a external node of the gene trees + * @param sort + * order and sort priority + * @param threshold_orthologs + * the minimal number of observations for a a sequence to be + * reported as orthologous, in percents (0.0-100.0%) + * @param threshold_subtreeneighborings + * the minimal number of observations for a a sequence to be + * reported as orthologous, in percents (0.0-100.0%) + * @return String containing the inferred orthologs, String containing "-" + * if no orthologs have been found null in case of error + * @see #inferOrthologs(File,Phylogeny,String) + * @see #inferOrthologs(Phylogeny[],Phylogeny) + * @see #inferOrthologs(File,Phylogeny) + * @see #getOrder(int) + */ + public StringBuffer inferredOrthologsToString( final String query_name, + int sort, + double threshold_orthologs, + double threshold_subtreeneighborings ) { + HashMap o_hashmap = null; + HashMap s_hashmap = null; + HashMap n_hashmap = null; + String name = ""; + double o = 0.0, // Orthologs. + s = 0.0, // Super orthologs. + sn = 0.0, // Subtree neighbors. + value1 = 0.0, value2 = 0.0, value3 = 0.0, value4 = 0.0, d = 0.0; + final ArrayList nv = new ArrayList(); + if ( ( _o_hash_maps == null ) || ( _so_hash_maps == null ) || ( _sn_hash_maps == null ) ) { + throw new RuntimeException( "Orthologs have not been calculated (successfully)" ); + } + if ( ( sort < 0 ) || ( sort > 17 ) ) { + sort = 12; + } + if ( ( sort > 2 ) && ( _m == null ) && ( _l == null ) ) { + throw new RuntimeException( "Distance list or matrix have not been read in (successfully)" ); + } + if ( threshold_orthologs < 0.0 ) { + threshold_orthologs = 0.0; + } + else if ( threshold_orthologs > 100.0 ) { + threshold_orthologs = 100.0; + } + if ( threshold_subtreeneighborings < 0.0 ) { + threshold_subtreeneighborings = 0.0; + } + else if ( threshold_subtreeneighborings > 100.0 ) { + threshold_subtreeneighborings = 100.0; + } + o_hashmap = getInferredOrthologs( query_name ); + s_hashmap = getInferredSuperOrthologs( query_name ); + n_hashmap = getInferredSubtreeNeighbors( query_name ); + if ( ( o_hashmap == null ) || ( s_hashmap == null ) || ( n_hashmap == null ) ) { + throw new RuntimeException( "Orthologs for " + query_name + " were not established" ); + } + final StringBuffer orthologs = new StringBuffer(); + if ( _seq_names.length > 0 ) { + I: for( int i = 0; i < _seq_names.length; ++i ) { + name = _seq_names[ i ]; + if ( name.equals( query_name ) ) { + continue I; + } + o = getBootstrapValueFromHash( o_hashmap, name ); + if ( o < threshold_orthologs ) { + continue I; + } + sn = getBootstrapValueFromHash( n_hashmap, name ); + if ( sn < threshold_subtreeneighborings ) { + continue I; + } + s = getBootstrapValueFromHash( s_hashmap, name ); + if ( sort >= 3 ) { + if ( _m != null ) { + d = getDistance( query_name, name ); + } + else { + d = getDistance( name ); + } + } + switch ( sort ) { + case 0: + nv.add( new Tuplet( name, o, 5 ) ); + break; + case 1: + nv.add( new Tuplet( name, o, s, 5 ) ); + break; + case 2: + nv.add( new Tuplet( name, s, o, 5 ) ); + break; + case 3: + nv.add( new Tuplet( name, o, d, 1 ) ); + break; + case 4: + nv.add( new Tuplet( name, d, o, 0 ) ); + break; + case 5: + nv.add( new Tuplet( name, o, s, d, 2 ) ); + break; + case 6: + nv.add( new Tuplet( name, o, d, s, 1 ) ); + break; + case 7: + nv.add( new Tuplet( name, s, o, d, 2 ) ); + break; + case 8: + nv.add( new Tuplet( name, s, d, o, 1 ) ); + break; + case 9: + nv.add( new Tuplet( name, d, o, s, 0 ) ); + 
break; + case 10: + nv.add( new Tuplet( name, d, s, o, 0 ) ); + break; + case 11: + nv.add( new Tuplet( name, o, sn, d, 2 ) ); + break; + case 12: + nv.add( new Tuplet( name, o, sn, s, d, 3 ) ); + break; + case 13: + nv.add( new Tuplet( name, o, s, sn, d, 3 ) ); + break; + case 14: + nv.add( new Tuplet( name, sn, o, s, d, 3 ) ); + break; + case 15: + nv.add( new Tuplet( name, sn, d, o, s, 1 ) ); + break; + case 16: + nv.add( new Tuplet( name, o, d, sn, s, 1 ) ); + break; + case 17: + nv.add( new Tuplet( name, o, sn, d, s, 2 ) ); + break; + default: + nv.add( new Tuplet( name, o, 5 ) ); + } + } // End of I for loop. + if ( ( nv != null ) && ( nv.size() > 0 ) ) { + orthologs.append( "[seq name]\t\t[ortho]\t[st-n]\t[sup-o]\t[dist]" + ForesterUtil.LINE_SEPARATOR ); + final Tuplet[] nv_array = new Tuplet[ nv.size() ]; + for( int j = 0; j < nv.size(); ++j ) { + nv_array[ j ] = nv.get( j ); + } + Arrays.sort( nv_array ); + for( int i = 0; i < nv_array.length; ++i ) { + name = nv_array[ i ].getKey(); + value1 = nv_array[ i ].getValue1(); + value2 = nv_array[ i ].getValue2(); + value3 = nv_array[ i ].getValue3(); + value4 = nv_array[ i ].getValue4(); + orthologs.append( addNameAndValues( name, value1, value2, value3, value4, sort ) ); + } + } + } + // No orthologs found. + if ( ( orthologs == null ) || ( orthologs.length() < 1 ) ) { + orthologs.append( "-" ); + } + return orthologs; + } // inferredOrthologsToString( String, int, double ) + + // Helper method for inferredOrthologTableToFile. + // Returns individual rows for the table as String. + private String inferredOrthologsToTableHelper( final String name2, + final String[] names, + final int j, + final boolean super_orthologs ) { + HashMap hashmap = null; + String name = null, orthologs = new String( "" ); + int value = 0; + if ( !super_orthologs ) { + hashmap = getInferredOrthologs( name2 ); + } + else { + hashmap = getInferredSuperOrthologs( name2 ); + } + if ( hashmap == null ) { + throw new RuntimeException( "Unexpected failure in method inferredOrthologsToTableHelper" ); + } + for( int i = 0; i < names.length; ++i ) { + name = names[ i ]; + if ( !hashmap.containsKey( name ) ) { + value = 0; + } + else { + value = hashmap.get( name ); + } + if ( i == j ) { + // Sanity check. + if ( value != 0 ) { + throw new RuntimeException( "Failed sanity check in method inferredOrthologsToTableHelper: value not 0." ); + } + orthologs += ( " " + "\t" ); + } + else { + orthologs += ( value + "\t" ); + } + } + return orthologs; + } + + /** + * Writes the orthologs for each external node of the gene trees to outfile + * in the form of a table. Orthologs are to be inferred by method + * "inferOrthologs". Overwrites without asking! (Last modified: 12/07/00) + * + * @param outfile + * the File to write to + */ + public void inferredOrthologTableToFile( final File outfile ) throws IOException { + if ( _o_hash_maps == null ) { + return; + } + inferredOrthologTableToFile( outfile, false ); + } + + // Helper for inferredOrthologTableToFile(File). + // (Last modified: 11/28/00) + private void inferredOrthologTableToFile( final File outfile, final boolean super_orthologs ) throws IOException { + String name = "", line = ""; + PrintWriter out = null; + if ( _seq_names == null ) { + throw new RuntimeException( "inferredOrthologTableToFile: seq_names_ is null." ); + } + Arrays.sort( _seq_names ); + out = new PrintWriter( new FileWriter( outfile ), true ); + if ( out == null ) { + throw new RuntimeException( "inferredOrthologTableToFile: failure to create PrintWriter." 
); + } + line = "\t\t\t\t"; + for( int i = 0; i < _seq_names.length; ++i ) { + line += ( i + ")\t" ); + } + line += "\n"; + out.println( line ); + for( int i = 0; i < _seq_names.length; ++i ) { + name = _seq_names[ i ]; + if ( name.length() < 8 ) { + line = i + ")\t" + name + "\t\t\t"; + } + else if ( name.length() < 16 ) { + line = i + ")\t" + name + "\t\t"; + } + else { + line = i + ")\t" + name + "\t"; + } + line += inferredOrthologsToTableHelper( name, _seq_names, i, super_orthologs ); + out.println( line ); + } + out.close(); + } + + /** + * Writes the "super orthologs" for each external nodes of the gene trees to + * outfile in the form of a table. Super orthologs are to be inferred by + * method "inferOrthologs". Overwrites without asking! + * + * @param outfile + * the File to write to + */ + public void inferredSuperOrthologTableToFile( final File outfile ) throws IOException { + if ( _so_hash_maps == null ) { + return; + } + inferredOrthologTableToFile( outfile, true ); + } + + /** + * Returns a String containg the names of orthologs of the PhylogenyNode + * with seq name query_name. The String also contains how many times a + * particular ortholog has been observed. Returns "-" if no putative + * orthologs have been found (given threshold_orthologs). + *

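For example, continuing the same sketch (the 50% threshold is arbitrary; return_dists = true additionally requires that a distance matrix or list has been read in):

    final String ultra_paralogs = rio.inferredUltraParalogsToString( "QUERY_SEQ", true, 50.0 );
    System.out.println( ultra_paralogs );   // prints "-" if nothing passes the threshold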
    + * Orthologs are to be inferred by method "inferOrthologs". + * + * @param query_name + * sequence name of a external node of the gene trees + * @param return_dists + * @param threshold_ultra_paralogs + * between 1 and 100 + * @return String containing the inferred orthologs, String containing "-" + * if no orthologs have been found null in case of error + */ + public String inferredUltraParalogsToString( final String query_name, + final boolean return_dists, + double threshold_ultra_paralogs ) { + HashMap sp_hashmap = null; + String name = "", ultra_paralogs = ""; + int sort = 0; + double sp = 0.0, value1 = 0.0, value2 = 0.0, d = 0.0; + final List nv = new ArrayList(); + if ( threshold_ultra_paralogs < 1.0 ) { + threshold_ultra_paralogs = 1.0; + } + else if ( threshold_ultra_paralogs > 100.0 ) { + threshold_ultra_paralogs = 100.0; + } + if ( _up_hash_maps == null ) { + throw new RuntimeException( "Ultra paralogs have not been calculated (successfully)." ); + } + if ( return_dists && ( _m == null ) && ( _l == null ) ) { + throw new RuntimeException( "Distance list or matrix have not been read in (successfully)." ); + } + sp_hashmap = getInferredUltraParalogs( query_name ); + if ( sp_hashmap == null ) { + throw new RuntimeException( "Ultra paralogs for " + query_name + " were not established" ); + } + if ( _seq_names.length > 0 ) { + I: for( int i = 0; i < _seq_names.length; ++i ) { + name = _seq_names[ i ]; + if ( name.equals( query_name ) ) { + continue I; + } + sp = getBootstrapValueFromHash( sp_hashmap, name ); + if ( sp < threshold_ultra_paralogs ) { + continue I; + } + if ( return_dists ) { + if ( _m != null ) { + d = getDistance( query_name, name ); + } + else { + d = getDistance( name ); + } + nv.add( new Tuplet( name, sp, d, 1 ) ); + } + else { + nv.add( new Tuplet( name, sp, 5 ) ); + } + } // End of I for loop. + if ( ( nv != null ) && ( nv.size() > 0 ) ) { + final Tuplet[] nv_array = new Tuplet[ nv.size() ]; + for( int j = 0; j < nv.size(); ++j ) { + nv_array[ j ] = nv.get( j ); + } + Arrays.sort( nv_array ); + if ( return_dists ) { + sort = 91; + } + else { + sort = 90; + } + for( int i = 0; i < nv_array.length; ++i ) { + name = nv_array[ i ].getKey(); + value1 = nv_array[ i ].getValue1(); + value2 = nv_array[ i ].getValue2(); + ultra_paralogs += addNameAndValues( name, value1, value2, 0.0, 0.0, sort ); + } + } + } + // No ultra paralogs found. + if ( ( ultra_paralogs == null ) || ( ultra_paralogs.length() < 1 ) ) { + ultra_paralogs = "-"; + } + return ultra_paralogs; + } + + public final void readDistanceMatrix( final File matrix_file ) throws IOException { + DistanceMatrix[] matrices = null; + final SymmetricalDistanceMatrixParser parser = SymmetricalDistanceMatrixParser.createInstance(); + matrices = parser.parse( matrix_file ); + if ( ( matrices == null ) || ( matrices.length == 0 ) ) { + throw new IOException( "failed to parse distance matrix from [" + matrix_file + "]" ); + } + if ( matrices.length > 1 ) { + throw new IOException( "[" + matrix_file + "] contains more than once distance matrix" ); + } + _m = matrices[ 0 ]; + } + + /** + * Brings this into the same state as immediately after construction. + */ + private final void reset() { + _o_hash_maps = null; + _so_hash_maps = null; + _up_hash_maps = null; + _seq_names = null; + _m = null; + _l = null; + _bootstraps = 1; + _ext_nodes_ = 0; + _time = 0; + } + + /** + * Sets the numbers of trees analyzed. 
+ * @param the + * numbers of trees analyzed + */ + private void setBootstraps( int i ) { + if ( i < 1 ) { + i = 1; + } + _bootstraps = i; + } + + /** + * Sets number of ext nodes in gene trees analyzed (after stripping). + * @param the + * number of ext nodes in gene trees analyzed (after stripping) + */ + private void setExtNodesOfAnalyzedGeneTrees( int i ) { + if ( i < 1 ) { + i = 0; + } + _ext_nodes_ = i; + } + + // Helper for doInferOrthologs( Phylogeny, Phylogeny, String ) + // and doInferOrthologs( Phylogeny, Phylogeny ). + private void updateHash( final HashMap> counter_map, + final String query_seq_name, + final List nodes ) { + final HashMap hash_map = counter_map.get( query_seq_name ); + if ( hash_map == null ) { + throw new RuntimeException( "Unexpected failure in method updateHash." ); + } + for( int j = 0; j < nodes.size(); ++j ) { + final String seq_name = ( nodes.get( j ) ).getNodeData().getSequence().getName(); + if ( hash_map.containsKey( seq_name ) ) { + hash_map.put( seq_name, hash_map.get( seq_name ) + 1 ); + } + else { + hash_map.put( seq_name, 1 ); + } + } + } + + // Helper method for inferredOrthologsToString + // and inferredUltraParalogsToString. + private final static String addNameAndValues( final String name, + final double value1, + final double value2, + final double value3, + final double value4, + final int sort ) { + final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.#####" ); + df.setDecimalSeparatorAlwaysShown( false ); + String line = ""; + if ( name.length() < 8 ) { + line += ( name + "\t\t\t" ); + } + else if ( name.length() < 16 ) { + line += ( name + "\t\t" ); + } + else { + line += ( name + "\t" ); + } + switch ( sort ) { + case 0: + line += addToLine( value1, df ); + line += "-\t"; + line += "-\t"; + line += "-\t"; + break; + case 1: + line += addToLine( value1, df ); + line += "-\t"; + line += addToLine( value2, df ); + line += "-\t"; + break; + case 2: + line += addToLine( value2, df ); + line += "-\t"; + line += addToLine( value1, df ); + line += "-\t"; + break; + case 3: + line += addToLine( value1, df ); + line += "-\t"; + line += "-\t"; + line += addToLine( value2, df ); + break; + case 4: + line += addToLine( value2, df ); + line += "-\t"; + line += "-\t"; + line += addToLine( value1, df ); + break; + case 5: + line += addToLine( value1, df ); + line += "-\t"; + line += addToLine( value2, df ); + line += addToLine( value3, df ); + break; + case 6: + line += addToLine( value1, df ); + line += "-\t"; + line += addToLine( value3, df ); + line += addToLine( value2, df ); + break; + case 7: + line += addToLine( value2, df ); + line += "-\t"; + line += addToLine( value1, df ); + line += addToLine( value3, df ); + break; + case 8: + line += addToLine( value3, df ); + line += "-\t"; + line += addToLine( value1, df ); + line += addToLine( value2, df ); + break; + case 9: + line += addToLine( value2, df ); + line += "-\t"; + line += addToLine( value3, df ); + line += addToLine( value1, df ); + break; + case 10: + line += addToLine( value3, df ); + line += "-\t"; + line += addToLine( value2, df ); + line += addToLine( value1, df ); + break; + case 11: + line += addToLine( value1, df ); + line += addToLine( value2, df ); + line += "-\t"; + line += addToLine( value3, df ); + break; + case 12: + line += addToLine( value1, df ); + line += addToLine( value2, df ); + line += addToLine( value3, df ); + line += addToLine( value4, df ); + break; + case 13: + line += addToLine( value1, df ); + line += addToLine( value3, df ); + line += 
addToLine( value2, df ); + line += addToLine( value4, df ); + break; + case 14: + line += addToLine( value2, df ); + line += addToLine( value1, df ); + line += addToLine( value3, df ); + line += addToLine( value4, df ); + break; + case 15: + line += addToLine( value3, df ); + line += addToLine( value1, df ); + line += addToLine( value4, df ); + line += addToLine( value2, df ); + break; + case 16: + line += addToLine( value1, df ); + line += addToLine( value3, df ); + line += addToLine( value4, df ); + line += addToLine( value2, df ); + break; + case 17: + line += addToLine( value1, df ); + line += addToLine( value2, df ); + line += addToLine( value4, df ); + line += addToLine( value3, df ); + break; + case 90: + line += addToLine( value1, df ); + line += "-\t"; + break; + case 91: + line += addToLine( value1, df ); + line += addToLine( value2, df ); + break; + } + line += ForesterUtil.LINE_SEPARATOR; + return line; + } + + // Helper for addNameAndValues. + private final static String addToLine( final double value, final java.text.DecimalFormat df ) { + String s = ""; + if ( value != Tuplet.DEFAULT ) { + s = df.format( value ) + "\t"; + } + else { + s = "-\t"; + } + return s; + } + + private static String[] getAllExternalSequenceNames( final Phylogeny phy ) { + if ( phy.isEmpty() ) { + return null; + } + int i = 0; + final String[] names = new String[ phy.getNumberOfExternalNodes() ]; + for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) { + names[ i++ ] = iter.next().getNodeData().getSequence().getName(); + } + return names; + } + + /** + * Returns the order in which ortholog (o), "super ortholog" (s) and + * distance (d) are returned and sorted (priority of sort always goes from + * left to right), given sort. For the meaning of sort + * + * @see #inferredOrthologsToString(String,int,double,double) + * + * @param sort + * determines order and sort priority + * @return String indicating the order + */ + public final static String getOrder( final int sort ) { + String order = ""; + switch ( sort ) { + case 0: + order = "orthologies"; + break; + case 1: + order = "orthologies > super orthologies"; + break; + case 2: + order = "super orthologies > orthologies"; + break; + case 3: + order = "orthologies > distance to query"; + break; + case 4: + order = "distance to query > orthologies"; + break; + case 5: + order = "orthologies > super orthologies > distance to query"; + break; + case 6: + order = "orthologies > distance to query > super orthologies"; + break; + case 7: + order = "super orthologies > orthologies > distance to query"; + break; + case 8: + order = "super orthologies > distance to query > orthologies"; + break; + case 9: + order = "distance to query > orthologies > super orthologies"; + break; + case 10: + order = "distance to query > super orthologies > orthologies"; + break; + case 11: + order = "orthologies > subtree neighbors > distance to query"; + break; + case 12: + order = "orthologies > subtree neighbors > super orthologies > distance to query"; + break; + case 13: + order = "orthologies > super orthologies > subtree neighbors > distance to query"; + break; + case 14: + order = "subtree neighbors > orthologies > super orthologies > distance to query"; + break; + case 15: + order = "subtree neighbors > distance to query > orthologies > super orthologies"; + break; + case 16: + order = "orthologies > distance to query > subtree neighbors > super orthologies"; + break; + case 17: + order = "orthologies > subtree neighbors > distance to 
query > super orthologies"; + break; + default: + order = "orthologies"; + break; + } + return order; + } + + public final static StringBuffer getOrderHelp() { + final StringBuffer sb = new StringBuffer(); + sb.append( " 0: orthologies" + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 1: orthologies > super orthologies" + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 2: super orthologies > orthologies" + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 3: orthologies > distance to query" + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 4: distance to query > orthologies" + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 5: orthologies > super orthologies > distance to query" + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 6: orthologies > distance to query > super orthologies" + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 7: super orthologies > orthologies > distance to query" + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 8: super orthologies > distance to query > orthologies" + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 9: distance to query > orthologies > super orthologies" + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 10: distance to query > super orthologies > orthologies" + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 11: orthologies > subtree neighbors > distance to query" + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 12: orthologies > subtree neighbors > super orthologies > distance to query" + + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 13: orthologies > super orthologies > subtree neighbors > distance to query" + + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 14: subtree neighbors > orthologies > super orthologies > distance to query" + + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 15: subtree neighbors > distance to query > orthologies > super orthologies" + + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 16: orthologies > distance to query > subtree neighbors > super orthologies" + + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 17: orthologies > subtree neighbors > distance to query > super orthologies" + + ForesterUtil.LINE_SEPARATOR ); + return sb; + } + + private final static List getSubtreeNeighbors( final PhylogenyNode query, final int level ) { + PhylogenyNode node = query; + if ( !node.isExternal() ) { + return null; + } + if ( !node.isRoot() ) { + node = node.getParent(); + } + if ( level == 2 ) { + if ( !node.isRoot() ) { + node = node.getParent(); + } + } + else { + throw new IllegalArgumentException( "currently only supporting level 2 subtree neighbors " ); + } + final List sn = node.getAllExternalDescendants(); + sn.remove( query ); + return sn; + } +} diff --git a/forester/java/src/org/forester/sdi/RIOn.java b/forester/java/src/org/forester/sdi/RIOn.java new file mode 100644 index 0000000..7855473 --- /dev/null +++ b/forester/java/src/org/forester/sdi/RIOn.java @@ -0,0 +1,132 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. 
+// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.sdi; + +import java.util.ArrayList; +import java.util.List; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Event; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.GeneralTable; + +public class RIOn { + + private final static boolean ROOT_BY_MINIMIZING_MAPPING_COST = false; + private final static boolean ROOT_BY_MINIMIZING_SUM_OF_DUPS = true; + private final static boolean ROOT_BY_MINIMIZING_TREE_HEIGHT = true; + GeneralTable _orthologs = null; + GeneralTable _paralogs = null; + GeneralTable _super_orthologs = null; + GeneralTable _ultra_paralogs = null; + + private void doInferOrthologs( final Phylogeny gene_tree, final Phylogeny species_tree ) { + final SDIR sdiunrooted = new SDIR(); + final Phylogeny assigned_tree = sdiunrooted.infer( gene_tree, + species_tree, + ROOT_BY_MINIMIZING_MAPPING_COST, + ROOT_BY_MINIMIZING_SUM_OF_DUPS, + ROOT_BY_MINIMIZING_TREE_HEIGHT, + true, + 1 )[ 0 ]; + final List external_nodes = new ArrayList(); + for( final PhylogenyNodeIterator iterator = assigned_tree.iteratorExternalForward(); iterator.hasNext(); ) { + external_nodes.add( iterator.next() ); + } + final PhylogenyMethods methods = PhylogenyMethods.getInstance(); + for( int i = 0; i < external_nodes.size(); ++i ) { + for( int j = 0; j < external_nodes.size(); ++j ) { + if ( i != j ) { + final PhylogenyNode node_i = external_nodes.get( i ); + final PhylogenyNode node_j = external_nodes.get( j ); + final PhylogenyNode lca = methods.obtainLCA( node_i, node_j ); + final Event event = lca.getNodeData().getEvent(); + final String node_i_name = node_i.getNodeData().getSequence().getName(); + final String node_j_name = node_j.getNodeData().getSequence().getName(); + if ( event.isDuplication() ) { + increaseCounter( getOrthologs(), node_i_name, node_j_name ); + } + else { + increaseCounter( getParalogs(), node_i_name, node_j_name ); + } + } + } + } + } + + public GeneralTable getOrthologs() { + return _orthologs; + } + + public GeneralTable getParalogs() { + return _paralogs; + } + + public GeneralTable getSuperOrthologs() { + return _super_orthologs; + } + + public GeneralTable getUltraParalogs() { + return _ultra_paralogs; + } + + private void increaseCounter( final GeneralTable table, + final String node_i_name, + final String node_j_name ) { + final Integer value = table.getValue( node_i_name, node_j_name ); + if ( value == null ) { + table.setValue( node_i_name, node_j_name, 1 ); + } + else { + table.setValue( node_i_name, node_j_name, value.intValue() + 1 ); + } + } + + private void init() { + _orthologs = new GeneralTable(); + _paralogs = new GeneralTable(); + _super_orthologs = new GeneralTable(); + _ultra_paralogs = new GeneralTable(); + } + + private void setOrthologs( final GeneralTable orthologs ) { + _orthologs = orthologs; + } + + private void setParalogs( 
final GeneralTable paralogs ) { + _paralogs = paralogs; + } + + private void setSuperOrthologs( final GeneralTable super_orthologs ) { + _super_orthologs = super_orthologs; + } + + private void setUltraParalogs( final GeneralTable ultra_paralogs ) { + _ultra_paralogs = ultra_paralogs; + } +} diff --git a/forester/java/src/org/forester/sdi/SDI.java b/forester/java/src/org/forester/sdi/SDI.java new file mode 100644 index 0000000..b21b9b5 --- /dev/null +++ b/forester/java/src/org/forester/sdi/SDI.java @@ -0,0 +1,318 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org + +package org.forester.sdi; + +import java.util.HashMap; +import java.util.Map; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Taxonomy; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.ForesterUtil; + +public abstract class SDI { + + final Phylogeny _gene_tree; + final Phylogeny _species_tree; + int _duplications_sum; // Sum of duplications. + int _mapping_cost; // Mapping cost "L". + + /** + * Constructor which sets the gene tree and the species tree to be compared. + * species_tree is the species tree to which the gene tree gene_tree will be + * compared to. + * Infers for each PhylogenyNode of gene_tree whether + * it represents a speciation or duplication event by calculating and + * interpreting the mapping function M. The most parsimonious sequence of + * speciation and duplication events is assumed. + * The mapping cost L can be + * calculated with method "computeMappingCost()". + *

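A sketch of the intended use (SDI itself is abstract, so a concrete subclass is needed; SDIse, referenced elsewhere in this patch, is assumed here to offer the same (gene tree, species tree) constructor, and gene_tree and species_tree are assumed to be rooted phylogenies meeting the conditions listed below):

    final SDI sdi = new SDIse( gene_tree, species_tree );   // assumption: SDIse mirrors the SDI constructor
    System.out.println( "duplications   : " + sdi.getDuplicationsSum() );
    System.out.println( "mapping cost L : " + sdi.computeMappingCostL() );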
    + * Conditions: + *

    + *
      + *
    • Both Trees must be rooted + *
    • Both Trees must have species names in the species name fields of all + * their external nodes + *
    + * + * @param gene_tree + * reference to a rooted binary gene Phylogeny to which assign + * duplication vs speciation, must have species names in the + * species name fields for all external nodes + * @param species_tree + * reference to a rooted binary species Phylogeny which might get + * stripped in the process, must have species names in the + * species name fields for all external nodes + */ + public SDI( final Phylogeny gene_tree, final Phylogeny species_tree ) { + if ( species_tree.isEmpty() || gene_tree.isEmpty() ) { + throw new IllegalArgumentException( "attempt to infer duplications using empty tree(s)" ); + } + if ( !gene_tree.isRooted() ) { + throw new IllegalArgumentException( "attempt to infer duplications on unrooted gene tree" ); + } + if ( !species_tree.isRooted() ) { + throw new IllegalArgumentException( "attempt to infer duplications on unrooted species tree" ); + } + _gene_tree = gene_tree; + _species_tree = species_tree; + _duplications_sum = 0; + _mapping_cost = -1; + } + + // Helper method for "computeMappingCost()". + private void computeMappingCostHelper( final PhylogenyNode g ) { + if ( !g.isExternal() ) { + computeMappingCostHelper( g.getChildNode1() ); + computeMappingCostHelper( g.getChildNode2() ); + if ( ( g.getLink() != g.getChildNode1().getLink() ) && ( g.getLink() != g.getChildNode2().getLink() ) ) { + _mapping_cost += ( g.getChildNode1().getLink().getId() + g.getChildNode2().getLink().getId() + - ( 2 * g.getLink().getId() ) - 2 ); + } + else if ( ( g.getLink() != g.getChildNode1().getLink() ) && ( g.getLink() == g.getChildNode2().getLink() ) ) { + _mapping_cost += ( g.getChildNode1().getLink().getId() - g.getLink().getId() + 1 ); + } + else if ( ( g.getLink() == g.getChildNode1().getLink() ) && ( g.getLink() != g.getChildNode2().getLink() ) ) { + _mapping_cost += ( g.getChildNode2().getLink().getId() - g.getLink().getId() + 1 ); + } + else { + _mapping_cost++; + } + } + } + + /** + * Computes the cost of mapping the gene tree gene_tree onto the species + * tree species_tree. Before this method can be called, the mapping has to + * be calculated with method "infer(boolean)". + *

    + * Reference. Zhang, L. (1997) On a Mirkin-Muchnik-Smith Conjecture for + * Comparing Molecular Phylogenies. Journal of Computational Biology 4 + * 177-187. + * + * @return the mapping cost "L" + */ + public int computeMappingCostL() { + _species_tree.levelOrderReID(); + _mapping_cost = 0; + computeMappingCostHelper( _gene_tree.getRoot() ); + return _mapping_cost; + } + + private TaxonomyComparisonBase determineTaxonomyComparisonBase() { + TaxonomyComparisonBase base = null; + boolean all_have_id = true; + boolean all_have_code = true; + boolean all_have_sn = true; + boolean all_have_cn = true; + for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode n = iter.next(); + if ( n.getNodeData().isHasTaxonomy() ) { + final Taxonomy tax = n.getNodeData().getTaxonomy(); + if ( ( tax.getIdentifier() == null ) || ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) { + all_have_id = false; + } + if ( ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) { + all_have_code = false; + } + if ( ForesterUtil.isEmpty( tax.getScientificName() ) ) { + all_have_sn = false; + } + if ( ForesterUtil.isEmpty( tax.getCommonName() ) ) { + all_have_cn = false; + } + } + else { + throw new IllegalArgumentException( "species tree node [" + n + "] has no taxonomic data" ); + } + } + for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode n = iter.next(); + if ( n.getNodeData().isHasTaxonomy() ) { + final Taxonomy tax = n.getNodeData().getTaxonomy(); + if ( ( tax.getIdentifier() == null ) || ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) { + all_have_id = false; + } + if ( ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) { + all_have_code = false; + } + if ( ForesterUtil.isEmpty( tax.getScientificName() ) ) { + all_have_sn = false; + } + if ( ForesterUtil.isEmpty( tax.getCommonName() ) ) { + all_have_cn = false; + } + } + else { + throw new IllegalArgumentException( "gene tree node [" + n + "] has no taxonomic data" ); + } + } + if ( all_have_id ) { + base = TaxonomyComparisonBase.ID; + } + else if ( all_have_code ) { + base = TaxonomyComparisonBase.CODE; + } + else if ( all_have_sn ) { + base = TaxonomyComparisonBase.SCIENTIFIC_NAME; + } + else if ( all_have_cn ) { + base = TaxonomyComparisonBase.COMMON_NAME; + } + else { + throw new IllegalArgumentException( "gene tree and species tree have incomparable taxonomies" ); + } + return base; + } + + /** + * Returns the number of duplications. + * + * @return number of duplications + */ + public int getDuplicationsSum() { + return _duplications_sum; + } + + /** + * Returns the gene tree. + * + * @return gene tree + */ + public Phylogeny getGeneTree() { + return _gene_tree; + } + + /** + * Returns the species tree. + * + * @return species tree + */ + public Phylogeny getSpeciesTree() { + return _species_tree; + } + + /** + * Calculates the mapping function for the external nodes of the gene tree: + * links (sets the field "link" of PhylogenyNode) each external + * PhylogenyNode of gene_tree to the external PhylogenyNode of species_tree + * which has the same species name. + */ + void linkNodesOfG() { + final Map speciestree_ext_nodes = new HashMap(); + final TaxonomyComparisonBase tax_comp_base = determineTaxonomyComparisonBase(); + // Put references to all external nodes of the species tree into a map. + // Stringyfied taxonomy is the key, node is the value. 
+ for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode s = iter.next(); + final String tax_str = taxonomyToString( s, tax_comp_base ); + if ( speciestree_ext_nodes.containsKey( tax_str ) ) { + throw new IllegalArgumentException( "taxonomy [" + s.getNodeData().getTaxonomy() + + "] is not unique in species phylogeny" ); + } + speciestree_ext_nodes.put( tax_str, s ); + } + // Retrieve the reference to the node with a matching stringyfied taxonomy. + for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode g = iter.next(); + final String tax_str = taxonomyToString( g, tax_comp_base ); + final PhylogenyNode s = speciestree_ext_nodes.get( tax_str ); + if ( s == null ) { + throw new IllegalArgumentException( "taxonomy [" + g.getNodeData().getTaxonomy() + + "] not present in species tree" ); + } + g.setLink( s ); + } + } + + /** + * Calculates the mapping function for the external nodes of the gene tree: + * links (sets the field "link" of PhylogenyNode) each external by taxonomy + * identifier + * PhylogenyNode of gene_tree to the external PhylogenyNode of species_tree + * which has the same species name. + * Olivier CHABROL : olivier.chabrol@univ-provence.fr + */ + void linkNodesOfGByTaxonomyIdentifier() { + final HashMap speciestree_ext_nodes = new HashMap(); + if ( _species_tree.getFirstExternalNode().isRoot() ) { + speciestree_ext_nodes.put( _species_tree.getFirstExternalNode().getNodeData().getTaxonomy().getIdentifier() + .getValue(), _species_tree.getFirstExternalNode() ); + } + else { + for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode s = iter.next(); + speciestree_ext_nodes.put( s.getNodeData().getTaxonomy().getIdentifier().getValue(), s ); + } + } + for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode g = iter.next(); + final PhylogenyNode s = speciestree_ext_nodes + .get( g.getNodeData().getTaxonomy().getIdentifier().getValue() ); + if ( s == null ) { + String message = "species [" + g.getNodeData().getTaxonomy().getIdentifier().getValue(); + message += "] not present in species tree"; + throw new IllegalArgumentException( message ); + } + g.setLink( s ); + } + } + + @Override + public String toString() { + final StringBuffer sb = new StringBuffer(); + sb.append( getClass() ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + sb.append( "Duplications sum : " + getDuplicationsSum() ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + sb.append( "mapping cost L : " + computeMappingCostL() ); + return sb.toString(); + } + + private static String taxonomyToString( final PhylogenyNode n, final TaxonomyComparisonBase base ) { + final Taxonomy tax = n.getNodeData().getTaxonomy(); + switch ( base ) { + case ID: + return tax.getIdentifier().getValue(); + case CODE: + return tax.getTaxonomyCode(); + case SCIENTIFIC_NAME: + return tax.getScientificName(); + case COMMON_NAME: + return tax.getCommonName(); + default: + throw new IllegalArgumentException( "unknown comparison base for taxonomies: " + base ); + } + } + + enum TaxonomyComparisonBase { + ID, CODE, SCIENTIFIC_NAME, COMMON_NAME; + } +} diff --git a/forester/java/src/org/forester/sdi/SDIR.java b/forester/java/src/org/forester/sdi/SDIR.java new file mode 100644 index 0000000..f75f6f8 --- /dev/null +++ b/forester/java/src/org/forester/sdi/SDIR.java @@ -0,0 +1,579 @@ +// $Id: +// 
FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.sdi; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyBranch; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; + +/* + * Allows to infer duplications - speciations on a unrooted gene tree. It + * reroots the gene trees on each of its branches and performs SDIse on each of + * the resulting trees. Trees which minimize a certain criterion are returned as + * the "correctly" rooted ones. The criterions are:

    • Sum of duplications + *
    • Mapping cost L
    • Phylogeny height - the largest distance from + * root to external node (minimizing this is equivalent to "midpoint rooting") + *
    + * + * @see SDIse + * + * @see SDI + * + * @author Christian M. Zmasek + */ +public class SDIR { + + private final static double ZERO_DIFF = 1.0E-6; // Due to inaccurate + // calculations on + // Java's side, not + // everything that should + // be 0.0 is 0.0. + private int _count; + private int _min_dup; + private int _min_cost; + private double _min_height; + private double _min_diff; + private long _time_sdi; + + /** + * Default contructor which creates an "empty" object.. + */ + public SDIR() { + init(); + } + + /** + * Returns the number of differently rooted trees which minimize the + * (rooting) "criterion" - as determined by method "infer". + * + * @see #infer(Phylogeny,Phylogeny,boolean,boolean,boolean,boolean,int,boolean) + * @return number of differently rooted trees which minimized the criterion + */ + public int getCount() { + return _count; + } + + /** + * Returns the (absolue value of the) minimal difference in tree heights of + * the two subtrees at the root (of the (re)rooted gene tree) - as + * determined by method "infer" - if minimize_height is set to true. + *

    + * If a tree is midpoint rooted, this number is zero. + *

    + * IMPORTANT: If minimize_mapping_cost or minimize_sum_of_dup is + * also set to true, then this returns the minimal difference in tree + * heights of the trees which minimize the first criterion and is therefore + * not necessarily zero. + *

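For instance, rooting purely by minimizing tree height (a midpoint-style rooting) should drive this difference to (numerically) zero; gene_tree and species_tree are assumed to be suitable phylogenies:

    final SDIR sdir = new SDIR();
    // Root by minimizing tree height only; return a single rooted tree.
    sdir.infer( gene_tree, species_tree, false, false, true, true, 1 );
    final double diff = sdir.getMinimalDiffInSubTreeHeights();   // expected to be ~0.0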
    + * (Last modified: 01/22/00) + * + * @see #infer(Phylogeny,Phylogeny,boolean,boolean,boolean,boolean,int,boolean) + * @return the minimal difference in tree heights -- IF calculated by + * "infer" + */ + public double getMinimalDiffInSubTreeHeights() { + return _min_diff; + } + + /** + * Returns the minimal number of duplications - as determined by method + * "infer". + *

    + * IMPORTANT : If the tree is not rooted by minimizing the sum of + * duplications or the mapping cost L, then this number is NOT NECESSARILY + * the MINIMAL number of duplications. + * + * @see #infer(Phylogeny,Phylogeny,boolean,boolean,boolean,boolean,int,boolean) + * @return (minimal) number of duplications + */ + public int getMinimalDuplications() { + return _min_dup; + } + + /** + * Returns the minimal mapping cost L - as determined by method "infer" - if + * minimize_mapping_cost is set to true. + *

    + * (Last modified: 11/07/00) + * + * @see #infer(Phylogeny,Phylogeny,boolean,boolean,boolean,boolean,int,boolean) + * @return the minimal mapping cost "L" -- IF calculated by "infer" + */ + public int getMinimalMappingCost() { + return _min_cost; + } + + /** + * Returns the minimal tree height - as determined by method "infer" - if + * minimize_height is set to true. IMPORTANT : If + * minimize_mapping_cost or minimize_sum_of_dup are also set to true, then + * this returns the minimal tree height of the trees which minimize the + * first criterion. + *

    + * (Last modified: 01/12/00) + * + * @see #infer(Phylogeny,Phylogeny,boolean,boolean,boolean,boolean,int,boolean) + * @return the minimal tree height -- IF calculated by "infer" + */ + public double getMinimalTreeHeight() { + return _min_height; + } + + /** + * Returns the sum of times (in ms) needed to run method infer of class SDI. + * Final variable TIME needs to be set to true. + * + * @return sum of times (in ms) needed to run method infer of class SDI + */ + public long getTimeSumSDI() { + return _time_sdi; + } + + /** + * Infers gene duplications on a possibly unrooted gene Phylogeny gene_tree. + * The tree is rooted be minimizing either the sum of duplications, the + * mapping cost L, or the tree height (or combinations thereof). If + * return_trees is set to true, it returns an array of possibly more than + * one differently rooted Trees.
    + * The maximal number of returned trees is set with max_trees_to_return. + *
+ * Phylogeny species_tree is a species Phylogeny to which the gene Phylogeny + * gene_tree is compared.
    + * If both minimize_sum_of_dup and minimize_mapping_cost are true, the tree + * is rooted by minimizing the mapping cost L.
+ * If minimize_sum_of_dup, minimize_mapping_cost, and minimize_height are all + * false, gene_tree is assumed to be already rooted, no attempts at + * rooting are made, and only one tree is returned.
    + *

    + * Conditions: + *

    + *
      + *
    • Both Trees must be completely binary (except deepest node of gene + * tree) + *
    • The species Phylogeny must be rooted + *
    • Both Trees must have species names in the species name fields of + * their nodes + *
• Both Trees must not have any collapsed nodes + *
    + *
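+ * Illustrative usage sketch (variable names are hypothetical; both phylogenies
+ * are assumed to be binary and to carry species names on their external nodes):
+ *
+ *   SDIR sdir = new SDIR();
+ *   // root by minimizing the mapping cost L; return at most 10 rooted trees:
+ *   Phylogeny[] rooted = sdir.infer( gene_tree, species_tree, true, false, false, true, 10 );
+ *   int min_dups = sdir.getMinimalDuplications();
+ *   int min_cost = sdir.getMinimalMappingCost();
+ *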

    + * (Last modified: 10/01/01) + * + * @param gene_tree + * a binary (except deepest node) gene Phylogeny + * @param species_tree + * a rooted binary species Phylogeny + * @param minimize_mapping_cost + * set to true to root by minimizing the mapping cost L (and also + * the sum of duplications) + * @param minimize_sum_of_dup + * set to true to root by minimizing the sum of duplications + * @param minimize_height + * set to true to root by minimizing the tree height - if + * minimize_mapping_cost is set to true or minimize_sum_of_dup is + * set to true, then out of the resulting trees with minimal + * mapping cost or minimal number of duplications the tree with + * the minimal height is chosen + * @param return_trees + * set to true to return Array of Trees, otherwise null is + * returned + * @param max_trees_to_return + * maximal number of Trees to return (=maximal size of returned + * Array) must be no lower than 1 + * @return array of rooted Trees with duplication vs. speciation assigned if + * return_trees is set to true, null otherwise + */ + public Phylogeny[] infer( final Phylogeny gene_tree, + final Phylogeny species_tree, + final boolean minimize_mapping_cost, + boolean minimize_sum_of_dup, + final boolean minimize_height, + final boolean return_trees, + int max_trees_to_return ) { + init(); + SDIse sdise = null; + final ArrayList trees = new ArrayList(); + Phylogeny[] tree_array = null; + List branches = null; + Phylogeny g = null; + PhylogenyNode prev_root = null; + PhylogenyNode prev_root_c1 = null; + PhylogenyNode prev_root_c2 = null; + int duplications = 0; + int cost = 0; + int counter = 0; + int min_duplications = Integer.MAX_VALUE; + int min_cost = Integer.MAX_VALUE; + int j = 0; + double height = 0.0; + double diff = 0.0; + double min_height = Double.MAX_VALUE; + double min_diff = 0.0; + double[] height__diff = new double[ 2 ]; + boolean smaller = false; + boolean equal = false; + boolean prev_root_was_dup = false; + if ( max_trees_to_return < 1 ) { + max_trees_to_return = 1; + } + if ( minimize_mapping_cost && minimize_sum_of_dup ) { + minimize_sum_of_dup = false; + } + if ( !minimize_mapping_cost && !minimize_sum_of_dup && !minimize_height ) { + throw new IllegalArgumentException( "parameter to minimize not given for rooting of phylogeny" ); + } + g = gene_tree.copy(); + if ( g.getNumberOfExternalNodes() <= 1 ) { + g.setRooted( true ); + setMinimalDuplications( 0 ); + setMinimalTreeHeight( 0.0 ); + tree_array = new Phylogeny[ 1 ]; + tree_array[ 0 ] = g; + return tree_array; + } + for( final PhylogenyNodeIterator iter = g.iteratorPostorder(); iter.hasNext(); ) { + final PhylogenyNode n = iter.next(); + if ( n.isRoot() ) { + if ( ( n.getNumberOfDescendants() != 2 ) && ( n.getNumberOfDescendants() != 3 ) ) { + throw new IllegalArgumentException( "attempt to run SDI on gene tree with " + + n.getNumberOfDescendants() + " child nodes at its root" ); + } + } + else if ( !n.isExternal() && ( n.getNumberOfDescendants() != 2 ) ) { + throw new IllegalArgumentException( "attempt to run SDI on gene tree which is not completely binary [found node with " + + n.getNumberOfDescendants() + " child nodes]" ); + } + } + for( final PhylogenyNodeIterator iter = species_tree.iteratorPostorder(); iter.hasNext(); ) { + final PhylogenyNode n = iter.next(); + if ( !n.isExternal() && ( n.getNumberOfDescendants() != 2 ) ) { + throw new IllegalArgumentException( "attempt to run SDI with a species tree which is not completely binary (after stripping) [found node with " + + 
n.getNumberOfDescendants() + " child nodes]" ); + } + } + g.reRoot( g.getFirstExternalNode() ); + branches = SDIR.getBranchesInPreorder( g ); + if ( minimize_mapping_cost || minimize_sum_of_dup ) { + sdise = new SDIse( g, species_tree ); + duplications = sdise.getDuplicationsSum(); + } + final Set used_root_placements = new HashSet(); + F: for( j = 0; j < branches.size(); ++j ) { + prev_root = g.getRoot(); + prev_root_c1 = prev_root.getChildNode1(); + prev_root_c2 = prev_root.getChildNode2(); + prev_root_was_dup = prev_root.isDuplication(); + final PhylogenyBranch current_branch = branches.get( j ); + g.reRoot( current_branch ); + if ( minimize_mapping_cost || minimize_sum_of_dup ) { + duplications = sdise.updateM( prev_root_was_dup, prev_root_c1, prev_root_c2 ); + } + if ( !used_root_placements.contains( current_branch ) ) { + if ( minimize_mapping_cost ) { + cost = sdise.computeMappingCostL(); + if ( minimize_height && ( cost <= min_cost ) ) { + height__diff = SDIR.moveRootOnBranchToMinHeight( g ); + height = height__diff[ 0 ]; + diff = height__diff[ 1 ]; + } + if ( cost == min_cost ) { + if ( minimize_height ) { + smaller = equal = false; + if ( height < min_height ) { + min_height = height; + counter = 1; + smaller = true; + } + else if ( height == min_height ) { + counter++; + equal = true; + } + if ( Math.abs( diff ) < min_diff ) { + min_diff = Math.abs( diff ); + } + } + if ( return_trees ) { + if ( minimize_height ) { + if ( smaller ) { + trees.clear(); + trees.add( g.copy() ); + } + else if ( equal && ( trees.size() < max_trees_to_return ) ) { + trees.add( g.copy() ); + } + } + else { + counter++; + if ( trees.size() < max_trees_to_return ) { + trees.add( g.copy() ); + } + } + } + else if ( !minimize_height ) { + counter++; + } + } + else if ( cost < min_cost ) { + if ( minimize_height ) { + min_height = height; + min_diff = Math.abs( diff ); + } + if ( return_trees ) { + trees.clear(); + trees.add( g.copy() ); + } + counter = 1; + min_cost = cost; + } + if ( duplications < min_duplications ) { + min_duplications = duplications; + } + } + else if ( minimize_sum_of_dup ) { + if ( minimize_height && ( duplications <= min_duplications ) ) { + height__diff = SDIR.moveRootOnBranchToMinHeight( g ); + height = height__diff[ 0 ]; + diff = height__diff[ 1 ]; + } + if ( duplications == min_duplications ) { + if ( minimize_height ) { + smaller = equal = false; + if ( height < min_height ) { + min_height = height; + counter = 1; + smaller = true; + } + else if ( height == min_height ) { + counter++; + equal = true; + } + if ( Math.abs( diff ) < min_diff ) { + min_diff = Math.abs( diff ); + } + } + if ( return_trees ) { + if ( minimize_height ) { + if ( smaller ) { + trees.clear(); + trees.add( g.copy() ); + } + else if ( equal && ( trees.size() < max_trees_to_return ) ) { + trees.add( g.copy() ); + } + } + else { + counter++; + if ( trees.size() < max_trees_to_return ) { + trees.add( g.copy() ); + } + } + } + else if ( !minimize_height ) { + counter++; + } + } + else if ( duplications < min_duplications ) { + if ( minimize_height ) { + min_height = height; + min_diff = Math.abs( diff ); + } + if ( return_trees ) { + trees.clear(); + trees.add( g.copy() ); + } + counter = 1; + min_duplications = duplications; + } + } + else if ( minimize_height ) { + height__diff = SDIR.moveRootOnBranchToMinHeight( g ); + height = height__diff[ 0 ]; + diff = height__diff[ 1 ]; + if ( Math.abs( diff ) < SDIR.ZERO_DIFF ) { + sdise = new SDIse( g, species_tree ); + min_duplications = sdise.getDuplicationsSum(); + 
min_height = height; + min_diff = Math.abs( diff ); + counter = 1; + if ( return_trees ) { + trees.add( g.copy() ); + } + break F; + } + } + } // if ( used_root_placements.containsKey( current_branch ) ) + used_root_placements.add( current_branch ); + } // End of huge for loop "F". + if ( return_trees ) { + trees.trimToSize(); + tree_array = new Phylogeny[ trees.size() ]; + for( int i = 0; i < trees.size(); ++i ) { + tree_array[ i ] = trees.get( i ); + tree_array[ i ].recalculateNumberOfExternalDescendants( false ); + } + } + setCount( counter ); + setMinimalDuplications( min_duplications ); + setMinimalMappingCost( min_cost ); + setMinimalTreeHeight( min_height ); + setMinimalDiffInSubTreeHeights( Math.abs( min_diff ) ); + return tree_array; + } + + private void init() { + _count = -1; + _min_dup = Integer.MAX_VALUE; + _min_cost = Integer.MAX_VALUE; + _min_height = Double.MAX_VALUE; + _min_diff = Double.MAX_VALUE; + _time_sdi = -1; + } + + private void setCount( final int i ) { + _count = i; + } + + private void setMinimalDiffInSubTreeHeights( final double d ) { + _min_diff = d; + } + + private void setMinimalDuplications( final int i ) { + _min_dup = i; + } + + private void setMinimalMappingCost( final int i ) { + _min_cost = i; + } + + private void setMinimalTreeHeight( final double d ) { + _min_height = d; + } + + // This was totally changed on 2006/10/03. + // Places references to all Branches of Phylogeny t into a List. + // The order is preorder. + // Trees are treated as if they were unrooted (i.e. child 1 and + // child 2 of the root are treated as if they were connected + // directly). + // The resulting List allows to visit all branches without ever + // traversing more than one node at a time. + public static List getBranchesInPreorder( final Phylogeny t ) { + final ArrayList branches = new ArrayList(); + if ( t.isEmpty() || ( t.getNumberOfExternalNodes() <= 1 ) ) { + return branches; + } + if ( t.getNumberOfExternalNodes() == 2 ) { + branches.add( new PhylogenyBranch( t.getRoot().getChildNode1(), t.getRoot().getChildNode2() ) ); + return branches; + } + final Set one = new HashSet(); + final Set two = new HashSet(); + PhylogenyNode node = t.getRoot(); + while ( !node.isRoot() || !two.contains( node.getId() ) ) { + if ( !node.isExternal() && !two.contains( node.getId() ) ) { + if ( !one.contains( node.getId() ) && !two.contains( node.getId() ) ) { + one.add( node.getId() ); + node = node.getChildNode1(); + } + else { + two.add( node.getId() ); + node = node.getChildNode2(); + } + if ( !node.getParent().isRoot() ) { + branches.add( new PhylogenyBranch( node, node.getParent() ) ); + } + else if ( !node.isExternal() ) { + branches.add( new PhylogenyBranch( t.getRoot().getChildNode1(), t.getRoot().getChildNode2() ) ); + } + } + else { + if ( !node.getParent().isRoot() && !node.isExternal() ) { + branches.add( new PhylogenyBranch( node, node.getParent() ) ); + } + node = node.getParent(); + } + } + return branches; + } + + // This places the root of t on its branch in such a way that it + // minimizes the tree height as good as possible. + // Returns the height and the difference in heights of the resulting + // modified Phylogeny t. 
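+ // Illustrative example (assumes positive branch lengths; the numbers are made up):
+ // if the two root-to-tip path lengths through the root's two children differ by
+ // diff = 1.0 and the root branch has total length 2d = 2.0, the root is slid
+ // |diff|/2 = 0.5 towards the longer side, both paths become equally long, and
+ // { old height - 0.5, 0.0 } is returned. If instead 2d = 0.6 < |diff|, the root
+ // is pushed to the end of its branch and the remaining difference
+ // (0.4, carrying the sign of diff) is returned as the second array element.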
+ private static double[] moveRootOnBranchToMinHeight( final Phylogeny t ) { + final PhylogenyNode root = t.getRoot(); + if ( root.getNumberOfDescendants() != 2 ) { + throw new IllegalArgumentException( "attempt to move root to minimize height on root where number of child nodes does not equal two" ); + } + final PhylogenyNode child0 = root.getChildNode( 0 ); + final PhylogenyNode child1 = root.getChildNode( 1 ); + final double newdist = 0.5 * ( ( child0.getDistanceToParent() > 0 ? child0.getDistanceToParent() : 0 ) + ( child1 + .getDistanceToParent() > 0 ? child1.getDistanceToParent() : 0 ) ); + child0.setDistanceToParent( newdist ); + child1.setDistanceToParent( newdist ); + final double d = child0.getDistanceToParent(); + double diff = 0.0; + double height = 0.0; + final double[] height_diff = new double[ 2 ]; + final double l0 = t.calculateSubtreeHeight( t.getRoot().getChildNode( 0 ) ); + final double l1 = t.calculateSubtreeHeight( t.getRoot().getChildNode( 1 ) ); + diff = l0 - l1; + height = t.getHeight(); + if ( d > 0.0 ) { + if ( ( 2 * d ) > Math.abs( diff ) ) { + child0.setDistanceToParent( d - ( diff / 2.0 ) ); + child1.setDistanceToParent( d + ( diff / 2.0 ) ); + height_diff[ 0 ] = height - Math.abs( diff / 2 ); + height_diff[ 1 ] = 0.0; + } + else { + if ( diff > 0 ) { + child0.setDistanceToParent( 0.0 ); + child1.setDistanceToParent( 2 * d ); + height_diff[ 1 ] = diff - ( 2 * d ); + } + else { + child0.setDistanceToParent( 2 * d ); + child1.setDistanceToParent( 0.0 ); + height_diff[ 1 ] = diff + ( 2 * d ); + } + height_diff[ 0 ] = height - d; + } + } + else { + height_diff[ 0 ] = height; + height_diff[ 1 ] = diff; + } + return height_diff; + } +} diff --git a/forester/java/src/org/forester/sdi/SDIse.java b/forester/java/src/org/forester/sdi/SDIse.java new file mode 100644 index 0000000..4509685 --- /dev/null +++ b/forester/java/src/org/forester/sdi/SDIse.java @@ -0,0 +1,203 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.sdi; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Event; + +/* + * Implements our algorithm for speciation - duplication inference (SDI).

    + * Reference:

    • Zmasek, C.M. and Eddy, S.R. (2001) "A simple + * algorithm to infer gene duplication and speciation events on a gene tree". + * Bioinformatics, in press.

    The initialization is accomplished by: + *

• method "linkNodesOfG()" of class SDI: setting the links for + * the external nodes of the gene tree
    • "preorderReID(int)" from class + * Phylogeny: numbering of nodes of the species tree in preorder
    • the + * optional stripping of the species tree is accomplished by method + * "stripTree(Phylogeny,Phylogeny)" of class Phylogeny

    The recursion + * part is accomplished by this class' method + * "geneTreePostOrderTraversal(PhylogenyNode)".

Requires JDK 1.2 or greater. + * + * @see SDI#linkNodesOfG() + * + * @see Phylogeny#preorderReID(int) + * + * @see + * PhylogenyMethods#taxonomyBasedDeletionOfExternalNodes(Phylogeny,Phylogeny) + * + * @see #geneTreePostOrderTraversal(PhylogenyNode) + * + * @author Christian M. Zmasek + * + * @version 1.102 -- last modified: 10/02/01 + */ +public class SDIse extends SDI { + + /** + * Constructor which sets the gene tree and the species tree to be compared. + * species_tree is the species tree to which the gene tree gene_tree will be + * compared - with method "infer(boolean)". Both Trees must be completely + * binary and rooted. The actual inference is accomplished with method + * "infer(boolean)". The mapping cost L can then be calculated with method + * "computeMappingCostL()". + *
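+ * Illustrative usage sketch (gene_tree and species_tree are hypothetical,
+ * rooted, completely binary phylogenies with species names assigned to all
+ * external nodes):
+ *
+ *   SDIse sdi = new SDIse( gene_tree, species_tree ); // events are assigned during construction
+ *   int duplications = sdi.getDuplicationsSum();
+ *   int mapping_cost = sdi.computeMappingCostL();
+ *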

    + * (Last modified: 01/11/01) + * + * @see #infer(boolean) + * @see SDI#computeMappingCostL() + * @param gene_tree + * reference to a rooted binary gene Phylogeny to which assign + * duplication vs speciation, must have species names in the + * species name fields for all external nodes + * @param species_tree + * reference to a rooted binary species Phylogeny which might get + * stripped in the process, must have species names in the + * species name fields for all external nodes + */ + public SDIse( final Phylogeny gene_tree, final Phylogeny species_tree ) { + super( gene_tree, species_tree ); + _duplications_sum = 0; + getSpeciesTree().preOrderReId(); + linkNodesOfG(); + geneTreePostOrderTraversal( getGeneTree().getRoot() ); + } + + // Helper method for updateM( boolean, PhylogenyNode, PhylogenyNode ) + // Calculates M for PhylogenyNode n, given that M for the two children + // of n has been calculated. + // (Last modified: 10/02/01) + private void calculateMforNode( final PhylogenyNode n ) { + if ( !n.isExternal() ) { + final boolean was_duplication = n.isDuplication(); + PhylogenyNode a = n.getChildNode1().getLink(), b = n.getChildNode2().getLink(); + while ( a != b ) { + if ( a.getId() > b.getId() ) { + a = a.getParent(); + } + else { + b = b.getParent(); + } + } + n.setLink( a ); + Event event = null; + if ( ( a == n.getChildNode1().getLink() ) || ( a == n.getChildNode2().getLink() ) ) { + event = Event.createSingleDuplicationEvent(); + if ( !was_duplication ) { + ++_duplications_sum; + } + } + else { + event = Event.createSingleSpeciationEvent(); + if ( was_duplication ) { + --_duplications_sum; + } + } + n.getNodeData().setEvent( event ); + } + } // calculateMforNode( PhylogenyNode ) + + /** + * Traverses the subtree of PhylogenyNode g in postorder, calculating the + * mapping function M, and determines which nodes represent speciation + * events and which ones duplication events. + *

    + * Preconditions: Mapping M for external nodes must have been calculated and + * the species tree must be labelled in preorder. + *
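+ * Informal sketch of the rule implemented below: M(g) is the species-tree node
+ * reached by walking M(child 1 of g) and M(child 2 of g) upwards until they
+ * meet (the preorder ids make this walk well defined); g is labelled a
+ * duplication if and only if M(g) equals the mapping of one of its children.
+ * For example, if child 1 maps to species-tree node X and child 2 maps to a
+ * descendant of X, then M(g) = X = M(child 1) and g is marked a duplication.
+ *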

    + * (Last modified: 01/11/01) + * + * @param g + * starting node of a gene tree - normally the root + */ + void geneTreePostOrderTraversal( final PhylogenyNode g ) { + PhylogenyNode a, b; + if ( !g.isExternal() ) { + geneTreePostOrderTraversal( g.getChildNode( 0 ) ); + geneTreePostOrderTraversal( g.getChildNode( 1 ) ); + a = g.getChildNode( 0 ).getLink(); + b = g.getChildNode( 1 ).getLink(); + while ( a != b ) { + if ( a.getId() > b.getId() ) { + a = a.getParent(); + } + else { + b = b.getParent(); + } + } + g.setLink( a ); + // Determines whether dup. or spec. + Event event = null; + if ( ( a == g.getChildNode( 0 ).getLink() ) || ( a == g.getChildNode( 1 ).getLink() ) ) { + event = Event.createSingleDuplicationEvent(); + ++_duplications_sum; + } + else { + event = Event.createSingleSpeciationEvent(); + } + g.getNodeData().setEvent( event ); + } + } // geneTreePostOrderTraversal( PhylogenyNode ) + + /** + * Updates the mapping function M after the root of the gene tree has been + * moved by one branch. It calculates M for the root of the gene tree and + * one of its two children. + *

    + * To be used ONLY by method "SDIunrooted.fastInfer(Phylogeny,Phylogeny)". + *
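+ * Sketch of the intended call pattern (mirroring its use in SDIR.infer, where
+ * g is the re-rooted gene tree and sdise is this SDIse instance; next_branch
+ * is a hypothetical PhylogenyBranch adjacent to the current root):
+ *
+ *   PhylogenyNode prev_root = g.getRoot();
+ *   PhylogenyNode c1 = prev_root.getChildNode1();
+ *   PhylogenyNode c2 = prev_root.getChildNode2();
+ *   boolean was_dup = prev_root.isDuplication();
+ *   g.reRoot( next_branch );
+ *   int dups = sdise.updateM( was_dup, c1, c2 );
+ *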

+ * (Last modified: 10/02/01) + * + * @param prev_root_was_dup + * true if the previous root was a duplication, false otherwise + * @param prev_root_c1 + * child 1 of the previous root + * @param prev_root_c2 + * child 2 of the previous root + * @return number of duplications which have been assigned in gene tree + */ + int updateM( final boolean prev_root_was_dup, final PhylogenyNode prev_root_c1, final PhylogenyNode prev_root_c2 ) { + final PhylogenyNode root = getGeneTree().getRoot(); + if ( ( root.getChildNode1() == prev_root_c1 ) || ( root.getChildNode2() == prev_root_c1 ) ) { + calculateMforNode( prev_root_c1 ); + } + else { + calculateMforNode( prev_root_c2 ); + } + Event event = null; + if ( prev_root_was_dup ) { + event = Event.createSingleDuplicationEvent(); + } + else { + event = Event.createSingleSpeciationEvent(); + } + root.getNodeData().setEvent( event ); + calculateMforNode( root ); + return getDuplicationsSum(); + } // updateM( boolean, PhylogenyNode, PhylogenyNode ) +} // End of class SDIse. diff --git a/forester/java/src/org/forester/sdi/Shin.java b/forester/java/src/org/forester/sdi/Shin.java new file mode 100644 index 0000000..436e8d5 --- /dev/null +++ b/forester/java/src/org/forester/sdi/Shin.java @@ -0,0 +1,134 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.sdi; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.forester.io.parsers.phyloxml.PhyloXmlParser; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Taxonomy; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.ForesterUtil; + +public class Shin { + + public Shin() { + } + + private void analyze( final Phylogeny gene_tree, + final String gene_tree_file_name, + final Phylogeny[] species_trees, + final File out_dir ) throws IOException { + final boolean minimize_cost = true; + final boolean minimize_sum_of_dup = true; + final boolean minimize_height = true; + final int trees_to_return = 1; + System.out.println( gene_tree_file_name + ": " + gene_tree.getName() ); + final Set species_tree_species = getAllExternalSpecies( species_trees[ 0 ] ); + final PhylogenyWriter w = new PhylogenyWriter(); + for( final Phylogeny species_tree : species_trees ) { + PhylogenyMethods.deleteExternalNodesPositiveSelection( species_tree_species, gene_tree ); + if ( gene_tree.isEmpty() ) { + System.out.println( " >> empty: " + gene_tree_file_name + ": " + gene_tree.getName() ); + continue; + } + final File outfile = new File( out_dir + ForesterUtil.FILE_SEPARATOR + gene_tree_file_name ); + if ( outfile.exists() ) { + System.out + .println( " >> already exists, skipping: " + gene_tree_file_name + ": " + gene_tree.getName() ); + } + final SDIR sdir = new SDIR(); + final Phylogeny[] analyzed_gene_trees = sdir.infer( gene_tree, + species_tree, + minimize_cost, + minimize_sum_of_dup, + minimize_height, + true, + trees_to_return ); + final int duplications = sdir.getMinimalDuplications(); + final int mapping_cost = sdir.getMinimalMappingCost(); + final List phys = new ArrayList(); + for( final Phylogeny phy : analyzed_gene_trees ) { + phys.add( phy ); + } + w.toPhyloXML( outfile, phys, 0, ForesterUtil.LINE_SEPARATOR ); + } + } + + private void checkSpeciesTreesForEqualNumberOfExtNodes( final Phylogeny[] species_trees ) { + int ext_nodes = -1; + for( final Phylogeny phylogeny : species_trees ) { + if ( ext_nodes < 0 ) { + ext_nodes = phylogeny.getNumberOfExternalNodes(); + } + else if ( ext_nodes != phylogeny.getNumberOfExternalNodes() ) { + throw new IllegalArgumentException( "species trees must have all the same number of external nodes" ); + } + } + } + + public void method1( final List gene_tree_files, final Phylogeny[] species_trees, final File out_dir ) + throws IOException { + checkSpeciesTreesForEqualNumberOfExtNodes( species_trees ); + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + for( final File gene_tree_file : gene_tree_files ) { + if ( ForesterUtil.isReadableFile( gene_tree_file ) != null ) { + throw new IOException( "[" + gene_tree_file + "] is not readable" ); + } + Phylogeny[] gene_trees = null; + gene_trees = factory.create( gene_tree_file, new PhyloXmlParser() ); + if ( gene_trees.length != 1 ) { + throw new IOException( "[" + gene_tree_file + "] contains " + gene_trees.length + + " gene trees, expecting precisely one" ); + } + analyze( gene_trees[ 0 ], 
gene_tree_file.getName(), species_trees, out_dir ); + } + } + + private static Set getAllExternalSpecies( final Phylogeny phy ) { + final Set specs = new HashSet(); + for( final PhylogenyNodeIterator it = phy.iteratorExternalForward(); it.hasNext(); ) { + final PhylogenyNode n = it.next(); + if ( n.getNodeData().isHasTaxonomy() ) { + specs.add( n.getNodeData().getTaxonomy() ); + } + else { + throw new IllegalArgumentException( "node " + n.getId() + " has no taxonomic data" ); + } + } + return specs; + } +} diff --git a/forester/java/src/org/forester/sdi/TaxonomyAssigner.java b/forester/java/src/org/forester/sdi/TaxonomyAssigner.java new file mode 100644 index 0000000..3e3fe86 --- /dev/null +++ b/forester/java/src/org/forester/sdi/TaxonomyAssigner.java @@ -0,0 +1,71 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.sdi; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Taxonomy; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; + +public class TaxonomyAssigner extends SDI { + + public TaxonomyAssigner( final Phylogeny gene_tree, final Phylogeny species_tree ) { + super( gene_tree, species_tree ); + getSpeciesTree().preOrderReId(); + linkNodesOfG(); + geneTreePostOrderTraversal( getGeneTree().getRoot() ); + } + + void geneTreePostOrderTraversal( final PhylogenyNode g ) { + if ( !g.isExternal() ) { + for( final PhylogenyNodeIterator iter = g.iterateChildNodesForward(); iter.hasNext(); ) { + geneTreePostOrderTraversal( iter.next() ); + } + final PhylogenyNode[] linked_nodes = new PhylogenyNode[ g.getNumberOfDescendants() ]; + for( int i = 0; i < linked_nodes.length; ++i ) { + linked_nodes[ i ] = g.getChildNode( i ).getLink(); + } + final int[] min_max = GSDI.obtainMinMaxIdIndices( linked_nodes ); + int min_i = min_max[ 0 ]; + int max_i = min_max[ 1 ]; + while ( linked_nodes[ min_i ] != linked_nodes[ max_i ] ) { + linked_nodes[ max_i ] = linked_nodes[ max_i ].getParent(); + final int[] min_max_ = GSDI.obtainMinMaxIdIndices( linked_nodes ); + min_i = min_max_[ 0 ]; + max_i = min_max_[ 1 ]; + } + final PhylogenyNode s = linked_nodes[ max_i ]; + g.setLink( s ); + if ( s.getNodeData().isHasTaxonomy() ) { + g.getNodeData().setTaxonomy( ( Taxonomy ) s.getNodeData().getTaxonomy().copy() ); + } + } + } + + public static void execute( final Phylogeny gene_tree, final Phylogeny species_tree ) { + new TaxonomyAssigner( gene_tree, species_tree ); + } +} diff --git a/forester/java/src/org/forester/sdi/TestGSDI.java b/forester/java/src/org/forester/sdi/TestGSDI.java new file mode 100644 index 0000000..6cf3c72 --- /dev/null +++ b/forester/java/src/org/forester/sdi/TestGSDI.java @@ -0,0 +1,1215 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.sdi; + +import java.io.IOException; + +import org.forester.development.DevelopmentTools; +import org.forester.io.parsers.nhx.NHXParser; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.data.Event; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; + +public final class TestGSDI { + + private final static Phylogeny createPhylogeny( final String nhx ) throws IOException { + final Phylogeny p = ParserBasedPhylogenyFactory.getInstance().create( nhx, new NHXParser() )[ 0 ]; + p.setRooted( true ); + return p; + } + + private final static Event getEvent( final Phylogeny p, final String n1, final String n2 ) { + return PhylogenyMethods.getInstance().obtainLCA( p.getNode( n1 ), p.getNode( n2 ) ).getNodeData().getEvent(); + } + + public static boolean test() { + if ( !TestGSDI.testGSDI_general() ) { + return false; + } + if ( !TestGSDI.testGSDI_against_binary_gene_tree() ) { + return false; + } + return true; + } + + private static boolean testGSDI_against_binary_gene_tree() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final String multi_species_2_str = "(((((([&&NHX:S=1],[&&NHX:S=2])," + + "([&&NHX:S=3],[&&NHX:S=4],[&&NHX:S=5]))," + + "([&&NHX:S=6],[&&NHX:S=7],[&&NHX:S=8],[&&NHX:S=9]))," + + "([&&NHX:S=10],[&&NHX:S=11]))," + + "([&&NHX:S=12],[&&NHX:S=13],[&&NHX:S=14]))," + + "([&&NHX:S=15],([&&NHX:S=16],[&&NHX:S=17]),([&&NHX:S=18],[&&NHX:S=19],[&&NHX:S=20]),([&&NHX:S=21],[&&NHX:S=22],[&&NHX:S=23],[&&NHX:S=24])));"; + final String gene_2_1_str = "(((((([&&NHX:S=1],[&&NHX:S=2])1_2,([&&NHX:S=3],[&&NHX:S=4]))," + + "([&&NHX:S=6],[&&NHX:S=7])6_7_8_9)1_9,([&&NHX:S=10],[&&NHX:S=11]))," + + "([&&NHX:S=12],[&&NHX:S=13])12_13_14)1_14," + + "([&&NHX:S=15],([&&NHX:S=21],[&&NHX:S=24])21_22_23_24)15_24);"; + final Phylogeny multi_species_2 = factory.create( multi_species_2_str, new NHXParser() )[ 0 ]; + final Phylogeny gene_2_1 = factory.create( gene_2_1_str, new NHXParser() )[ 0 ]; + multi_species_2.setRooted( true ); + gene_2_1.setRooted( true ); + final GSDI sdi = new GSDI( gene_2_1, multi_species_2, false ); + if ( sdi.getSpeciationOrDuplicationEventsSum() != 0 ) { + return false; + } + if ( sdi.getDuplicationsSum() != 0 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testGSDI_general() { + try { + final PhylogenyMethods pm = PhylogenyMethods.getInstance(); + final String s1_ = "((([&&NHX:S=A2],[&&NHX:S=A1]),[&&NHX:S=B],[&&NHX:S=C]),[&&NHX:S=D])"; + final Phylogeny s1 = ParserBasedPhylogenyFactory.getInstance().create( s1_, new NHXParser() )[ 0 ]; + s1.setRooted( true ); + final Phylogeny g1 = TestGSDI + .createPhylogeny( "((((B[&&NHX:S=B],A1[&&NHX:S=A1]),C[&&NHX:S=C]),A2[&&NHX:S=A2]),D[&&NHX:S=D])" ); + final GSDI sdi1 = new GSDI( g1, s1, false ); + if ( sdi1.getDuplicationsSum() != 1 ) { + return false; + } + if ( !pm.obtainLCA( g1.getNode( "B" ), g1.getNode( "A1" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + if ( !pm.obtainLCA( g1.getNode( "C" ), g1.getNode( "A1" ) ).getNodeData().getEvent() + .isSpeciationOrDuplication() ) { + return false; + } + if ( !( pm.obtainLCA( g1.getNode( "A2" ), g1.getNode( "A1" ) ).getNodeData().getEvent().isDuplication() ) ) { + return false; + } + if ( !pm.obtainLCA( g1.getNode( "D" ), 
g1.getNode( "A1" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + final Phylogeny g2 = TestGSDI + .createPhylogeny( "((((A2[&&NHX:S=A2],A1[&&NHX:S=A1]),B[&&NHX:S=B]),C[&&NHX:S=C]),D[&&NHX:S=D])" ); + final GSDI sdi2 = new GSDI( g2, s1, false ); + if ( sdi2.getDuplicationsSum() != 0 ) { + return false; + } + if ( !pm.obtainLCA( g2.getNode( "A1" ), g2.getNode( "A2" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + if ( !pm.obtainLCA( g2.getNode( "A1" ), g2.getNode( "B" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + if ( !pm.obtainLCA( g2.getNode( "A1" ), g2.getNode( "C" ) ).getNodeData().getEvent() + .isSpeciationOrDuplication() ) { + return false; + } + if ( !pm.obtainLCA( g2.getNode( "A1" ), g2.getNode( "D" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + final Phylogeny g3 = TestGSDI + .createPhylogeny( "((((A2[&&NHX:S=A2],A1[&&NHX:S=A1]),C[&&NHX:S=C]),B[&&NHX:S=B]),D[&&NHX:S=D])" ); + final GSDI sdi3 = new GSDI( g3, s1, false ); + if ( sdi3.getDuplicationsSum() != 0 ) { + return false; + } + if ( !pm.obtainLCA( g3.getNode( "A1" ), g3.getNode( "A2" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + if ( !pm.obtainLCA( g3.getNode( "A1" ), g3.getNode( "C" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + if ( !pm.obtainLCA( g3.getNode( "A1" ), g3.getNode( "B" ) ).getNodeData().getEvent() + .isSpeciationOrDuplication() ) { + return false; + } + if ( !pm.obtainLCA( g3.getNode( "A1" ), g3.getNode( "D" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + final Phylogeny g4 = TestGSDI + .createPhylogeny( "(((B[&&NHX:S=B],C1[&&NHX:S=C]),C2[&&NHX:S=C]),D[&&NHX:S=D])" ); + final GSDI sdi4 = new GSDI( g4, s1, false ); + if ( sdi4.getDuplicationsSum() != 1 ) { + return false; + } + if ( !pm.obtainLCA( g4.getNode( "B" ), g4.getNode( "C1" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + if ( !pm.obtainLCA( g4.getNode( "B" ), g4.getNode( "C2" ) ).getNodeData().getEvent().isDuplication() ) { + return false; + } + if ( !pm.obtainLCA( g4.getNode( "B" ), g4.getNode( "D" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + final Phylogeny g5 = TestGSDI + .createPhylogeny( "(((D1[&&NHX:S=D],A1[&&NHX:S=A1]),B[&&NHX:S=B]),((D2[&&NHX:S=D],D3[&&NHX:S=D]),C[&&NHX:S=C]))" ); + final GSDI sdi5 = new GSDI( g5, s1, false ); + if ( sdi5.getDuplicationsSum() != 3 ) { + return false; + } + if ( !pm.obtainLCA( g5.getNode( "D1" ), g5.getNode( "A1" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + if ( !pm.obtainLCA( g5.getNode( "D1" ), g5.getNode( "B" ) ).getNodeData().getEvent().isDuplication() ) { + return false; + } + if ( !pm.obtainLCA( g5.getNode( "D1" ), g5.getNode( "D2" ) ).getNodeData().getEvent().isDuplication() ) { + return false; + } + if ( !pm.obtainLCA( g5.getNode( "D2" ), g5.getNode( "D3" ) ).getNodeData().getEvent().isDuplication() ) { + return false; + } + if ( !pm.obtainLCA( g5.getNode( "C" ), g5.getNode( "D3" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + final Phylogeny species7 = TestGSDI.createPhylogeny( "(((((((([&&NHX:S=a1],[&&NHX:S=a2])," + + "([&&NHX:S=b1],[&&NHX:S=b2])),[&&NHX:S=x]),(([&&NHX:S=m1],[&&NHX:S=m2])," + + "([&&NHX:S=n1],[&&NHX:S=n2]))),(([&&NHX:S=i1],[&&NHX:S=i2])," + + "([&&NHX:S=j1],[&&NHX:S=j2]))),(([&&NHX:S=e1],[&&NHX:S=e2])," + + "([&&NHX:S=f1],[&&NHX:S=f2]))),[&&NHX:S=y]),[&&NHX:S=z])" ); + final Phylogeny gene7_2 = TestGSDI + .createPhylogeny( 
"(((((((((a1[&&NHX:S=a1],a2[&&NHX:S=a2]),b1[&&NHX:S=b1]),x[&&NHX:S=x]),m1[&&NHX:S=m1]),i1[&&NHX:S=i1]),j2[&&NHX:S=j2]),e1[&&NHX:S=e1]),y[&&NHX:S=y]),z[&&NHX:S=z])" ); + gene7_2.setRooted( true ); + final GSDI sdi7_2 = new GSDI( gene7_2, species7, false ); + if ( sdi7_2.getDuplicationsSum() != 1 ) { + return false; + } + if ( !TestGSDI.getEvent( gene7_2, "a1", "a2" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( gene7_2, "a1", "b1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( gene7_2, "a1", "x" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( gene7_2, "a1", "m1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( gene7_2, "a1", "i1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( gene7_2, "a1", "j2" ).isDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( gene7_2, "a1", "e1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( gene7_2, "a1", "y" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( gene7_2, "a1", "z" ).isSpeciation() ) { + return false; + } + final String s2_ = "((" + "([&&NHX:S=a1],[&&NHX:S=a2],[&&NHX:S=a3],[&&NHX:S=a4])," + + "([&&NHX:S=b1],[&&NHX:S=b2],[&&NHX:S=b3],[&&NHX:S=b4])," + + "([&&NHX:S=c1],[&&NHX:S=c2],[&&NHX:S=c3],[&&NHX:S=c4])," + + "([&&NHX:S=d1],[&&NHX:S=d2],[&&NHX:S=d3],[&&NHX:S=d4])),(" + + "([&&NHX:S=e1],[&&NHX:S=e2],[&&NHX:S=e3],[&&NHX:S=e4])," + + "([&&NHX:S=f1],[&&NHX:S=f2],[&&NHX:S=f3],[&&NHX:S=f4])," + + "([&&NHX:S=g1],[&&NHX:S=g2],[&&NHX:S=g3],[&&NHX:S=g4])," + + "([&&NHX:S=h1],[&&NHX:S=h2],[&&NHX:S=h3],[&&NHX:S=h4])),(" + + "([&&NHX:S=i1],[&&NHX:S=i2],[&&NHX:S=i3],[&&NHX:S=i4])," + + "([&&NHX:S=j1],[&&NHX:S=j2],[&&NHX:S=j3],[&&NHX:S=j4])," + + "([&&NHX:S=k1],[&&NHX:S=k2],[&&NHX:S=k3],[&&NHX:S=k4])," + + "([&&NHX:S=l1],[&&NHX:S=l2],[&&NHX:S=l3],[&&NHX:S=l4])),(" + + "([&&NHX:S=m1],[&&NHX:S=m2],[&&NHX:S=m3],[&&NHX:S=m4])," + + "([&&NHX:S=n1],[&&NHX:S=n2],[&&NHX:S=n3],[&&NHX:S=n4])," + + "([&&NHX:S=o1],[&&NHX:S=o2],[&&NHX:S=o3],[&&NHX:S=o4])," + + "([&&NHX:S=p1],[&&NHX:S=p2],[&&NHX:S=p3],[&&NHX:S=p4])" + + "),[&&NHX:S=x],[&&NHX:S=y],[&&NHX:S=z])"; + final Phylogeny s2 = ParserBasedPhylogenyFactory.getInstance().create( s2_, new NHXParser() )[ 0 ]; + s2.setRooted( true ); + final Phylogeny g2_0 = TestGSDI.createPhylogeny( "(m1[&&NHX:S=m1],m3[&&NHX:S=m3])" ); + final GSDI sdi2_0 = new GSDI( g2_0, s2, false ); + if ( sdi2_0.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_0.getSpeciationOrDuplicationEventsSum() != 0 ) { + return false; + } + if ( sdi2_0.getSpeciationsSum() != 1 ) { + return false; + } + if ( !pm.obtainLCA( g2_0.getNode( "m1" ), g2_0.getNode( "m3" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + final Phylogeny g2_1 = TestGSDI.createPhylogeny( "(e2[&&NHX:S=e2],h2[&&NHX:S=h2])" ); + final GSDI sdi2_1 = new GSDI( g2_1, s2, false ); + if ( sdi2_1.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_1.getSpeciationOrDuplicationEventsSum() != 0 ) { + return false; + } + if ( sdi2_1.getSpeciationsSum() != 1 ) { + return false; + } + if ( !pm.obtainLCA( g2_1.getNode( "e2" ), g2_1.getNode( "h2" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + final Phylogeny g2_2 = TestGSDI.createPhylogeny( "(e2[&&NHX:S=e2],p4[&&NHX:S=p4])" ); + final GSDI sdi2_2 = new GSDI( g2_2, s2, false ); + if ( sdi2_2.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_2.getSpeciationOrDuplicationEventsSum() != 0 ) { + return false; + } + if ( 
sdi2_2.getSpeciationsSum() != 1 ) { + return false; + } + if ( !pm.obtainLCA( g2_2.getNode( "e2" ), g2_2.getNode( "p4" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + final Phylogeny g2_3 = TestGSDI.createPhylogeny( "(e2a[&&NHX:S=e2],e2b[&&NHX:S=e2])" ); + final GSDI sdi2_3 = new GSDI( g2_3, s2, false ); + if ( sdi2_3.getDuplicationsSum() != 1 ) { + return false; + } + if ( sdi2_3.getSpeciationOrDuplicationEventsSum() != 0 ) { + return false; + } + if ( sdi2_3.getSpeciationsSum() != 0 ) { + return false; + } + if ( !pm.obtainLCA( g2_3.getNode( "e2a" ), g2_3.getNode( "e2b" ) ).getNodeData().getEvent().isDuplication() ) { + return false; + } + final Phylogeny g2_4 = TestGSDI.createPhylogeny( "((j1[&&NHX:S=j1],j4[&&NHX:S=j4]),i3[&&NHX:S=i3])" ); + final GSDI sdi2_4 = new GSDI( g2_4, s2, false ); + if ( sdi2_4.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_4.getSpeciationOrDuplicationEventsSum() != 0 ) { + return false; + } + if ( sdi2_4.getSpeciationsSum() != 2 ) { + return false; + } + if ( !pm.obtainLCA( g2_4.getNode( "j1" ), g2_4.getNode( "j4" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + if ( !pm.obtainLCA( g2_4.getNode( "j1" ), g2_4.getNode( "i3" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + final Phylogeny g2_5 = TestGSDI.createPhylogeny( "((j1[&&NHX:S=j1],j4[&&NHX:S=j4]),f3[&&NHX:S=f3])" ); + final GSDI sdi2_5 = new GSDI( g2_5, s2, false ); + if ( sdi2_5.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_5.getSpeciationOrDuplicationEventsSum() != 0 ) { + return false; + } + if ( sdi2_5.getSpeciationsSum() != 2 ) { + return false; + } + if ( !pm.obtainLCA( g2_5.getNode( "j1" ), g2_5.getNode( "j4" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + if ( !pm.obtainLCA( g2_5.getNode( "j1" ), g2_5.getNode( "f3" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + final Phylogeny g2_6 = TestGSDI.createPhylogeny( "((j3[&&NHX:S=j3],i4[&&NHX:S=i4]),f3[&&NHX:S=f3])" ); + final GSDI sdi2_6 = new GSDI( g2_6, s2, false ); + if ( sdi2_6.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_6.getSpeciationOrDuplicationEventsSum() != 0 ) { + return false; + } + if ( sdi2_6.getSpeciationsSum() != 2 ) { + return false; + } + if ( !pm.obtainLCA( g2_6.getNode( "j3" ), g2_6.getNode( "i4" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + if ( !pm.obtainLCA( g2_6.getNode( "j3" ), g2_6.getNode( "f3" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + final Phylogeny g2_7 = TestGSDI.createPhylogeny( "((j1[&&NHX:S=j1],k1[&&NHX:S=k1]),i1[&&NHX:S=i1])" ); + final GSDI sdi2_7 = new GSDI( g2_7, s2, false ); + if ( sdi2_7.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_7.getSpeciationOrDuplicationEventsSum() != 1 ) { + return false; + } + if ( sdi2_7.getSpeciationsSum() != 1 ) { + return false; + } + if ( !pm.obtainLCA( g2_7.getNode( "j1" ), g2_7.getNode( "k1" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + if ( !pm.obtainLCA( g2_7.getNode( "j1" ), g2_7.getNode( "i1" ) ).getNodeData().getEvent() + .isSpeciationOrDuplication() ) { + return false; + } + final Phylogeny g2_8 = TestGSDI.createPhylogeny( "(j1[&&NHX:S=j1],(k1[&&NHX:S=k1],i1[&&NHX:S=i1]))" ); + final GSDI sdi2_8 = new GSDI( g2_8, s2, false ); + if ( sdi2_8.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_8.getSpeciationOrDuplicationEventsSum() != 1 ) { + return false; + } + if ( sdi2_8.getSpeciationsSum() != 1 ) { + return false; + } 
+ if ( !pm.obtainLCA( g2_8.getNode( "j1" ), g2_8.getNode( "k1" ) ).getNodeData().getEvent() + .isSpeciationOrDuplication() ) { + return false; + } + if ( !pm.obtainLCA( g2_8.getNode( "k1" ), g2_8.getNode( "i1" ) ).getNodeData().getEvent().isSpeciation() ) { + return false; + } + final Phylogeny g2_9 = TestGSDI.createPhylogeny( "((j1[&&NHX:S=j1],k4[&&NHX:S=k4]),f2[&&NHX:S=f2])" ); + final GSDI sdi2_9 = new GSDI( g2_9, s2, false ); + if ( sdi2_9.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_9.getSpeciationOrDuplicationEventsSum() != 0 ) { + return false; + } + if ( sdi2_9.getSpeciationsSum() != 2 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_9, "j1", "k4" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_9, "j1", "f2" ).isSpeciation() ) { + return false; + } + final Phylogeny g2_10 = TestGSDI.createPhylogeny( "((m1[&&NHX:S=m1],k4[&&NHX:S=k4]),f2[&&NHX:S=f2])" ); + final GSDI sdi2_10 = new GSDI( g2_10, s2, false ); + if ( sdi2_10.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_10.getSpeciationOrDuplicationEventsSum() != 1 ) { + return false; + } + if ( sdi2_10.getSpeciationsSum() != 1 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_10, "m1", "k4" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_10, "m1", "f2" ).isSpeciationOrDuplication() ) { + return false; + } + final Phylogeny g2_11 = TestGSDI.createPhylogeny( "((m1[&&NHX:S=m1],k4[&&NHX:S=k4]),x[&&NHX:S=x])" ); + final GSDI sdi2_11 = new GSDI( g2_11, s2, false ); + if ( sdi2_11.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_11.getSpeciationOrDuplicationEventsSum() != 1 ) { + return false; + } + if ( sdi2_11.getSpeciationsSum() != 1 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_11, "m1", "k4" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_11, "m1", "x" ).isSpeciationOrDuplication() ) { + return false; + } + final Phylogeny g2_12 = TestGSDI.createPhylogeny( "(m1[&&NHX:S=m1],(k4[&&NHX:S=k4],x[&&NHX:S=x]))" ); + final GSDI sdi2_12 = new GSDI( g2_12, s2, false ); + if ( sdi2_12.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_12.getSpeciationOrDuplicationEventsSum() != 1 ) { + return false; + } + if ( sdi2_12.getSpeciationsSum() != 1 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_12, "x", "k4" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_12, "m1", "x" ).isSpeciationOrDuplication() ) { + return false; + } + final Phylogeny g2_13 = TestGSDI.createPhylogeny( "(x[&&NHX:S=x],(y[&&NHX:S=y],z[&&NHX:S=z]))" ); + final GSDI sdi2_13 = new GSDI( g2_13, s2, false ); + if ( sdi2_13.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_13.getSpeciationOrDuplicationEventsSum() != 1 ) { + return false; + } + if ( sdi2_13.getSpeciationsSum() != 1 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_13, "y", "z" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_13, "x", "z" ).isSpeciationOrDuplication() ) { + return false; + } + final Phylogeny g2_14 = TestGSDI.createPhylogeny( "(a1_1[&&NHX:S=a1],(b1[&&NHX:S=b1],a1[&&NHX:S=a1]))" ); + final GSDI sdi2_14 = new GSDI( g2_14, s2, false ); + if ( sdi2_14.getDuplicationsSum() != 1 ) { + return false; + } + if ( sdi2_14.getSpeciationOrDuplicationEventsSum() != 0 ) { + return false; + } + if ( sdi2_14.getSpeciationsSum() != 1 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_14, "b1", "a1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_14, "b1", "a1_1" ).isDuplication() ) { + return 
false; + } + final Phylogeny g2_15 = TestGSDI.createPhylogeny( "(a2[&&NHX:S=a2],(b1[&&NHX:S=b1],a1[&&NHX:S=a1]))" ); + final GSDI sdi2_15 = new GSDI( g2_15, s2, false ); + if ( sdi2_15.getDuplicationsSum() != 1 ) { + return false; + } + if ( sdi2_15.getSpeciationOrDuplicationEventsSum() != 0 ) { + return false; + } + if ( sdi2_15.getSpeciationsSum() != 1 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_15, "b1", "a1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_15, "b1", "a2" ).isDuplication() ) { + return false; + } + final Phylogeny g2_16 = TestGSDI.createPhylogeny( "(n2[&&NHX:S=n2],(j3[&&NHX:S=j3],n1[&&NHX:S=n1]))" ); + final GSDI sdi2_16 = new GSDI( g2_16, s2, false ); + if ( sdi2_16.getDuplicationsSum() != 1 ) { + return false; + } + if ( sdi2_16.getSpeciationOrDuplicationEventsSum() != 0 ) { + return false; + } + if ( sdi2_16.getSpeciationsSum() != 1 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_16, "j3", "n1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_16, "j3", "n2" ).isDuplication() ) { + return false; + } + final Phylogeny g2_17 = TestGSDI.createPhylogeny( "(p4[&&NHX:S=p4],(j3[&&NHX:S=j3],n1[&&NHX:S=n1]))" ); + final GSDI sdi2_17 = new GSDI( g2_17, s2, false ); + if ( sdi2_17.getDuplicationsSum() != 1 ) { + return false; + } + if ( sdi2_17.getSpeciationOrDuplicationEventsSum() != 0 ) { + return false; + } + if ( sdi2_17.getSpeciationsSum() != 1 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_17, "j3", "n1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_17, "j3", "p4" ).isDuplication() ) { + return false; + } + final Phylogeny g2_18 = TestGSDI + .createPhylogeny( "((n11[&&NHX:S=n1],n12[&&NHX:S=n1]),(n13[&&NHX:S=n1],n14[&&NHX:S=n1]))" ); + final GSDI sdi2_18 = new GSDI( g2_18, s2, false ); + if ( sdi2_18.getDuplicationsSum() != 3 ) { + return false; + } + if ( sdi2_18.getSpeciationOrDuplicationEventsSum() != 0 ) { + return false; + } + if ( sdi2_18.getSpeciationsSum() != 0 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_18, "n11", "n12" ).isDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_18, "n13", "n14" ).isDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_18, "n11", "n13" ).isDuplication() ) { + return false; + } + final Phylogeny g2_19 = TestGSDI + .createPhylogeny( "((n11[&&NHX:S=n1],n21[&&NHX:S=n2]),(n12[&&NHX:S=n1],n22[&&NHX:S=n2]))" ); + final GSDI sdi2_19 = new GSDI( g2_19, s2, false ); + if ( sdi2_19.getDuplicationsSum() != 1 ) { + return false; + } + if ( sdi2_19.getSpeciationOrDuplicationEventsSum() != 0 ) { + return false; + } + if ( sdi2_19.getSpeciationsSum() != 2 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_19, "n11", "n21" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_19, "n12", "n22" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_19, "n11", "n12" ).isDuplication() ) { + return false; + } + final Phylogeny g2_20 = TestGSDI + .createPhylogeny( "((n11[&&NHX:S=n1],n2[&&NHX:S=n2]),(n12[&&NHX:S=n1],n3[&&NHX:S=n3]))" ); + final GSDI sdi2_20 = new GSDI( g2_20, s2, false ); + if ( sdi2_20.getDuplicationsSum() != 1 ) { + return false; + } + if ( sdi2_20.getSpeciationOrDuplicationEventsSum() != 0 ) { + return false; + } + if ( sdi2_20.getSpeciationsSum() != 2 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_20, "n11", "n2" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_20, "n12", "n3" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( 
g2_20, "n11", "n12" ).isDuplication() ) { + return false; + } + final Phylogeny g2_21 = TestGSDI + .createPhylogeny( "((n1[&&NHX:S=n1],n2[&&NHX:S=n2]),(n3[&&NHX:S=n3],a1[&&NHX:S=a1]))" ); + final GSDI sdi2_21 = new GSDI( g2_21, s2, false ); + if ( sdi2_21.getDuplicationsSum() != 1 ) { + return false; + } + if ( sdi2_21.getSpeciationOrDuplicationEventsSum() != 0 ) { + return false; + } + if ( sdi2_21.getSpeciationsSum() != 2 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_21, "n1", "n2" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_21, "n3", "a1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_21, "n2", "a1" ).isDuplication() ) { + return false; + } + final Phylogeny g2_22 = TestGSDI + .createPhylogeny( "((n1[&&NHX:S=n1],n2[&&NHX:S=n2]),(n3[&&NHX:S=n3],n4[&&NHX:S=n4]))" ); + final GSDI sdi2_22 = new GSDI( g2_22, s2, false ); + if ( sdi2_22.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_22.getSpeciationOrDuplicationEventsSum() != 1 ) { + return false; + } + if ( sdi2_22.getSpeciationsSum() != 2 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_22, "n1", "n2" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_22, "n3", "n4" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_22, "n1", "n3" ).isSpeciationOrDuplication() ) { + return false; + } + final Phylogeny g2_23 = TestGSDI + .createPhylogeny( "((a1[&&NHX:S=a1],b1[&&NHX:S=b1]),(c1[&&NHX:S=c1],d1[&&NHX:S=d1]))" ); + final GSDI sdi2_23 = new GSDI( g2_23, s2, false ); + if ( sdi2_23.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_23.getSpeciationOrDuplicationEventsSum() != 1 ) { + return false; + } + if ( sdi2_23.getSpeciationsSum() != 2 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_23, "a1", "b1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_23, "c1", "d1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_23, "a1", "c1" ).isSpeciationOrDuplication() ) { + return false; + } + final Phylogeny g2_24 = TestGSDI + .createPhylogeny( "((a1[&&NHX:S=a1],e1[&&NHX:S=e1]),(i1[&&NHX:S=i1],m1[&&NHX:S=m1]))" ); + final GSDI sdi2_24 = new GSDI( g2_24, s2, false ); + if ( sdi2_24.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_24.getSpeciationOrDuplicationEventsSum() != 1 ) { + return false; + } + if ( sdi2_24.getSpeciationsSum() != 2 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_24, "a1", "e1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_24, "i1", "m1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_24, "a1", "i1" ).isSpeciationOrDuplication() ) { + return false; + } + final Phylogeny g2_25 = TestGSDI + .createPhylogeny( "((a1[&&NHX:S=a1],a4[&&NHX:S=a4]),(b1[&&NHX:S=b1],c1[&&NHX:S=c1]))" ); + final GSDI sdi2_25 = new GSDI( g2_25, s2, false ); + if ( sdi2_25.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_25.getSpeciationOrDuplicationEventsSum() != 1 ) { + return false; + } + if ( sdi2_25.getSpeciationsSum() != 2 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_25, "a1", "a4" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_25, "b1", "c1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_25, "a1", "b1" ).isSpeciationOrDuplication() ) { + return false; + } + final Phylogeny g2_26 = TestGSDI + .createPhylogeny( "(((a1[&&NHX:S=a1],a4[&&NHX:S=a4]),b1[&&NHX:S=b1]),e1[&&NHX:S=e1])" ); + final GSDI sdi2_26 = new GSDI( g2_26, s2, false ); + if ( 
sdi2_26.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_26.getSpeciationOrDuplicationEventsSum() != 0 ) { + return false; + } + if ( sdi2_26.getSpeciationsSum() != 3 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_26, "a1", "a4" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_26, "a1", "b1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_26, "a1", "e1" ).isSpeciation() ) { + return false; + } + final Phylogeny g2_27 = TestGSDI + .createPhylogeny( "(((a1[&&NHX:S=a1],a4[&&NHX:S=a4]),b1[&&NHX:S=b1]),c1[&&NHX:S=c1])" ); + final GSDI sdi2_27 = new GSDI( g2_27, s2, false ); + if ( sdi2_27.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_27.getSpeciationOrDuplicationEventsSum() != 1 ) { + return false; + } + if ( sdi2_27.getSpeciationsSum() != 2 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_27, "a1", "a4" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_27, "a1", "b1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_27, "a1", "c1" ).isSpeciationOrDuplication() ) { + return false; + } + final Phylogeny g2_28 = TestGSDI + .createPhylogeny( "(((a1[&&NHX:S=a1],b1[&&NHX:S=b1]),c1[&&NHX:S=c1]),e1[&&NHX:S=e1])" ); + final GSDI sdi2_28 = new GSDI( g2_28, s2, false ); + if ( sdi2_28.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_28.getSpeciationOrDuplicationEventsSum() != 1 ) { + return false; + } + if ( sdi2_28.getSpeciationsSum() != 2 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_28, "a1", "b1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_28, "a1", "c1" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_28, "a1", "e1" ).isSpeciation() ) { + return false; + } + final Phylogeny g2_29 = TestGSDI + .createPhylogeny( "(((a1[&&NHX:S=a1],b1[&&NHX:S=b1]),c1[&&NHX:S=c1]),d1[&&NHX:S=d1])" ); + final GSDI sdi2_29 = new GSDI( g2_29, s2, false ); + if ( sdi2_29.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_29.getSpeciationOrDuplicationEventsSum() != 2 ) { + return false; + } + if ( sdi2_29.getSpeciationsSum() != 1 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_29, "a1", "b1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_29, "a1", "c1" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_29, "a1", "d1" ).isSpeciationOrDuplication() ) { + return false; + } + final Phylogeny g2_30 = TestGSDI + .createPhylogeny( "(((a1[&&NHX:S=a1],b1[&&NHX:S=b1]),c1[&&NHX:S=c1]),a2[&&NHX:S=a2])" ); + final GSDI sdi2_30 = new GSDI( g2_30, s2, false ); + if ( sdi2_30.getDuplicationsSum() != 1 ) { + return false; + } + if ( sdi2_30.getSpeciationOrDuplicationEventsSum() != 1 ) { + return false; + } + if ( sdi2_30.getSpeciationsSum() != 1 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_30, "a1", "b1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_30, "a1", "c1" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_30, "a1", "a2" ).isDuplication() ) { + return false; + } + final Phylogeny g2_31 = TestGSDI + .createPhylogeny( "(((a1[&&NHX:S=a1],b1[&&NHX:S=b1]),c1[&&NHX:S=c1]),c2[&&NHX:S=c2])" ); + final GSDI sdi2_31 = new GSDI( g2_31, s2, false ); + if ( sdi2_31.getDuplicationsSum() != 1 ) { + return false; + } + if ( sdi2_31.getSpeciationOrDuplicationEventsSum() != 1 ) { + return false; + } + if ( sdi2_31.getSpeciationsSum() != 1 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_31, "a1", "b1" 
).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_31, "a1", "c1" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_31, "a1", "c2" ).isDuplication() ) { + return false; + } + final Phylogeny g2_32 = TestGSDI + .createPhylogeny( "((((((((((a1[&&NHX:S=a1],a2[&&NHX:S=a2]),b1[&&NHX:S=b1]),c1[&&NHX:S=c1]),d1[&&NHX:S=d1]),x[&&NHX:S=x]),p1[&&NHX:S=p1]),i1[&&NHX:S=i1]),e1[&&NHX:S=e1]),y[&&NHX:S=y]),z[&&NHX:S=z])" ); + final GSDI sdi2_32 = new GSDI( g2_32, s2, false ); + if ( sdi2_32.getDuplicationsSum() != 0 ) { + return false; + } + if ( sdi2_32.getSpeciationOrDuplicationEventsSum() != 7 ) { + return false; + } + if ( sdi2_32.getSpeciationsSum() != 3 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_32, "a1", "a2" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_32, "a1", "b1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_32, "a1", "c1" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_32, "a1", "d1" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_32, "a1", "x" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_32, "a1", "p1" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_32, "a1", "i1" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_32, "a1", "e1" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_32, "a1", "y" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_32, "a1", "z" ).isSpeciationOrDuplication() ) { + return false; + } + final Phylogeny g2_33 = TestGSDI + .createPhylogeny( "(((((((((((a1[&&NHX:S=a1],a2[&&NHX:S=a2]),b1[&&NHX:S=b1]),c1[&&NHX:S=c1]),d1[&&NHX:S=d1]),x[&&NHX:S=x]),p1[&&NHX:S=p1]),i1[&&NHX:S=i1]),k2[&&NHX:S=k2]),e1[&&NHX:S=e1]),y[&&NHX:S=y]),z[&&NHX:S=z])" ); + final GSDI sdi2_33 = new GSDI( g2_33, s2, false ); + if ( sdi2_33.getDuplicationsSum() != 1 ) { + return false; + } + if ( sdi2_33.getSpeciationOrDuplicationEventsSum() != 7 ) { + return false; + } + if ( sdi2_33.getSpeciationsSum() != 3 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_33, "a1", "a2" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_33, "a1", "b1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_33, "a1", "c1" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_33, "a1", "d1" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_33, "a1", "x" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_33, "a1", "p1" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_33, "a1", "i1" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_33, "a1", "k2" ).isDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_33, "a1", "e1" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_33, "a1", "y" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_33, "a1", "z" ).isSpeciationOrDuplication() ) { + return false; + } + final Phylogeny g2_34 = TestGSDI + .createPhylogeny( "(((n1_0[&&NHX:S=n1],n2_0[&&NHX:S=n2]),(n1_1[&&NHX:S=n1],n3_0[&&NHX:S=n3])),n4_0[&&NHX:S=n4])" ); + final GSDI sdi2_34 = new GSDI( g2_34, s2, false ); + if ( sdi2_34.getDuplicationsSum() != 1 ) { + return false; + } + if ( 
sdi2_34.getSpeciationOrDuplicationEventsSum() != 1 ) { + return false; + } + if ( sdi2_34.getSpeciationsSum() != 2 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_34, "n1_0", "n2_0" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_34, "n1_1", "n3_0" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_34, "n1_0", "n1_1" ).isDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_34, "n1_0", "n4_0" ).isSpeciationOrDuplication() ) { + return false; + } + final Phylogeny g2_35 = TestGSDI + .createPhylogeny( "((((n1_0[&&NHX:S=n1],n2_0[&&NHX:S=n2]),(n1_1[&&NHX:S=n1],n3_0[&&NHX:S=n3])),n4_0[&&NHX:S=n4]),a1_0[&&NHX:S=a1])" ); + final GSDI sdi2_35 = new GSDI( g2_35, s2, false ); + if ( sdi2_35.getDuplicationsSum() != 1 ) { + return false; + } + if ( sdi2_35.getSpeciationOrDuplicationEventsSum() != 1 ) { + return false; + } + if ( sdi2_35.getSpeciationsSum() != 3 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_35, "n1_0", "n2_0" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_35, "n1_1", "n3_0" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_35, "n1_0", "n1_1" ).isDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_35, "n1_0", "n4_0" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_35, "n1_0", "a1_0" ).isSpeciation() ) { + return false; + } + final Phylogeny g2_36 = TestGSDI + .createPhylogeny( "(((a1_0[&&NHX:S=a1],b1_0[&&NHX:S=b1]),(a1_1[&&NHX:S=a1],c1_0[&&NHX:S=c1])),d1_0[&&NHX:S=d1])" ); + final GSDI sdi2_36 = new GSDI( g2_36, s2, false ); + if ( sdi2_36.getDuplicationsSum() != 1 ) { + return false; + } + if ( sdi2_36.getSpeciationOrDuplicationEventsSum() != 1 ) { + return false; + } + if ( sdi2_36.getSpeciationsSum() != 2 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_36, "a1_0", "b1_0" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_36, "a1_1", "c1_0" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_36, "a1_0", "c1_0" ).isDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_36, "a1_0", "d1_0" ).isSpeciationOrDuplication() ) { + return false; + } + final Phylogeny g2_37 = TestGSDI + .createPhylogeny( "(((a1_0[&&NHX:S=a1],b1_0[&&NHX:S=b1]),(a2_0[&&NHX:S=a2],c1_0[&&NHX:S=c1])),d1_0[&&NHX:S=d1])" ); + final GSDI sdi2_37 = new GSDI( g2_37, s2, false ); + if ( sdi2_37.getDuplicationsSum() != 1 ) { + return false; + } + if ( sdi2_37.getSpeciationOrDuplicationEventsSum() != 1 ) { + return false; + } + if ( sdi2_37.getSpeciationsSum() != 2 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_37, "a1_0", "b1_0" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_37, "a2_0", "c1_0" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_37, "a1_0", "c1_0" ).isDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_37, "a1_0", "d1_0" ).isSpeciationOrDuplication() ) { + return false; + } + final Phylogeny g2_38 = TestGSDI + .createPhylogeny( "(((([&&NHX:S=n1],[&&NHX:S=n1]),([&&NHX:S=n1],[&&NHX:S=n1])),[&&NHX:S=n1]),[&&NHX:S=n1])" ); + final GSDI sdi2_38 = new GSDI( g2_38, s2, false ); + if ( sdi2_38.getDuplicationsSum() != 5 ) { + return false; + } + if ( sdi2_38.getSpeciationOrDuplicationEventsSum() != 0 ) { + return false; + } + if ( sdi2_38.getSpeciationsSum() != 0 ) { + return false; + } + final Phylogeny g2_100 = TestGSDI + .createPhylogeny( 
"(((e1[&&NHX:S=e1],f2[&&NHX:S=f2]),(d3[&&NHX:S=d3],g4[&&NHX:S=g4])),(((a1[&&NHX:S=a1],h2[&&NHX:S=h2]),c3[&&NHX:S=c3]),(i4[&&NHX:S=i4],b1[&&NHX:S=b1])))" ); + final GSDI sdi2_100 = new GSDI( g2_100, s2, false ); + if ( sdi2_100.getDuplicationsSum() != 4 ) { + return false; + } + if ( sdi2_100.getSpeciationOrDuplicationEventsSum() != 0 ) { + return false; + } + if ( sdi2_100.getSpeciationsSum() != 4 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_100, "e1", "f2" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_100, "d3", "g4" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_100, "e1", "d3" ).isDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_100, "a1", "h2" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_100, "a1", "c3" ).isDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_100, "i4", "b1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_100, "a1", "i4" ).isDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_100, "e1", "a1" ).isDuplication() ) { + return false; + } + final Phylogeny g2_101 = TestGSDI + .createPhylogeny( "(((e1[&&NHX:S=e1],f2[&&NHX:S=f2]),(d3[&&NHX:S=d3],g4[&&NHX:S=g4])),(((a1[&&NHX:S=a1],b2[&&NHX:S=b2]),c3[&&NHX:S=c3]),(i4[&&NHX:S=i4],j1[&&NHX:S=j1])))" ); + final GSDI sdi2_101 = new GSDI( g2_101, s2, false ); + if ( sdi2_101.getDuplicationsSum() != 2 ) { + return false; + } + if ( sdi2_101.getSpeciationOrDuplicationEventsSum() != 1 ) { + return false; + } + if ( sdi2_101.getSpeciationsSum() != 5 ) { + return false; + } + if ( !TestGSDI.getEvent( g2_101, "e1", "f2" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_101, "d3", "g4" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_101, "e1", "d3" ).isDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_101, "a1", "b2" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_101, "a1", "c3" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_101, "i4", "j1" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_101, "a1", "i4" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g2_101, "e1", "a1" ).isDuplication() ) { + return false; + } + final Phylogeny s_7_4 = DevelopmentTools.createBalancedPhylogeny( 7, 4 ); + DevelopmentTools.numberSpeciesInOrder( s_7_4 ); + final Phylogeny g_7_4_1 = TestGSDI + .createPhylogeny( "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((" + + "1[&&NHX:S=1],2[&&NHX:S=2]),3[&&NHX:S=3]),4[&&NHX:S=4]),5[&&NHX:S=5])," + + "6[&&NHX:S=6]),7[&&NHX:S=7]),8[&&NHX:S=8]),9[&&NHX:S=9]),10[&&NHX:S=10]),11[&&NHX:S=11])," + + "12[&&NHX:S=12]),13[&&NHX:S=13]),14[&&NHX:S=14]),15[&&NHX:S=15]),16[&&NHX:S=16]),17[&&NHX:S=17])," + + "18[&&NHX:S=18]),19[&&NHX:S=19]),20[&&NHX:S=20]),21[&&NHX:S=21]),22[&&NHX:S=22]),23[&&NHX:S=23])," + + "24[&&NHX:S=24]),25[&&NHX:S=25]),26[&&NHX:S=26]),27[&&NHX:S=27]),28[&&NHX:S=28]),29[&&NHX:S=29])," + + "30[&&NHX:S=30]),31[&&NHX:S=31]),32[&&NHX:S=32]),33[&&NHX:S=33]),34[&&NHX:S=34]),35[&&NHX:S=35])," + + "36[&&NHX:S=36]),37[&&NHX:S=37]),38[&&NHX:S=38]),39[&&NHX:S=39]),40[&&NHX:S=40]),41[&&NHX:S=41])," + + "42[&&NHX:S=42]),43[&&NHX:S=43]),44[&&NHX:S=44]),45[&&NHX:S=45]),46[&&NHX:S=46]),47[&&NHX:S=47])," + + "48[&&NHX:S=48]),49[&&NHX:S=49]),50[&&NHX:S=50]),51[&&NHX:S=51]),52[&&NHX:S=52]),53[&&NHX:S=53])," + + 
"54[&&NHX:S=54]),55[&&NHX:S=55]),56[&&NHX:S=56]),57[&&NHX:S=57]),58[&&NHX:S=58]),59[&&NHX:S=59])," + + "60[&&NHX:S=60]),61[&&NHX:S=61]),62[&&NHX:S=62]),63[&&NHX:S=63]),64[&&NHX:S=64]),65[&&NHX:S=65])" ); + final GSDI sdi7_4_1 = new GSDI( g_7_4_1, s_7_4, false ); + if ( sdi7_4_1.getDuplicationsSum() != 54 ) { + return false; + } + if ( sdi7_4_1.getSpeciationOrDuplicationEventsSum() != 6 ) { + return false; + } + if ( sdi7_4_1.getSpeciationsSum() != 4 ) { + return false; + } + if ( !TestGSDI.getEvent( g_7_4_1, "1", "2" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g_7_4_1, "1", "3" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g_7_4_1, "1", "4" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g_7_4_1, "1", "5" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g_7_4_1, "1", "6" ).isDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g_7_4_1, "1", "9" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g_7_4_1, "1", "13" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g_7_4_1, "1", "17" ).isSpeciation() ) { + return false; + } + if ( !TestGSDI.getEvent( g_7_4_1, "1", "33" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g_7_4_1, "1", "49" ).isSpeciationOrDuplication() ) { + return false; + } + if ( !TestGSDI.getEvent( g_7_4_1, "1", "65" ).isSpeciation() ) { + return false; + } + final Phylogeny g_7_4_2 = TestGSDI + .createPhylogeny( "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((" + + "1[&&NHX:S=1],2[&&NHX:S=2]),3[&&NHX:S=3]),4[&&NHX:S=4]),5[&&NHX:S=5])," + + "6[&&NHX:S=6]),7[&&NHX:S=7]),8[&&NHX:S=8]),9[&&NHX:S=9]),10[&&NHX:S=10]),11[&&NHX:S=11])," + + "12[&&NHX:S=12]),13[&&NHX:S=13]),14[&&NHX:S=14]),15[&&NHX:S=15]),16[&&NHX:S=16]),17[&&NHX:S=17])," + + "18[&&NHX:S=18]),19[&&NHX:S=19]),20[&&NHX:S=20]),21[&&NHX:S=21]),22[&&NHX:S=22]),23[&&NHX:S=23])," + + "24[&&NHX:S=24]),25[&&NHX:S=25]),26[&&NHX:S=26]),27[&&NHX:S=27]),28[&&NHX:S=28]),29[&&NHX:S=29])," + + "30[&&NHX:S=30]),31[&&NHX:S=31]),32[&&NHX:S=32]),33[&&NHX:S=33]),34[&&NHX:S=34]),35[&&NHX:S=35])," + + "36[&&NHX:S=36]),37[&&NHX:S=37]),38[&&NHX:S=38]),39[&&NHX:S=39]),40[&&NHX:S=40]),41[&&NHX:S=41])," + + "42[&&NHX:S=42]),43[&&NHX:S=43]),44[&&NHX:S=44]),45[&&NHX:S=45]),46[&&NHX:S=46]),47[&&NHX:S=47])," + + "48[&&NHX:S=48]),49[&&NHX:S=49]),50[&&NHX:S=50]),51[&&NHX:S=51]),52[&&NHX:S=52]),53[&&NHX:S=53])," + + "54[&&NHX:S=54]),55[&&NHX:S=55]),56[&&NHX:S=56]),57[&&NHX:S=57]),58[&&NHX:S=58]),59[&&NHX:S=59])," + + "60[&&NHX:S=60]),61[&&NHX:S=61]),62[&&NHX:S=62]),63[&&NHX:S=63]),64[&&NHX:S=64]),65[&&NHX:S=65])," + + "66[&&NHX:S=66]),257[&&NHX:S=257]),258[&&NHX:S=258]),513[&&NHX:S=513]),514[&&NHX:S=514]),769[&&NHX:S=769]),770[&&NHX:S=770])" ); + final GSDI sdi7_4_2 = new GSDI( g_7_4_2, s_7_4, false ); + if ( sdi7_4_2.getDuplicationsSum() != 58 ) { + return false; + } + if ( sdi7_4_2.getSpeciationOrDuplicationEventsSum() != 8 ) { + return false; + } + if ( sdi7_4_2.getSpeciationsSum() != 5 ) { + return false; + } + // final String g2_0_ = + // "(([&&NHX:S=a1],[&&NHX:S=a2]),([&&NHX:S=o2],[&&NHX:S=o4]))"; + // final Phylogeny g2_0 = factory.create( g2_0_, new NHXParser() )[ + // 0 ]; + // g2_0.setRooted( true ); + // final GSDI sdi2_0 = new GSDI( g2_0, s2, false ); + // if ( sdi2_0.getDuplicationsSum() != 0 ) { + // return false; + // } + // final String g2_1_= ""; + // final Phylogeny g2_1 = factory.create( g2_1_, 
new NHXParser() )[ + // 0 ]; + // g2_1.setRooted( true ); + // final GSDI sdi2_1 = new GSDI( g2_1, s2, false ); + // if ( sdi2_1.getDuplicationsSum() != 0 ) { + // return false; + // } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } +} diff --git a/forester/java/src/org/forester/sdi/Tuplet.java b/forester/java/src/org/forester/sdi/Tuplet.java new file mode 100644 index 0000000..37426e4 --- /dev/null +++ b/forester/java/src/org/forester/sdi/Tuplet.java @@ -0,0 +1,168 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.sdi; + +class Tuplet implements Comparable { + + public static final int DEFAULT = -999; + private final String _key; + private final double _value1; + private final double _value2; + private final double _value3; + private final double _value4; + private int[] _p; // Since + + Tuplet() { + setSigns(); + _key = ""; + _value1 = Tuplet.DEFAULT; + _value2 = Tuplet.DEFAULT; + _value3 = Tuplet.DEFAULT; + _value4 = Tuplet.DEFAULT; + } + + // distance + // needs to be + // sorted in + // different + // direction than other values, and it is not + // known which value will be the distance. 
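+ // Note on the sign array: setSigns() initializes every entry of _p to +1, which makes
+ // compareTo() order a value column in descending order. The constructor index 'c' marks
+ // the one column holding the distance, which must sort in the opposite (ascending)
+ // direction; flipping _p[c] to -1 reverses the comparison result for that column only.
+ // A column is skipped when either value is DEFAULT or when the two values are equal,
+ // and the comparison then falls through to the next column and finally to the key string.
+ // For example (hypothetical values, c = 0): ("x", 2.0, 0.1, 0) and ("y", 2.0, 0.5, 0)
+ // tie on value1 and are then ordered descending on value2, so "y" sorts before "x".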
+ Tuplet( final String name, + final double value1, + final double value2, + final double value3, + final double value4, + final int c ) { + setSigns(); + _key = name; + _value1 = value1; + _value2 = value2; + _value3 = value3; + _value4 = value4; + if ( ( c >= 0 ) && ( c <= 3 ) ) { + _p[ c ] = -1; + } + } + + Tuplet( final String name, final double value1, final double value2, final double value3, final int c ) { + setSigns(); + _key = name; + _value1 = value1; + _value2 = value2; + _value3 = value3; + _value4 = Tuplet.DEFAULT; + if ( ( c >= 0 ) && ( c <= 2 ) ) { + _p[ c ] = -1; + } + } + + Tuplet( final String name, final double value1, final double value2, final int c ) { + setSigns(); + _key = name; + _value1 = value1; + _value2 = value2; + _value3 = Tuplet.DEFAULT; + _value4 = Tuplet.DEFAULT; + if ( ( c >= 0 ) && ( c <= 1 ) ) { + _p[ c ] = -1; + } + } + + Tuplet( final String name, final double value1, final int c ) { + setSigns(); + _key = name; + _value1 = value1; + _value2 = Tuplet.DEFAULT; + _value3 = Tuplet.DEFAULT; + _value4 = Tuplet.DEFAULT; + if ( c == 0 ) { + _p[ 0 ] = -1; + } + } + + public int compareTo( final Tuplet n ) { + if ( ( getValue1() != Tuplet.DEFAULT ) && ( n.getValue1() != Tuplet.DEFAULT ) ) { + if ( getValue1() < n.getValue1() ) { + return _p[ 0 ]; + } + if ( getValue1() > n.getValue1() ) { + return ( -_p[ 0 ] ); + } + } + if ( ( getValue2() != Tuplet.DEFAULT ) && ( n.getValue2() != Tuplet.DEFAULT ) ) { + if ( getValue2() < n.getValue2() ) { + return _p[ 1 ]; + } + if ( getValue2() > n.getValue2() ) { + return ( -_p[ 1 ] ); + } + } + if ( ( getValue3() != Tuplet.DEFAULT ) && ( n.getValue3() != Tuplet.DEFAULT ) ) { + if ( getValue3() < n.getValue3() ) { + return _p[ 2 ]; + } + if ( getValue3() > n.getValue3() ) { + return ( -_p[ 2 ] ); + } + } + if ( ( getValue4() != Tuplet.DEFAULT ) && ( n.getValue4() != Tuplet.DEFAULT ) ) { + if ( getValue4() < n.getValue4() ) { + return _p[ 3 ]; + } + if ( getValue4() > n.getValue4() ) { + return ( -_p[ 3 ] ); + } + } + return ( getKey().compareTo( n.getKey() ) ); + } + + String getKey() { + return _key; + } + + double getValue1() { + return _value1; + } + + double getValue2() { + return _value2; + } + + double getValue3() { + return _value3; + } + + double getValue4() { + return _value4; + } + + private void setSigns() { + _p = new int[ 4 ]; + _p[ 0 ] = _p[ 1 ] = _p[ 2 ] = _p[ 3 ] = +1; + } +} \ No newline at end of file diff --git a/forester/java/src/org/forester/sequence/BasicSequence.java b/forester/java/src/org/forester/sequence/BasicSequence.java new file mode 100644 index 0000000..4cc03a7 --- /dev/null +++ b/forester/java/src/org/forester/sequence/BasicSequence.java @@ -0,0 +1,91 @@ +// $Id: +// +// forester -- software libraries and applications +// for genomics and evolutionary biology research. +// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.sequence; + +public class BasicSequence implements Sequence { + + private final char[] _mol_sequence; + private final Object _identifier; + private final TYPE _type; + + private BasicSequence( final Object identifier, final String mol_sequence, final TYPE type ) { + _mol_sequence = mol_sequence.toCharArray(); + _identifier = identifier; + _type = type; + } + + // Only use if you know what you are doing! + public BasicSequence( final Object identifier, final char[] mol_sequence, final TYPE type ) { + _mol_sequence = mol_sequence; + _identifier = identifier; + _type = type; + } + + public Object getIdentifier() { + return _identifier; + } + + public int getLength() { + return _mol_sequence.length; + } + + public char[] getMolecularSequence() { + return _mol_sequence; + } + + public char getResidueAt( final int position ) { + return _mol_sequence[ position ]; + } + + public TYPE getType() { + return _type; + } + + @Override + public String toString() { + final StringBuffer sb = new StringBuffer(); + sb.append( _identifier.toString() ); + sb.append( " " ); + sb.append( new String( _mol_sequence ) ); + return sb.toString(); + } + + public static Sequence createAaSequence( final Object identifier, final String mol_sequence ) { + return new BasicSequence( identifier, mol_sequence.toUpperCase().replaceAll( "\\.", GAP_STR ) + .replaceAll( AA_REGEXP, Character.toString( UNSPECIFIED_AA ) ), TYPE.AA ); + } + + public static Sequence createDnaSequence( final Object identifier, final String mol_sequence ) { + return new BasicSequence( identifier, mol_sequence.toUpperCase().replaceAll( "\\.", GAP_STR ) + .replaceAll( DNA_REGEXP, Character.toString( UNSPECIFIED_NUC ) ), TYPE.DNA ); + } + + public static Sequence createRnaSequence( final Object identifier, final String mol_sequence ) { + return new BasicSequence( identifier, mol_sequence.toUpperCase().replaceAll( "\\.", GAP_STR ) + .replaceAll( RNA_REGEXP, Character.toString( UNSPECIFIED_NUC ) ), TYPE.RNA ); + } +} diff --git a/forester/java/src/org/forester/sequence/Sequence.java b/forester/java/src/org/forester/sequence/Sequence.java new file mode 100644 index 0000000..3ee5893 --- /dev/null +++ b/forester/java/src/org/forester/sequence/Sequence.java @@ -0,0 +1,53 @@ +// $Id: +// +// forester -- software libraries and applications +// for genomics and evolutionary biology research. +// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.sequence; + +public interface Sequence { + + public static final char UNSPECIFIED_AA = 'X'; + public static final char UNSPECIFIED_NUC = 'N'; + public static final char GAP = '-'; + public static final String GAP_STR = Character.toString( GAP ); + public static final char TERMINATE = '*'; + static final String AA_REGEXP = "[^ARNDBCQEZGHILKMFPSTWYVXU\\-\\*]"; + static final String DNA_REGEXP = "[^ACGTRYMKWSN\\-\\*]"; + static final String RNA_REGEXP = "[^ACGURYMKWSN\\-\\*]"; + + public abstract Object getIdentifier(); + + public abstract int getLength(); + + public abstract char[] getMolecularSequence(); + + public abstract char getResidueAt( final int position ); + + public abstract TYPE getType(); + + public enum TYPE { + RNA, DNA, AA; + } +} \ No newline at end of file diff --git a/forester/java/src/org/forester/surfacing/AdjactantDirectedBinaryDomainCombination.java b/forester/java/src/org/forester/surfacing/AdjactantDirectedBinaryDomainCombination.java new file mode 100644 index 0000000..43bb0e3 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/AdjactantDirectedBinaryDomainCombination.java @@ -0,0 +1,54 @@ +// $Id: +// Exp $ +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +public class AdjactantDirectedBinaryDomainCombination extends BasicBinaryDomainCombination { + + public AdjactantDirectedBinaryDomainCombination( final DomainId n_terminal, final DomainId c_terminal ) { + super(); + if ( ( n_terminal == null ) || ( c_terminal == null ) ) { + throw new IllegalArgumentException( "attempt to create binary domain combination using null" ); + } + _id_0 = n_terminal; + _id_1 = c_terminal; + } + + public AdjactantDirectedBinaryDomainCombination( final String n_terminal, final String c_terminal ) { + this( new DomainId( n_terminal ), new DomainId( c_terminal ) ); + } + + public static AdjactantDirectedBinaryDomainCombination createInstance( final String ids ) { + if ( ids.indexOf( BinaryDomainCombination.SEPARATOR ) < 1 ) { + throw new IllegalArgumentException( "Unexpected format for binary domain combination [" + ids + "]" ); + } + final String[] ids_ary = ids.split( BinaryDomainCombination.SEPARATOR ); + if ( ids_ary.length != 2 ) { + throw new IllegalArgumentException( "Unexpected format for binary domain combination [" + ids + "]" ); + } + return new AdjactantDirectedBinaryDomainCombination( ids_ary[ 0 ], ids_ary[ 1 ] ); + } +} diff --git a/forester/java/src/org/forester/surfacing/AdjactantDirectedCombinableDomains.java b/forester/java/src/org/forester/surfacing/AdjactantDirectedCombinableDomains.java new file mode 100644 index 0000000..adfd02c --- /dev/null +++ b/forester/java/src/org/forester/surfacing/AdjactantDirectedCombinableDomains.java @@ -0,0 +1,49 @@ +// $Id: +// cmzmasek Exp $ +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org + +package org.forester.surfacing; + +import java.util.ArrayList; +import java.util.List; + +public class AdjactantDirectedCombinableDomains extends BasicCombinableDomains { + + public AdjactantDirectedCombinableDomains( final DomainId n_terminal_key_domain, final Species species ) { + super( n_terminal_key_domain, species ); + } + + @Override + public List toBinaryDomainCombinations() { + final List binary_combinations = new ArrayList( getNumberOfCombinableDomains() ); + for( final DomainId domain : getCombiningDomains().keySet() ) { + // Precondition (!): key domain is most upstream domain. + //TODO ensure this is true. 
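+ // The key domain is passed as the N-terminal argument below. Unlike
+ // BasicBinaryDomainCombination, AdjactantDirectedBinaryDomainCombination does not
+ // reorder its two ids, so the resulting combination preserves the N-terminal to
+ // C-terminal direction and therefore depends on the precondition stated above.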
+ binary_combinations.add( new AdjactantDirectedBinaryDomainCombination( getKeyDomain(), domain ) ); + } + return binary_combinations; + } +} diff --git a/forester/java/src/org/forester/surfacing/BasicBinaryDomainCombination.java b/forester/java/src/org/forester/surfacing/BasicBinaryDomainCombination.java new file mode 100644 index 0000000..9436ce9 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/BasicBinaryDomainCombination.java @@ -0,0 +1,170 @@ +// $Id: +// Exp $ +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import org.forester.util.ForesterUtil; + +public class BasicBinaryDomainCombination implements BinaryDomainCombination { + + DomainId _id_0; + DomainId _id_1; + + BasicBinaryDomainCombination() { + _id_0 = null; + _id_1 = null; + } + + public BasicBinaryDomainCombination( final DomainId id_0, final DomainId id_1 ) { + if ( ( id_0 == null ) || ( id_1 == null ) ) { + throw new IllegalArgumentException( "attempt to create binary domain combination using null" ); + } + if ( id_0.compareTo( id_1 ) < 0 ) { + _id_0 = id_0; + _id_1 = id_1; + } + else { + _id_0 = id_1; + _id_1 = id_0; + } + } + + public BasicBinaryDomainCombination( final String id_0, final String id_1 ) { + this( new DomainId( id_0 ), new DomainId( id_1 ) ); + } + + @Override + public int compareTo( final BinaryDomainCombination binary_domain_combination ) { + if ( binary_domain_combination.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to compare [" + binary_domain_combination.getClass() + "] to " + + "[" + this.getClass() + "]" ); + } + if ( equals( binary_domain_combination ) ) { + return 0; + } + final int x = getId0().compareTo( binary_domain_combination.getId0() ); + if ( x != 0 ) { + return x; + } + else { + return getId1().compareTo( binary_domain_combination.getId1() ); + } + } + + @Override + public boolean equals( final Object o ) { + if ( this == o ) { + return true; + } + else if ( o == null ) { + throw new IllegalArgumentException( "attempt to check [" + this.getClass() + "] equality to null" ); + } + else if ( o.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to check [" + this.getClass() + "] equality to [" + + o.getClass() + "]" ); + } + else { + return ( getId0().equals( ( ( BinaryDomainCombination ) o ).getId0() ) ) + && ( getId1().equals( ( ( BinaryDomainCombination ) o ).getId1() ) ); + } + } + + public DomainId getId0() { + return _id_0; + } + + public DomainId getId1() 
{ + return _id_1; + } + + @Override + public int hashCode() { + return getId0().hashCode() + ( 19 * getId1().hashCode() ); + } + + public StringBuffer toGraphDescribingLanguage( final OutputFormat format, + final String node_attribute, + final String edge_attribute ) { + final StringBuffer sb = new StringBuffer(); + switch ( format ) { + case DOT: + if ( ForesterUtil.isEmpty( node_attribute ) ) { + sb.append( getId0() ); + sb.append( " -- " ); + sb.append( getId1() ); + if ( !ForesterUtil.isEmpty( edge_attribute ) ) { + sb.append( " " ); + sb.append( edge_attribute ); + } + sb.append( ";" ); + } + else { + sb.append( getId0() ); + sb.append( " " ); + sb.append( node_attribute ); + sb.append( ";" ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + sb.append( getId1() ); + sb.append( " " ); + sb.append( node_attribute ); + sb.append( ";" ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + sb.append( getId0() ); + sb.append( " -- " ); + sb.append( getId1() ); + if ( !ForesterUtil.isEmpty( edge_attribute ) ) { + sb.append( " " ); + sb.append( edge_attribute ); + } + sb.append( ";" ); + } + break; + default: + throw new AssertionError( "unknown format:" + format ); + } + return sb; + } + + @Override + public String toString() { + final StringBuffer sb = new StringBuffer(); + sb.append( getId0() ); + sb.append( BinaryDomainCombination.SEPARATOR ); + sb.append( getId1() ); + return sb.toString(); + } + + public static BinaryDomainCombination createInstance( final String ids ) { + if ( ids.indexOf( BinaryDomainCombination.SEPARATOR ) < 1 ) { + throw new IllegalArgumentException( "Unexpected format for binary domain combination [" + ids + "]" ); + } + final String[] ids_ary = ids.split( BinaryDomainCombination.SEPARATOR ); + if ( ids_ary.length != 2 ) { + throw new IllegalArgumentException( "Unexpected format for binary domain combination [" + ids + "]" ); + } + return new BasicBinaryDomainCombination( ids_ary[ 0 ], ids_ary[ 1 ] ); + } +} diff --git a/forester/java/src/org/forester/surfacing/BasicCombinableDomains.java b/forester/java/src/org/forester/surfacing/BasicCombinableDomains.java new file mode 100644 index 0000000..9f108cc --- /dev/null +++ b/forester/java/src/org/forester/surfacing/BasicCombinableDomains.java @@ -0,0 +1,185 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.SortedMap; +import java.util.TreeMap; + +import org.forester.util.DescriptiveStatistics; + +public class BasicCombinableDomains implements CombinableDomains { + + final private DomainId _key_domain; + private int _key_domain_count; + private int _key_domain_proteins_count; + final private Species _species; + final private TreeMap _combining_domains; + private DescriptiveStatistics _key_domain_confidence_statistics; + + public BasicCombinableDomains( final DomainId key_domain, final Species species ) { + _key_domain = key_domain; + _species = species; + _combining_domains = new TreeMap(); + init(); + } + + public void addCombinableDomain( final DomainId protein_domain ) { + if ( getCombiningDomains().containsKey( protein_domain ) ) { + getCombiningDomains().put( protein_domain, getCombiningDomains().get( protein_domain ) + 1 ); + } + else { + getCombiningDomains().put( protein_domain, 1 ); + } + } + + public List getAllDomains() { + final List domains = getCombinableDomains(); + if ( !domains.contains( getKeyDomain() ) ) { + domains.add( getKeyDomain() ); + } + return domains; + } + + public List getCombinableDomains() { + final List domains = new ArrayList( getNumberOfCombinableDomains() ); + for( final DomainId domain : getCombiningDomains().keySet() ) { + domains.add( domain ); + } + return domains; + } + + public SortedMap getCombinableDomainsIds() { + final SortedMap ids = new TreeMap(); + for( final DomainId domain : getCombiningDomains().keySet() ) { + final DomainId pd = domain; + ids.put( pd, getCombiningDomains().get( pd ) ); + } + return ids; + } + + public StringBuilder getCombiningDomainIdsAsStringBuilder() { + final StringBuilder sb = new StringBuilder(); + for( final Iterator iter = getCombiningDomains().keySet().iterator(); iter.hasNext(); ) { + final DomainId key = iter.next(); + sb.append( key.toString() ); + sb.append( " [" ); + final int count = getCombiningDomains().get( key ); + sb.append( count ); + sb.append( "]" ); + if ( iter.hasNext() ) { + sb.append( ", " ); + } + } + return sb; + } + + protected TreeMap getCombiningDomains() { + return _combining_domains; + } + + public DomainId getKeyDomain() { + return _key_domain; + } + + public DescriptiveStatistics getKeyDomainConfidenceDescriptiveStatistics() { + return _key_domain_confidence_statistics; + } + + public int getKeyDomainCount() { + return _key_domain_count; + } + + public int getKeyDomainProteinsCount() { + return _key_domain_proteins_count; + } + + public int getNumberOfCombinableDomains() { + return _combining_domains.size(); + } + + public int getNumberOfProteinsExhibitingCombination( final DomainId protein_domain ) { + if ( getCombiningDomains().containsKey( protein_domain ) ) { + return getCombiningDomains().get( protein_domain ); + } + else { + return 0; + } + } + + public Species getSpecies() { + return _species; + } + + private void init() { + _key_domain_count = 0; + _key_domain_proteins_count = 0; + _key_domain_confidence_statistics = null; + } + + public boolean isCombinable( final DomainId protein_domain ) { + return getCombiningDomains().containsKey( protein_domain ); + } + + public void setKeyDomainConfidenceDescriptiveStatistics( final DescriptiveStatistics key_domain_confidence_statistics ) { + _key_domain_confidence_statistics = key_domain_confidence_statistics; + } + + public void setKeyDomainCount( final int 
key_domain_count ) { + _key_domain_count = key_domain_count; + } + + public void setKeyDomainProteinsCount( final int key_domain_proteins_count ) { + _key_domain_proteins_count = key_domain_proteins_count; + } + + @Override + public List toBinaryDomainCombinations() { + final List binary_combinations = new ArrayList( getNumberOfCombinableDomains() ); + for( final DomainId domain : getCombiningDomains().keySet() ) { + binary_combinations.add( new BasicBinaryDomainCombination( getKeyDomain(), domain ) ); + } + return binary_combinations; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(); + sb.append( getKeyDomain() ); + sb.append( " [" ); + sb.append( getKeyDomainCount() ); + sb.append( ", " ); + sb.append( getKeyDomainProteinsCount() ); + sb.append( ", " ); + sb.append( getNumberOfCombinableDomains() ); + sb.append( "]: " ); + sb.append( getCombiningDomainIdsAsStringBuilder() ); + return sb.toString(); + } +} diff --git a/forester/java/src/org/forester/surfacing/BasicDomain.java b/forester/java/src/org/forester/surfacing/BasicDomain.java new file mode 100644 index 0000000..a5a2da5 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/BasicDomain.java @@ -0,0 +1,224 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import org.forester.go.GoId; +import org.forester.util.ForesterUtil; + +public class BasicDomain implements Domain { + + final private DomainId _id; + final private int _from; + final private int _to; + final private short _number; + final private short _total_count; + final private double _per_sequence_evalue; + final private double _per_sequence_score; + final private double _per_domain_evalue; + final private double _per_domain_score; + + public BasicDomain( final String id_str ) { + if ( ForesterUtil.isEmpty( id_str ) ) { + throw new IllegalArgumentException( "attempt to create protein domain with null or empty id" ); + } + _id = new DomainId( id_str ); + _from = -1; + _to = -1; + _number = -1; + _total_count = -1; + _per_sequence_evalue = -1; + _per_sequence_score = -1; + _per_domain_evalue = -1; + _per_domain_score = -1; + } + + public BasicDomain( final String id_str, + final int from, + final int to, + final short number, + final short total_count, + final double per_sequence_evalue, + final double per_sequence_score ) { + if ( ( from >= to ) || ( from < 0 ) ) { + throw new IllegalArgumentException( "attempt to create protein domain from " + from + " to " + to ); + } + if ( ForesterUtil.isEmpty( id_str ) ) { + throw new IllegalArgumentException( "attempt to create protein domain with null or empty id" ); + } + if ( ( number > total_count ) || ( number < 0 ) ) { + throw new IllegalArgumentException( "attempt to create protein domain number " + number + " out of " + + total_count ); + } + if ( per_sequence_evalue < 0.0 ) { + throw new IllegalArgumentException( "attempt to create protein domain with E-value" ); + } + _id = new DomainId( id_str ); + _from = from; + _to = to; + _number = number; + _total_count = total_count; + _per_sequence_evalue = per_sequence_evalue; + _per_sequence_score = per_sequence_score; + _per_domain_evalue = -1; + _per_domain_score = -1; + } + + public BasicDomain( final String id_str, + final int from, + final int to, + final short number, + final short total_count, + final double per_sequence_evalue, + final double per_sequence_score, + final double per_domain_evalue, + final double per_domain_score ) { + if ( ( from >= to ) || ( from < 0 ) ) { + throw new IllegalArgumentException( "attempt to create protein domain from " + from + " to " + to ); + } + if ( ForesterUtil.isEmpty( id_str ) ) { + throw new IllegalArgumentException( "attempt to create protein domain with null or empty id" ); + } + if ( ( number > total_count ) || ( number < 0 ) ) { + throw new IllegalArgumentException( "attempt to create protein domain number " + number + " out of " + + total_count ); + } + if ( ( per_sequence_evalue < 0.0 ) || ( per_domain_evalue < 0.0 ) ) { + throw new IllegalArgumentException( "attempt to create protein domain with E-value" ); + } + _id = new DomainId( id_str ); + _from = from; + _to = to; + _number = number; + _total_count = total_count; + _per_sequence_evalue = per_sequence_evalue; + _per_sequence_score = per_sequence_score; + _per_domain_evalue = per_domain_evalue; + _per_domain_score = per_domain_score; + } + + public void addGoId( final GoId go_id ) { + getDomainId().getGoIds().add( go_id ); + } + + /** + * Basic domains are compared/sorted based upon their identifiers (case + * insensitive) and their numbers. 
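+ * Note: the implementation below compares the domain identifiers only; the domain
+ * number is not part of the comparison.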
+ * + */ + public int compareTo( final Domain domain ) { + if ( domain.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to compare [" + domain.getClass() + "] to " + "[" + + this.getClass() + "]" ); + } + if ( this == domain ) { + return 0; + } + return getDomainId().compareTo( domain.getDomainId() ); + } + + /** + * Basic domains are considered equal if they have the same identifier (case + * sensitive). + * + */ + @Override + public boolean equals( final Object o ) { + if ( this == o ) { + return true; + } + else if ( o == null ) { + throw new IllegalArgumentException( "attempt to check [" + this.getClass() + "] equality to null" ); + } + else if ( o.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to check [" + this.getClass() + "] equality to " + o + " [" + + o.getClass() + "]" ); + } + else { + return getDomainId().equals( ( ( Domain ) o ).getDomainId() ); + } + } + + public DomainId getDomainId() { + return _id; + } + + public int getFrom() { + return _from; + } + + public GoId getGoId( final int i ) { + return getDomainId().getGoIds().get( i ); + } + + public short getNumber() { + return _number; + } + + public int getNumberOfGoIds() { + return getDomainId().getGoIds().size(); + } + + @Override + public double getPerDomainEvalue() { + return _per_domain_evalue; + } + + @Override + public double getPerDomainScore() { + return _per_domain_score; + } + + public double getPerSequenceEvalue() { + return _per_sequence_evalue; + } + + public double getPerSequenceScore() { + return _per_sequence_score; + } + + public int getTo() { + return _to; + } + + public short getTotalCount() { + return _total_count; + } + + @Override + public int hashCode() { + return getDomainId().getId().hashCode(); + } + + @Override + public String toString() { + return toStringBuffer().toString(); + } + + public StringBuffer toStringBuffer() { + return new StringBuffer( getDomainId().getId() ); + } +} diff --git a/forester/java/src/org/forester/surfacing/BasicDomainSimilarityCalculator.java b/forester/java/src/org/forester/surfacing/BasicDomainSimilarityCalculator.java new file mode 100644 index 0000000..5b5a91e --- /dev/null +++ b/forester/java/src/org/forester/surfacing/BasicDomainSimilarityCalculator.java @@ -0,0 +1,242 @@ +// $Id: +// Exp $ +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.util.ArrayList; +import java.util.List; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.TreeMap; +import java.util.TreeSet; + +import org.forester.util.BasicDescriptiveStatistics; +import org.forester.util.DescriptiveStatistics; + +public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculator { + + final DomainSimilarity.DomainSimilaritySortField _sort; + private final boolean _sort_by_species_count_first; + private final boolean _treat_as_binary_comparison; + + public BasicDomainSimilarityCalculator( final DomainSimilarity.DomainSimilaritySortField sort, + final boolean sort_by_species_count_first, + final boolean treat_as_binary_comparison ) { + _sort = sort; + _sort_by_species_count_first = sort_by_species_count_first; + _treat_as_binary_comparison = treat_as_binary_comparison; + } + + public SortedSet calculateSimilarities( final PairwiseDomainSimilarityCalculator pairwise_calculator, + final List cdc_list, + final boolean ignore_domains_without_combinations_in_any_genome, + final boolean ignore_domains_specific_to_one_genome ) { + if ( cdc_list.size() < 2 ) { + throw new IllegalArgumentException( "attempt to calculate multiple combinable domains similarity for less than two combinale domains collections" ); + } + final SortedSet similarities = new TreeSet(); + final SortedSet keys = new TreeSet(); + for( final GenomeWideCombinableDomains cdc : cdc_list ) { + keys.addAll( ( cdc ).getAllCombinableDomainsIds().keySet() ); + } + for( final DomainId key : keys ) { + final List same_id_cd_list = new ArrayList( cdc_list.size() ); + final List species_with_key_id_domain = new ArrayList(); + for( final GenomeWideCombinableDomains cdc : cdc_list ) { + if ( cdc.contains( key ) ) { + same_id_cd_list.add( cdc.get( key ) ); + species_with_key_id_domain.add( cdc.getSpecies() ); + } + } + if ( ignore_domains_without_combinations_in_any_genome ) { //TODO: test me..........................................<<<<<<<<<<<<< + boolean without_combinations = true; + for( final CombinableDomains cd : same_id_cd_list ) { + if ( cd.getNumberOfCombinableDomains() > 0 ) { + without_combinations = false; + break; + } + } + if ( without_combinations ) { + continue; + } + } + // BIG CHANGE IN LOGIC: Tuesday July 08, 0;55 + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // OLD: if ( same_id_cd_list.size() > 1 ) { + if ( same_id_cd_list.size() > 0 ) { + if ( !ignore_domains_specific_to_one_genome || ( same_id_cd_list.size() > 1 ) ) { + final DomainSimilarity s = calculateSimilarity( pairwise_calculator, same_id_cd_list ); + if ( s != null ) { + similarities.add( s ); + } + else { + throw new RuntimeException( "similarity is null: this should not have happened" ); + } + } + } + // ~~~ NEW: + else { + throw new RuntimeException( "this should not have happened" ); + } + // ~~~ OLD: + // else if ( same_id_cd_list.size() == 1 ) { + // TODO need to go in file + // System.out.println( "only in one species [" + + // species_with_key_id_domain.get( 0 ) + "]: " + key_id ); + //} + //else { + // throw new RuntimeException( "this should not have happened" ); + // } + } + return similarities; + } + + private DomainSimilarity calculateSimilarity( final PairwiseDomainSimilarityCalculator pairwise_calculator, + final List domains_list ) { + if ( domains_list.size() == 1 ) { + // BIG CHANGE IN LOGIC: Tuesday July 08, 0;55 + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // ~~~OLD: + //throw new 
IllegalArgumentException( "attempt to calculate multiple combinable domains similarity for less than two combinable domains" ); + // ~~~new: + final SortedMap species_data = new TreeMap(); + species_data.put( domains_list.get( 0 ).getSpecies(), + createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) ); + return new PrintableDomainSimilarity( domains_list.get( 0 ), + 1.0, + 1.0, + 1.0, + 1.0, + 0.0, + 0, + 0, + 0, + species_data, + getSort(), + isSortBySpeciesCountFirst(), + isTreatAsBinaryComparison() ); + } + final DescriptiveStatistics stat = new BasicDescriptiveStatistics(); + final SortedMap species_data = new TreeMap(); + species_data.put( domains_list.get( 0 ).getSpecies(), createSpeciesSpecificDomainSimilariyData( domains_list + .get( 0 ) ) ); + int max_difference_in_counts = 0; + int max_difference = 0; + final boolean is_domain_combination_based = pairwise_calculator instanceof CombinationsBasedPairwiseDomainSimilarityCalculator; + for( int i = 1; i < domains_list.size(); ++i ) { + species_data.put( domains_list.get( i ).getSpecies(), + createSpeciesSpecificDomainSimilariyData( domains_list.get( i ) ) ); + final CombinableDomains domains_i = domains_list.get( i ); + for( int j = 0; j < i; ++j ) { + final PairwiseDomainSimilarity pairwise_similarity = pairwise_calculator + .calculateSimilarity( domains_i, domains_list.get( j ) ); + final int difference_in_counts = pairwise_similarity.getDifferenceInCounts(); + int difference = 0; + if ( is_domain_combination_based ) { + difference = ( ( CombinationsBasedPairwiseDomainSimilarity ) pairwise_similarity ) + .getNumberOfDifferentDomains(); + } + else { + difference = difference_in_counts; + } + if ( Math.abs( difference_in_counts ) > Math.abs( max_difference_in_counts ) ) { + max_difference_in_counts = difference_in_counts; + } + if ( Math.abs( difference ) > Math.abs( max_difference ) ) { + max_difference = difference; + } + stat.addValue( pairwise_similarity.getSimilarityScore() ); + } + } + if ( stat.getN() < 1 ) { + throw new AssertionError( "empty descriptive statistics: this should not have happened" ); + } + if ( ( stat.getN() != 1 ) && isTreatAsBinaryComparison() ) { + throw new IllegalArgumentException( "attmpt to treat similarity with N not equal to one as binary comparison" ); + } + if ( ( /*stat.getN() != 1 ||*/!isTreatAsBinaryComparison() ) && ( max_difference_in_counts < 0 ) ) { + max_difference_in_counts = Math.abs( max_difference_in_counts ); + if ( !is_domain_combination_based ) { + max_difference = Math.abs( max_difference ); //=max_difference_in_counts for !is_domain_combination_based. 
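+ // Net effect of this block: when the similarity is not treated as a binary comparison,
+ // only the magnitude of the maximal count difference is kept (the sign is dropped);
+ // in the binary case the signed difference is preserved.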
+ } + } + DomainSimilarity similarity = null; + if ( stat.getN() == 1 ) { + similarity = new PrintableDomainSimilarity( domains_list.get( 0 ), + stat.getMin(), + stat.getMax(), + stat.arithmeticMean(), + stat.median(), + 0.0, + stat.getN(), + max_difference_in_counts, + max_difference, + species_data, + getSort(), + isSortBySpeciesCountFirst(), + isTreatAsBinaryComparison() ); + } + else { + similarity = new PrintableDomainSimilarity( domains_list.get( 0 ), + stat.getMin(), + stat.getMax(), + stat.arithmeticMean(), + stat.median(), + stat.sampleStandardDeviation(), + stat.getN(), + max_difference_in_counts, + max_difference, + species_data, + getSort(), + isSortBySpeciesCountFirst(), + isTreatAsBinaryComparison() ); + } + return similarity; + } + + private DomainSimilarity.DomainSimilaritySortField getSort() { + return _sort; + } + + private boolean isSortBySpeciesCountFirst() { + return _sort_by_species_count_first; + } + + private boolean isTreatAsBinaryComparison() { + return _treat_as_binary_comparison; + } + + private static SpeciesSpecificDomainSimilariyData createSpeciesSpecificDomainSimilariyData( final CombinableDomains cd ) { + final SpeciesSpecificDomainSimilariyData sd = new PrintableSpeciesSpecificDomainSimilariyData( cd + .getKeyDomainProteinsCount(), cd.getKeyDomainCount(), cd.getNumberOfCombinableDomains(), cd + .getKeyDomainConfidenceDescriptiveStatistics() ); + for( final DomainId domain : cd.getCombinableDomains() ) { + sd.addProteinsExhibitingCombinationCount( domain, cd.getNumberOfProteinsExhibitingCombination( domain ) ); + } + return sd; + } +} diff --git a/forester/java/src/org/forester/surfacing/BasicGenomeWideCombinableDomains.java b/forester/java/src/org/forester/surfacing/BasicGenomeWideCombinableDomains.java new file mode 100644 index 0000000..ca8bff0 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/BasicGenomeWideCombinableDomains.java @@ -0,0 +1,365 @@ + +package org.forester.surfacing; + +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.TreeMap; +import java.util.TreeSet; + +import org.forester.go.GoId; +import org.forester.surfacing.BinaryDomainCombination.DomainCombinationType; +import org.forester.util.BasicDescriptiveStatistics; +import org.forester.util.DescriptiveStatistics; +import org.forester.util.ForesterUtil; + +public class BasicGenomeWideCombinableDomains implements GenomeWideCombinableDomains { + + private final static NumberFormat FORMATTER = new DecimalFormat( "0.0E0" ); + private static final Comparator DESCENDING_KEY_DOMAIN_COUNT_ORDER = new Comparator() { + + public int compare( final CombinableDomains d1, + final CombinableDomains d2 ) { + if ( d1 + .getKeyDomainCount() < d2 + .getKeyDomainCount() ) { + return 1; + } + else if ( d1 + .getKeyDomainCount() > d2 + .getKeyDomainCount() ) { + return -1; + } + else { + return d1 + .getKeyDomain() + .getId() + .compareTo( d2 + .getKeyDomain() + .getId() ); + } + } + }; + private static final Comparator DESCENDING_KEY_DOMAIN_PROTEINS_COUNT_ORDER = new Comparator() { + + public int compare( final CombinableDomains d1, + final CombinableDomains d2 ) { + if ( d1 + .getKeyDomainProteinsCount() < d2 + .getKeyDomainProteinsCount() ) { + return 1; + } + else if ( d1 + 
.getKeyDomainProteinsCount() > d2 + .getKeyDomainProteinsCount() ) { + return -1; + } + else { + return d1 + .getKeyDomain() + .getId() + .compareTo( d2 + .getKeyDomain() + .getId() ); + } + } + }; + private static final Comparator DESCENDING_COMBINATIONS_COUNT_ORDER = new Comparator() { + + public int compare( final CombinableDomains d1, + final CombinableDomains d2 ) { + if ( d1 + .getNumberOfCombinableDomains() < d2 + .getNumberOfCombinableDomains() ) { + return 1; + } + else if ( d1 + .getNumberOfCombinableDomains() > d2 + .getNumberOfCombinableDomains() ) { + return -1; + } + else { + return d1 + .getKeyDomain() + .getId() + .compareTo( d2 + .getKeyDomain() + .getId() ); + } + } + }; + final private SortedMap _combinable_domains_map; + final private Species _species; + final private DomainCombinationType _dc_type; + + private BasicGenomeWideCombinableDomains( final Species species, final DomainCombinationType dc_type ) { + _combinable_domains_map = new TreeMap(); + _species = species; + _dc_type = dc_type; + } + + private void add( final DomainId key, final CombinableDomains cdc ) { + _combinable_domains_map.put( key, cdc ); + } + + public boolean contains( final DomainId key_id ) { + return _combinable_domains_map.containsKey( key_id ); + } + + public CombinableDomains get( final DomainId key_id ) { + return _combinable_domains_map.get( key_id ); + } + + public SortedMap getAllCombinableDomainsIds() { + return _combinable_domains_map; + } + + @Override + public SortedSet getAllDomainIds() { + final SortedSet domains = new TreeSet(); + for( final DomainId key : getAllCombinableDomainsIds().keySet() ) { + final CombinableDomains cb = getAllCombinableDomainsIds().get( key ); + final List ds = cb.getAllDomains(); + for( final DomainId d : ds ) { + domains.add( d ); + } + } + return domains; + } + + @Override + public DomainCombinationType getDomainCombinationType() { + return _dc_type; + } + + @Override + public SortedSet getMostPromiscuosDomain() { + final SortedSet doms = new TreeSet(); + final int max = ( int ) getPerGenomeDomainPromiscuityStatistics().getMax(); + for( final DomainId key : getAllCombinableDomainsIds().keySet() ) { + final CombinableDomains cb = getAllCombinableDomainsIds().get( key ); + if ( cb.getNumberOfCombinableDomains() == max ) { + doms.add( key ); + } + } + return doms; + } + + @Override + public DescriptiveStatistics getPerGenomeDomainPromiscuityStatistics() { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( final DomainId key : getAllCombinableDomainsIds().keySet() ) { + final CombinableDomains cb = getAllCombinableDomainsIds().get( key ); + stats.addValue( cb.getNumberOfCombinableDomains() ); + } + return stats; + } + + public int getSize() { + return _combinable_domains_map.size(); + } + + public Species getSpecies() { + return _species; + } + + @Override + public SortedSet toBinaryDomainCombinations() { + final SortedSet binary_combinations = new TreeSet(); + for( final DomainId key : getAllCombinableDomainsIds().keySet() ) { + final CombinableDomains cb = getAllCombinableDomainsIds().get( key ); + for( final BinaryDomainCombination b : cb.toBinaryDomainCombinations() ) { + binary_combinations.add( b ); + } + } + return binary_combinations; + } + + @Override + public String toString() { + return toStringBuilder( GenomeWideCombinableDomainsSortOrder.ALPHABETICAL_KEY_ID ).toString(); + } + + // Produces something like: + // 2-oxoacid_dh 5 5 2 4.8E-67 Biotin_lipoyl [4], E3_binding [3] + public StringBuilder toStringBuilder( final 
GenomeWideCombinableDomainsSortOrder sort_order ) { + final StringBuilder sb = new StringBuilder(); + final List combinable_domains = new ArrayList(); + for( final DomainId key : getAllCombinableDomainsIds().keySet() ) { + final CombinableDomains cb = getAllCombinableDomainsIds().get( key ); + combinable_domains.add( cb ); + } + if ( sort_order == GenomeWideCombinableDomainsSortOrder.KEY_DOMAIN_COUNT ) { + Collections.sort( combinable_domains, BasicGenomeWideCombinableDomains.DESCENDING_KEY_DOMAIN_COUNT_ORDER ); + } + else if ( sort_order == GenomeWideCombinableDomainsSortOrder.KEY_DOMAIN_PROTEINS_COUNT ) { + Collections.sort( combinable_domains, + BasicGenomeWideCombinableDomains.DESCENDING_KEY_DOMAIN_PROTEINS_COUNT_ORDER ); + } + else if ( sort_order == GenomeWideCombinableDomainsSortOrder.COMBINATIONS_COUNT ) { + Collections.sort( combinable_domains, BasicGenomeWideCombinableDomains.DESCENDING_COMBINATIONS_COUNT_ORDER ); + } + for( final CombinableDomains cb : combinable_domains ) { + sb.append( ForesterUtil.pad( new StringBuffer( cb.getKeyDomain().toString() ), 18, ' ', false ) ); + sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getKeyDomainCount() ), 8, ' ', false ) ); + sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getKeyDomainProteinsCount() ), 8, ' ', false ) ); + sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getNumberOfCombinableDomains() ), 8, ' ', false ) ); + sb + .append( ForesterUtil + .pad( new StringBuffer( "" + + FORMATTER + .format( cb.getKeyDomainConfidenceDescriptiveStatistics().median() ) ), + 10, + ' ', + false ) ); + sb.append( cb.getCombiningDomainIdsAsStringBuilder() ); + sb.append( ForesterUtil.getLineSeparator() ); + } + return sb; + } + + private static void countDomains( final Map domain_counts, + final Map domain_protein_counts, + final Map stats, + final Set saw_c, + final DomainId id_i, + final double support ) { + if ( domain_counts.containsKey( id_i ) ) { + domain_counts.put( id_i, 1 + domain_counts.get( ( id_i ) ) ); + if ( !saw_c.contains( id_i ) ) { + domain_protein_counts.put( id_i, 1 + domain_protein_counts.get( ( id_i ) ) ); + } + } + else { + stats.put( id_i, new BasicDescriptiveStatistics() ); + domain_counts.put( id_i, 1 ); + domain_protein_counts.put( id_i, 1 ); + } + stats.get( id_i ).addValue( support ); + saw_c.add( id_i ); + } + + public static BasicGenomeWideCombinableDomains createInstance( final List protein_list, + final boolean ignore_combination_with_same_domain, + final Species species ) { + return createInstance( protein_list, + ignore_combination_with_same_domain, + species, + null, + DomainCombinationType.BASIC ); + } + + public static BasicGenomeWideCombinableDomains createInstance( final List protein_list, + final boolean ignore_combination_with_same_domain, + final Species species, + final DomainCombinationType dc_type ) { + return createInstance( protein_list, ignore_combination_with_same_domain, species, null, dc_type ); + } + + public static BasicGenomeWideCombinableDomains createInstance( final List protein_list, + final boolean ignore_combination_with_same_domain, + final Species species, + final Map> domain_id_to_go_ids_map, + final DomainCombinationType dc_type ) { + final BasicGenomeWideCombinableDomains instance = new BasicGenomeWideCombinableDomains( species, dc_type ); + final Map domain_counts = new HashMap(); + final Map domain_protein_counts = new HashMap(); + final Map stats = new HashMap(); + for( final Protein protein : protein_list ) { + if ( !protein.getSpecies().equals( species ) ) { 
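+                // Editor's note -- illustrative usage sketch, not part of the original
+                // patch; kept as a comment so it does not alter this method. The species
+                // label "NEMVE" and the 'proteins' list are hypothetical.
+                //
+                //   final List<Protein> proteins = ...; // e.g. built from a domain-table parse
+                //   final GenomeWideCombinableDomains cd =
+                //       BasicGenomeWideCombinableDomains.createInstance( proteins,
+                //                                                        false, // do not ignore a domain combining with itself
+                //                                                        new BasicSpecies( "NEMVE" ),
+                //                                                        DomainCombinationType.BASIC );
+                //
+                // Every protein in the list must report the species passed in here;
+                // otherwise this branch throws an IllegalArgumentException.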
+ throw new IllegalArgumentException( "species (" + protein.getSpecies() + + ") does not match species of combinable domains collection (" + species + ")" ); + } + final Set saw_i = new HashSet(); + final Set saw_c = new HashSet(); + for( int i = 0; i < protein.getProteinDomains().size(); ++i ) { + final Domain pd_i = protein.getProteinDomain( i ); + final DomainId id_i = pd_i.getDomainId(); + final int current_start = pd_i.getFrom(); + BasicGenomeWideCombinableDomains.countDomains( domain_counts, + domain_protein_counts, + stats, + saw_c, + id_i, + pd_i.getPerSequenceEvalue() ); + if ( !saw_i.contains( id_i ) ) { + if ( dc_type == DomainCombinationType.BASIC ) { + saw_i.add( id_i ); + } + CombinableDomains domain_combination = null; + if ( instance.contains( id_i ) ) { + domain_combination = instance.get( id_i ); + } + else { + if ( dc_type == DomainCombinationType.DIRECTED_ADJACTANT ) { + domain_combination = new AdjactantDirectedCombinableDomains( pd_i.getDomainId(), species ); + } + else if ( dc_type == DomainCombinationType.DIRECTED ) { + domain_combination = new DirectedCombinableDomains( pd_i.getDomainId(), species ); + } + else { + domain_combination = new BasicCombinableDomains( pd_i.getDomainId(), species ); + } + if ( ( domain_id_to_go_ids_map != null ) + && domain_id_to_go_ids_map.containsKey( pd_i.getDomainId() ) ) { + final List go_ids = domain_id_to_go_ids_map.get( pd_i.getDomainId() ); + for( final GoId go_id : go_ids ) { + domain_combination.getKeyDomain().addGoId( go_id ); + } + } + instance.add( id_i, domain_combination ); + } + final Set saw_j = new HashSet(); + if ( ignore_combination_with_same_domain ) { + saw_j.add( id_i ); + } + Domain closest = null; + for( int j = 0; j < protein.getNumberOfProteinDomains(); ++j ) { + if ( ( dc_type != DomainCombinationType.BASIC ) + && ( current_start >= protein.getProteinDomain( j ).getFrom() ) ) { + continue; + } + if ( i != j ) { + final DomainId id = protein.getProteinDomain( j ).getDomainId(); + if ( !saw_j.contains( id ) ) { + saw_j.add( id ); + if ( dc_type != DomainCombinationType.DIRECTED_ADJACTANT ) { + domain_combination + .addCombinableDomain( protein.getProteinDomain( j ).getDomainId() ); + } + else { + if ( closest == null ) { + closest = protein.getProteinDomain( j ); + } + else { + if ( protein.getProteinDomain( j ).getFrom() < closest.getFrom() ) { + closest = protein.getProteinDomain( j ); + } + } + } + } + } + } + if ( ( dc_type == DomainCombinationType.DIRECTED_ADJACTANT ) && ( closest != null ) ) { + domain_combination.addCombinableDomain( closest.getDomainId() ); + } + } + } + } + for( final DomainId key_id : domain_counts.keySet() ) { + instance.get( key_id ).setKeyDomainCount( domain_counts.get( key_id ) ); + instance.get( key_id ).setKeyDomainProteinsCount( domain_protein_counts.get( key_id ) ); + instance.get( key_id ).setKeyDomainConfidenceDescriptiveStatistics( stats.get( key_id ) ); + } + return instance; + } +} diff --git a/forester/java/src/org/forester/surfacing/BasicProtein.java b/forester/java/src/org/forester/surfacing/BasicProtein.java new file mode 100644 index 0000000..bc67c18 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/BasicProtein.java @@ -0,0 +1,175 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. 
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.util.ArrayList; +import java.util.List; +import java.util.SortedSet; +import java.util.TreeSet; + +public class BasicProtein implements Protein { + + private final ProteinId _id; + private final Species _species; + private String _name; + private String _desc; + private String _accession; + private final List _protein_domains; + + public BasicProtein( final String id_str, final String species_str ) { + _id = new ProteinId( id_str ); + _species = new BasicSpecies( species_str ); + _protein_domains = new ArrayList(); + init(); + } + + public void addProteinDomain( final Domain protein_domain ) { + getProteinDomains().add( protein_domain ); + } + + @Override + /** + * If in_nc_order is set to true, this returns true only and only if + * the order in List 'domains' and this protein (as determined by the start positions + * of the domains of this proteins, _not_ by their index) are the same + * (interspersing, 'other', domains in this are ignored). + * If in_nc_order is set to false, this returns true only and only if + * this contains all domains listed in 'domains' (order and count do not matter). + * + * @param domains a list of domain ids in a certain order. 
+ * @param in_nc_order to consider order + * @return + */ + public boolean contains( final List query_domain_ids, final boolean in_nc_order ) { + if ( !in_nc_order ) { + for( final DomainId query_domain_id : query_domain_ids ) { + if ( !getProteinDomainIds().contains( query_domain_id ) ) { + return false; + } + } + return true; + } + else { + int current_start_position = -1; + I: for( final DomainId query_domain_id : query_domain_ids ) { + if ( getProteinDomainIds().contains( query_domain_id ) ) { + final List found_domains = getProteinDomains( query_domain_id ); + final SortedSet ordered_start_positions = new TreeSet(); + for( final Domain found_domain : found_domains ) { + ordered_start_positions.add( found_domain.getFrom() ); + } + for( final int start_position : ordered_start_positions ) { + if ( start_position > current_start_position ) { + current_start_position = start_position; + continue I; + } + } + return false; + } + else { + return false; + } + } + return true; + } + } + + @Override + public String getAccession() { + return _accession; + } + + @Override + public String getDescription() { + return _desc; + } + + @Override + public String getName() { + return _name; + } + + public int getNumberOfProteinDomains() { + return getProteinDomains().size(); + } + + public Domain getProteinDomain( final int index ) { + return _protein_domains.get( index ); + } + + public int getProteinDomainCount( final DomainId domain_id ) { + return getProteinDomains( domain_id ).size(); + } + + private List getProteinDomainIds() { + final List ids = new ArrayList( getProteinDomains().size() ); + for( final Domain domain : getProteinDomains() ) { + ids.add( domain.getDomainId() ); + } + return ids; + } + + public List getProteinDomains() { + return _protein_domains; + } + + public List getProteinDomains( final DomainId domain_id ) { + final List domains = new ArrayList(); + for( final Domain domain : getProteinDomains() ) { + if ( domain.getDomainId().equals( domain_id ) ) { + domains.add( domain ); + } + } + return domains; + } + + public ProteinId getProteinId() { + return _id; + } + + public Species getSpecies() { + return _species; + } + + private void init() { + _desc = ""; + _accession = ""; + _name = ""; + } + + public void setAccession( final String accession ) { + _accession = accession; + } + + public void setDescription( final String description ) { + _desc = description; + } + + public void setName( final String name ) { + _name = name; + } +} diff --git a/forester/java/src/org/forester/surfacing/BasicSpecies.java b/forester/java/src/org/forester/surfacing/BasicSpecies.java new file mode 100644 index 0000000..e425b39 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/BasicSpecies.java @@ -0,0 +1,83 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import org.forester.util.ForesterUtil; + +public class BasicSpecies implements Species { + + final private String _species_id; + + public BasicSpecies( final String species_id ) { + if ( ForesterUtil.isEmpty( species_id ) ) { + throw new IllegalArgumentException( "attempt to create new species from empty or null string" ); + } + _species_id = species_id.trim(); + } + + @Override + public int compareTo( final Species species ) { + if ( this == species ) { + return 0; + } + return getSpeciesId().toLowerCase().compareTo( species.getSpeciesId().toLowerCase() ); + } + + @Override + public boolean equals( final Object o ) { + if ( this == o ) { + return true; + } + else if ( o == null ) { + throw new IllegalArgumentException( "attempt to check [" + this.getClass() + "] equality to null" ); + } + else if ( o.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to check [" + this.getClass() + "] equality to " + o + " [" + + o.getClass() + "]" ); + } + else { + return getSpeciesId().equals( ( ( Species ) o ).getSpeciesId() ); + } + } + + /* (non-Javadoc) + * @see org.forester.surfacing.Species#getSpeciesId() + */ + public String getSpeciesId() { + return _species_id; + } + + @Override + public int hashCode() { + return getSpeciesId().hashCode(); + } + + @Override + public String toString() { + return getSpeciesId(); + } +} diff --git a/forester/java/src/org/forester/surfacing/BinaryDomainCombination.java b/forester/java/src/org/forester/surfacing/BinaryDomainCombination.java new file mode 100644 index 0000000..3637dfe --- /dev/null +++ b/forester/java/src/org/forester/surfacing/BinaryDomainCombination.java @@ -0,0 +1,56 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +public interface BinaryDomainCombination extends Comparable { + + public static final String SEPARATOR = "="; + + public DomainId getId0(); + + public DomainId getId1(); + + public abstract StringBuffer toGraphDescribingLanguage( final OutputFormat format, + final String node_attribute, + String edge_attribute ); + + /** + * This has to return a String representation + * in the following format: + * id0 - id1 + * + * @return a String representation in the form id0 - id1 + */ + public String toString(); + + public static enum DomainCombinationType { + BASIC, DIRECTED, DIRECTED_ADJACTANT; + } + + public static enum OutputFormat { + DOT + } +} \ No newline at end of file diff --git a/forester/java/src/org/forester/surfacing/CombinableDomains.java b/forester/java/src/org/forester/surfacing/CombinableDomains.java new file mode 100644 index 0000000..05fffbf --- /dev/null +++ b/forester/java/src/org/forester/surfacing/CombinableDomains.java @@ -0,0 +1,138 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.util.List; +import java.util.SortedMap; + +import org.forester.util.DescriptiveStatistics; + +public interface CombinableDomains { + + /** + * To add a new combinable domain. + * + * @param protein_domain + */ + public void addCombinableDomain( final DomainId protein_domain ); + + /** + * + * This must return all domains in this set of combinable domains (i.e. + * the key domain and all domains which can combine with the key domain). + * + * @return all domains + */ + List getAllDomains(); + + List getCombinableDomains(); + + /** + * Returns the combinable domain identifiers sorted in alphabetical manner: - + * keys are the combinable domain identifiers - values are the counts of + * proteins exhibiting a particular combination + * + * @return combining domain identifiers sorted in alphabetical manner + */ + public SortedMap getCombinableDomainsIds(); + + public StringBuilder getCombiningDomainIdsAsStringBuilder(); + + /** + * Returns the domain whose combinable domains are in stored in this + * combinable domains. + * + * @return the domain identifier + */ + public DomainId getKeyDomain(); + + /** + * Gets descriptive statistics for the confidence (i.e. E-values) of the key + * domain. 
+ * + * + * @return descriptive statistics for the confidence of the key domain + */ + public DescriptiveStatistics getKeyDomainConfidenceDescriptiveStatistics(); + + /** + * Returns how many times the key domain is present in a given species + * genome. + * + * @return key domain count in species + */ + public int getKeyDomainCount(); + + /** + * Returns how many proteins with the key domain are present in a given + * species genome. + * + * @return key domain proteins count in species + */ + public int getKeyDomainProteinsCount(); + + public int getNumberOfCombinableDomains(); + + public int getNumberOfProteinsExhibitingCombination( final DomainId protein_domain ); + + /** + * Returns the species of this combinable domains. + * + * @return the species + */ + public Species getSpecies(); + + public boolean isCombinable( final DomainId protein_domain ); + + /** + * This is to set descriptive statistics for the confidence (i.e. E-values) + * of the key domain. + * + * + * @param statistics + */ + void setKeyDomainConfidenceDescriptiveStatistics( final DescriptiveStatistics statistics ); + + /** + * Sets how many times the key domain is present in a given species genome. + * + * @param key_domain_count + * key domain count in species + */ + void setKeyDomainCount( final int key_domain_count ); + + /** + * Sets how many proteins with the key domain are present in a given species + * genome. + * + * @param key_domain_proteins_count + * key domain protein count in species + */ + void setKeyDomainProteinsCount( final int key_domain_proteins_count ); + + public List toBinaryDomainCombinations(); +} \ No newline at end of file diff --git a/forester/java/src/org/forester/surfacing/CombinationsBasedPairwiseDomainSimilarity.java b/forester/java/src/org/forester/surfacing/CombinationsBasedPairwiseDomainSimilarity.java new file mode 100644 index 0000000..b3035c5 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/CombinationsBasedPairwiseDomainSimilarity.java @@ -0,0 +1,70 @@ +// $Id: +// cmzmasek Exp $ +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +public class CombinationsBasedPairwiseDomainSimilarity implements PairwiseDomainSimilarity { + + private final int _same_domains; + private final int _different_domains; + private final int _difference_in_counts; + private final double _score; + + public CombinationsBasedPairwiseDomainSimilarity( final int same_domains, + final int different_domains, + final int difference_in_counts ) { + if ( ( same_domains < 0 ) || ( different_domains < 0 ) ) { + throw new IllegalArgumentException( "attempt to use domain counts less than 0" ); + } + _difference_in_counts = difference_in_counts; + _same_domains = same_domains; + _different_domains = different_domains; + if ( _different_domains == 0 ) { + _score = 1.0; + } + else { + _score = ( double ) _same_domains / ( _different_domains + _same_domains ); + } + } + + @Override + public int getDifferenceInCounts() { + return _difference_in_counts; + } + + public int getNumberOfDifferentDomains() { + return _different_domains; + } + + public int getNumberOfSameDomains() { + return _same_domains; + } + + public double getSimilarityScore() { + return _score; + } +} diff --git a/forester/java/src/org/forester/surfacing/CombinationsBasedPairwiseDomainSimilarityCalculator.java b/forester/java/src/org/forester/surfacing/CombinationsBasedPairwiseDomainSimilarityCalculator.java new file mode 100644 index 0000000..a33527d --- /dev/null +++ b/forester/java/src/org/forester/surfacing/CombinationsBasedPairwiseDomainSimilarityCalculator.java @@ -0,0 +1,59 @@ +// $Id: +// 22:43:35 cmzmasek Exp $ +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.util.List; + +public class CombinationsBasedPairwiseDomainSimilarityCalculator implements PairwiseDomainSimilarityCalculator { + + public PairwiseDomainSimilarity calculateSimilarity( final CombinableDomains domains_1, + final CombinableDomains domains_2 ) { + if ( !domains_1.getKeyDomain().equals( domains_2.getKeyDomain() ) ) { + throw new IllegalArgumentException( "attempt to calculate similarity between domain collection with different keys" ); + } + final List d1 = domains_1.getCombinableDomains(); + final List d2 = domains_2.getCombinableDomains(); + int same = 0; + int different = 0; + for( final DomainId domain : d1 ) { + if ( d2.contains( domain ) ) { + same++; + } + else { + different++; + } + } + for( final DomainId domain : d2 ) { + if ( !( d1.contains( domain ) ) ) { + different++; + } + } + final int difference = domains_1.getNumberOfCombinableDomains() - domains_2.getNumberOfCombinableDomains(); + return new CombinationsBasedPairwiseDomainSimilarity( same, different, difference ); + } +} diff --git a/forester/java/src/org/forester/surfacing/CountsBasedPairwiseDomainSimilarity.java b/forester/java/src/org/forester/surfacing/CountsBasedPairwiseDomainSimilarity.java new file mode 100644 index 0000000..b1abfeb --- /dev/null +++ b/forester/java/src/org/forester/surfacing/CountsBasedPairwiseDomainSimilarity.java @@ -0,0 +1,65 @@ +// $Id: +// cmzmasek Exp $ +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +public class CountsBasedPairwiseDomainSimilarity implements PairwiseDomainSimilarity { + + private final double _score; + private final int _copy_number_difference; + + /** + * counts_difference: (counts for domain 1) minus (counts for domain 2). 
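+     * Worked example (editor's addition, numbers invented for illustration):
+     * with counts of 5 and 2, counts_difference = 3 and counts_sum = 7, so the
+     * score computed in the constructor is 1.0 - |3| / 7 = 4/7, about 0.571.
+     * A counts_sum of zero or less, or an absolute difference larger than the
+     * sum, is rejected with an IllegalArgumentException.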
+ * + * + * @param counts_difference value of domain_1 minus value of domain_2 + * @param counts_sum + */ + public CountsBasedPairwiseDomainSimilarity( final int counts_difference, final int counts_sum ) { + if ( counts_sum <= 0 ) { + throw new IllegalArgumentException( "attempt to use copy sum of less than or equal to 0: " + counts_sum ); + } + _copy_number_difference = counts_difference; + final int abs_copy_number_difference = Math.abs( counts_difference ); + if ( abs_copy_number_difference > counts_sum ) { + throw new IllegalArgumentException( "attempt to use absolute copy number difference larger than copy number sum" ); + } + _score = 1.0 - ( double ) abs_copy_number_difference / counts_sum; + } + + /** + * Returns (counts for domain 1) minus (counts for domain 2). + * + */ + public int getDifferenceInCounts() { + return _copy_number_difference; + } + + public double getSimilarityScore() { + return _score; + } +} diff --git a/forester/java/src/org/forester/surfacing/DirectedBinaryDomainCombination.java b/forester/java/src/org/forester/surfacing/DirectedBinaryDomainCombination.java new file mode 100644 index 0000000..4fa9179 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/DirectedBinaryDomainCombination.java @@ -0,0 +1,54 @@ +// $Id: +// Exp $ +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +public class DirectedBinaryDomainCombination extends BasicBinaryDomainCombination { + + public DirectedBinaryDomainCombination( final DomainId n_terminal, final DomainId c_terminal ) { + super(); + if ( ( n_terminal == null ) || ( c_terminal == null ) ) { + throw new IllegalArgumentException( "attempt to create binary domain combination using null" ); + } + _id_0 = n_terminal; + _id_1 = c_terminal; + } + + public DirectedBinaryDomainCombination( final String n_terminal, final String c_terminal ) { + this( new DomainId( n_terminal ), new DomainId( c_terminal ) ); + } + + public static BinaryDomainCombination createInstance( final String ids ) { + if ( ids.indexOf( BinaryDomainCombination.SEPARATOR ) < 1 ) { + throw new IllegalArgumentException( "Unexpected format for binary domain combination [" + ids + "]" ); + } + final String[] ids_ary = ids.split( BinaryDomainCombination.SEPARATOR ); + if ( ids_ary.length != 2 ) { + throw new IllegalArgumentException( "Unexpected format for binary domain combination [" + ids + "]" ); + } + return new DirectedBinaryDomainCombination( ids_ary[ 0 ], ids_ary[ 1 ] ); + } +} diff --git a/forester/java/src/org/forester/surfacing/DirectedCombinableDomains.java b/forester/java/src/org/forester/surfacing/DirectedCombinableDomains.java new file mode 100644 index 0000000..2103d26 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/DirectedCombinableDomains.java @@ -0,0 +1,48 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.util.ArrayList; +import java.util.List; + +public class DirectedCombinableDomains extends BasicCombinableDomains { + + public DirectedCombinableDomains( final DomainId n_terminal_key_domain, final Species species ) { + super( n_terminal_key_domain, species ); + } + + @Override + public List toBinaryDomainCombinations() { + final List binary_combinations = new ArrayList( getNumberOfCombinableDomains() ); + for( final DomainId domain : getCombiningDomains().keySet() ) { + // Precondition (!): key domain is most upstream domain. + //TODO ensure this is true. 
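+            // Editor's note -- illustrative sketch, not part of the original patch;
+            // kept as a comment so it does not change this method. Domain names are
+            // hypothetical. For a protein with domain order A-B (A upstream of B),
+            // an instance of this class keyed on A would emit:
+            //
+            //   new DirectedBinaryDomainCombination( new DomainId( "A" ),   // N-terminal side
+            //                                        new DomainId( "B" ) ); // C-terminal side
+            //
+            // i.e. the key domain always supplies the N-terminal argument, which is
+            // why the precondition noted above matters.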
+ binary_combinations.add( new DirectedBinaryDomainCombination( getKeyDomain(), domain ) ); + } + return binary_combinations; + } +} diff --git a/forester/java/src/org/forester/surfacing/Domain.java b/forester/java/src/org/forester/surfacing/Domain.java new file mode 100644 index 0000000..c2baf20 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/Domain.java @@ -0,0 +1,56 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import org.forester.go.GoId; + +public interface Domain extends Comparable { + + public void addGoId( GoId go_id ); + + public DomainId getDomainId(); + + public int getFrom(); + + public GoId getGoId( int i ); + + public short getNumber(); + + public int getNumberOfGoIds(); + + public double getPerDomainEvalue(); + + public double getPerDomainScore(); + + public double getPerSequenceEvalue(); + + public double getPerSequenceScore(); + + public int getTo(); + + public short getTotalCount(); +} \ No newline at end of file diff --git a/forester/java/src/org/forester/surfacing/DomainArchitectureBasedGenomeSimilarityCalculator.java b/forester/java/src/org/forester/surfacing/DomainArchitectureBasedGenomeSimilarityCalculator.java new file mode 100644 index 0000000..317bec3 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/DomainArchitectureBasedGenomeSimilarityCalculator.java @@ -0,0 +1,333 @@ +// $Id: +// 19:38:35 cmzmasek Exp $ +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.util.HashSet; +import java.util.Set; + +public class DomainArchitectureBasedGenomeSimilarityCalculator { + + public static final double MAX_SIMILARITY_SCORE = 1.0; + public static final double MIN_SIMILARITY_SCORE = 0.0; + final private GenomeWideCombinableDomains _combinable_domains_genome_0; + final private GenomeWideCombinableDomains _combinable_domains_genome_1; + private Set _domain_ids_to_ignore; + private boolean _allow_domains_to_be_ignored; + private Set _all_domains; + private Set _shared_domains; + private Set _domains_specific_to_0; + private Set _domains_specific_to_1; + private Set _all_binary_domain_combinations; + private Set _shared_binary_domain_combinations; + private Set _binary_domain_combinations_specific_to_0; + private Set _binary_domain_combinations_specific_to_1; + + public DomainArchitectureBasedGenomeSimilarityCalculator( final GenomeWideCombinableDomains combinable_domains_genome_0, + final GenomeWideCombinableDomains combinable_domains_genome_1 ) { + if ( ( combinable_domains_genome_0 == null ) || ( combinable_domains_genome_0.getSize() < 1 ) + || ( combinable_domains_genome_1 == null ) || ( combinable_domains_genome_1.getSize() < 1 ) ) { + throw new IllegalArgumentException( "attempt to compare null or empty combinable domains collection" ); + } + if ( combinable_domains_genome_0.getSpecies().equals( combinable_domains_genome_1.getSpecies() ) ) { + throw new IllegalArgumentException( "attempt to compare combinable domains collection from the same species" ); + } + _combinable_domains_genome_0 = combinable_domains_genome_0; + _combinable_domains_genome_1 = combinable_domains_genome_1; + init(); + forceRecalculation(); + } + + public void addDomainIdToIgnore( final DomainId domain_id_to_ignore ) { + forceRecalculation(); + getDomainIdsToIgnore().add( domain_id_to_ignore ); + } + + /** + * This returns a score between 0.0 (no binary domain combination in common) + * and 1.0 (all binary domain combinations in common) measuring the similarity between two + * genomes based on the number of shared binary domain combinations: + * + * t: sum of (distinct) binary domain combinations + * s: sum of shared (distinct) binary domain combinations + * + * 1 - ( ( t - s ) / t ) + * + * @return shared binary domain combinations based similarity score + */ + public double calculateSharedBinaryDomainCombinationBasedGenomeSimilarityScore() { + final double t = getAllBinaryDomainCombinations().size(); + final double s = getSharedBinaryDomainCombinations().size(); + if ( t == 0.0 ) { + return MIN_SIMILARITY_SCORE; + } + return ( MAX_SIMILARITY_SCORE - ( ( t - s ) / t ) ); + } + + /** + * This returns a score between 0.0 (no domains in common) + * and 1.0 (all domains in common) measuring the similarity between two + * genomes based on the number of shared domains: + * + * t: sum of (distinct) domains + * s: sum of shared (distinct) domains + * + * 1 - ( ( t - s ) / t ) + * + * @return shared domains based similarity score + */ + public double calculateSharedDomainsBasedGenomeSimilarityScore() { + final double t = getAllDomains().size(); + final double s = getSharedDomains().size(); + if ( t == 0.0 ) { + return MIN_SIMILARITY_SCORE; + } + return ( MAX_SIMILARITY_SCORE - ( ( t - s ) / t ) ); + } + + public void deleteAllDomainIdsToIgnore() { + forceRecalculation(); + setDomainIdsToIgnore( new HashSet() ); + } + + private void forceRecalculation() { + _all_domains = null; + _shared_domains = null; 
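+        // Editor's note -- illustrative sketch, not part of the original patch; kept
+        // as a comment so it does not change this method. Nulling these cached sets
+        // forces the lazy getters (getAllDomains(), getSharedDomains(), ...) to be
+        // recomputed on next access, e.g. after addDomainIdToIgnore() has been called.
+        // Typical use, with hypothetical genome-wide combinable-domains objects:
+        //
+        //   final DomainArchitectureBasedGenomeSimilarityCalculator calc =
+        //       new DomainArchitectureBasedGenomeSimilarityCalculator( cd_genome_a, cd_genome_b );
+        //   final double score = calc.calculateSharedDomainsBasedGenomeSimilarityScore();
+        //   // with, say, 10 distinct domains overall (t) and 4 shared (s),
+        //   // the score is 1 - ( ( 10 - 4 ) / 10 ) = 0.4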
+ _domains_specific_to_0 = null; + _domains_specific_to_1 = null; + _all_binary_domain_combinations = null; + _shared_binary_domain_combinations = null; + _binary_domain_combinations_specific_to_0 = null; + _binary_domain_combinations_specific_to_1 = null; + } + + /** + * Does not return binary combinations which contain one or two domains + * to be ignored -- if ignoring is allowed. + * + * @return SortedSet + */ + public Set getAllBinaryDomainCombinations() { + if ( _all_binary_domain_combinations == null ) { + final Set all = new HashSet(); + all.addAll( getCombinableDomainsGenome0().toBinaryDomainCombinations() ); + all.addAll( getCombinableDomainsGenome1().toBinaryDomainCombinations() ); + if ( isAllowDomainsToBeIgnored() && !getDomainIdsToIgnore().isEmpty() ) { + _all_binary_domain_combinations = pruneBinaryCombinations( all ); + } + else { + _all_binary_domain_combinations = all; + } + } + return _all_binary_domain_combinations; + } + + /** + * Does not return domains which are to be + * ignored -- if ignoring is allowed. + * + * + * @return + */ + public Set getAllDomains() { + if ( _all_domains == null ) { + final Set all = new HashSet(); + all.addAll( getCombinableDomainsGenome0().getAllDomainIds() ); + all.addAll( getCombinableDomainsGenome1().getAllDomainIds() ); + if ( isAllowDomainsToBeIgnored() && !getDomainIdsToIgnore().isEmpty() ) { + _all_domains = pruneDomains( all ); + } + else { + _all_domains = all; + } + } + return _all_domains; + } + + private Set getBinaryDomainCombinationsSpecificToGenome( final boolean specific_to_genome_0 ) { + final Set specific = new HashSet(); + final Set bc0 = getCombinableDomainsGenome0().toBinaryDomainCombinations(); + final Set bc1 = getCombinableDomainsGenome1().toBinaryDomainCombinations(); + if ( specific_to_genome_0 ) { + for( final BinaryDomainCombination binary_domain_combination0 : bc0 ) { + if ( !bc1.contains( binary_domain_combination0 ) ) { + specific.add( binary_domain_combination0 ); + } + } + } + else { + for( final BinaryDomainCombination binary_domain_combination1 : bc1 ) { + if ( !bc0.contains( binary_domain_combination1 ) ) { + specific.add( binary_domain_combination1 ); + } + } + } + if ( isAllowDomainsToBeIgnored() && !getDomainIdsToIgnore().isEmpty() ) { + return pruneBinaryCombinations( specific ); + } + return specific; + } + + public Set getBinaryDomainCombinationsSpecificToGenome0() { + if ( _binary_domain_combinations_specific_to_0 == null ) { + _binary_domain_combinations_specific_to_0 = getBinaryDomainCombinationsSpecificToGenome( true ); + } + return _binary_domain_combinations_specific_to_0; + } + + public Set getBinaryDomainCombinationsSpecificToGenome1() { + if ( _binary_domain_combinations_specific_to_1 == null ) { + _binary_domain_combinations_specific_to_1 = getBinaryDomainCombinationsSpecificToGenome( false ); + } + return _binary_domain_combinations_specific_to_1; + } + + private GenomeWideCombinableDomains getCombinableDomainsGenome0() { + return _combinable_domains_genome_0; + } + + private GenomeWideCombinableDomains getCombinableDomainsGenome1() { + return _combinable_domains_genome_1; + } + + private Set getDomainIdsToIgnore() { + return _domain_ids_to_ignore; + } + + private Set getDomainsSpecificToGenome( final boolean specific_to_genome_0 ) { + final Set specific = new HashSet(); + final Set d0 = getCombinableDomainsGenome0().getAllDomainIds(); + final Set d1 = getCombinableDomainsGenome1().getAllDomainIds(); + if ( specific_to_genome_0 ) { + for( final DomainId domain0 : d0 ) { + if ( 
!d1.contains( domain0 ) ) { + specific.add( domain0 ); + } + } + } + else { + for( final DomainId domain1 : d1 ) { + if ( !d0.contains( domain1 ) ) { + specific.add( domain1 ); + } + } + } + if ( isAllowDomainsToBeIgnored() && !getDomainIdsToIgnore().isEmpty() ) { + return pruneDomains( specific ); + } + return specific; + } + + public Set getDomainsSpecificToGenome0() { + if ( _domains_specific_to_0 == null ) { + _domains_specific_to_0 = getDomainsSpecificToGenome( true ); + } + return _domains_specific_to_0; + } + + public Set getDomainsSpecificToGenome1() { + if ( _domains_specific_to_1 == null ) { + _domains_specific_to_1 = getDomainsSpecificToGenome( false ); + } + return _domains_specific_to_1; + } + + public Set getSharedBinaryDomainCombinations() { + if ( _shared_binary_domain_combinations == null ) { + final Set shared = new HashSet(); + final Set bc0 = getCombinableDomainsGenome0().toBinaryDomainCombinations(); + final Set bc1 = getCombinableDomainsGenome1().toBinaryDomainCombinations(); + for( final BinaryDomainCombination binary_domain_combination0 : bc0 ) { + if ( bc1.contains( binary_domain_combination0 ) ) { + shared.add( binary_domain_combination0 ); + } + } + _shared_binary_domain_combinations = shared; + if ( isAllowDomainsToBeIgnored() && !getDomainIdsToIgnore().isEmpty() ) { + _shared_binary_domain_combinations = pruneBinaryCombinations( shared ); + } + } + return _shared_binary_domain_combinations; + } + + public Set getSharedDomains() { + if ( _shared_domains == null ) { + final Set shared = new HashSet(); + final Set d0 = getCombinableDomainsGenome0().getAllDomainIds(); + final Set d1 = getCombinableDomainsGenome1().getAllDomainIds(); + for( final DomainId domain0 : d0 ) { + if ( d1.contains( domain0 ) ) { + shared.add( domain0 ); + } + } + _shared_domains = shared; + if ( isAllowDomainsToBeIgnored() && !getDomainIdsToIgnore().isEmpty() ) { + _shared_domains = pruneDomains( shared ); + } + } + return _shared_domains; + } + + private void init() { + deleteAllDomainIdsToIgnore(); + setAllowDomainsToBeIgnored( false ); + } + + private boolean isAllowDomainsToBeIgnored() { + return _allow_domains_to_be_ignored; + } + + private Set pruneBinaryCombinations( final Set all ) { + final Set pruned = new HashSet(); + for( final BinaryDomainCombination bc : all ) { + if ( ( !getDomainIdsToIgnore().contains( bc.getId0() ) ) + && ( !getDomainIdsToIgnore().contains( bc.getId1() ) ) ) { + pruned.add( bc ); + } + } + return pruned; + } + + private Set pruneDomains( final Set all ) { + final Set pruned = new HashSet(); + for( final DomainId d : all ) { + if ( !getDomainIdsToIgnore().contains( d ) ) { + pruned.add( d ); + } + } + return pruned; + } + + public void setAllowDomainsToBeIgnored( final boolean allow_domains_to_be_ignored ) { + forceRecalculation(); + _allow_domains_to_be_ignored = allow_domains_to_be_ignored; + } + + void setDomainIdsToIgnore( final Set domain_ids_to_ignore ) { + forceRecalculation(); + _domain_ids_to_ignore = domain_ids_to_ignore; + } +} \ No newline at end of file diff --git a/forester/java/src/org/forester/surfacing/DomainCountsBasedPairwiseSimilarityCalculator.java b/forester/java/src/org/forester/surfacing/DomainCountsBasedPairwiseSimilarityCalculator.java new file mode 100644 index 0000000..d902e75 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/DomainCountsBasedPairwiseSimilarityCalculator.java @@ -0,0 +1,41 @@ +// $Id: +// 04:20:19 cmzmasek Exp $ +// +// FORESTER -- software libraries and applications +// for evolutionary biology 
research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +public class DomainCountsBasedPairwiseSimilarityCalculator implements PairwiseDomainSimilarityCalculator { + + public PairwiseDomainSimilarity calculateSimilarity( final CombinableDomains domains_1, + final CombinableDomains domains_2 ) { + if ( !domains_1.getKeyDomain().equals( domains_2.getKeyDomain() ) ) { + throw new IllegalArgumentException( "attempt to calculate similarity between domain collection with different keys" ); + } + final int dc1 = domains_1.getKeyDomainCount(); + final int dc2 = domains_2.getKeyDomainCount(); + return new CountsBasedPairwiseDomainSimilarity( dc1 - dc2, dc1 + dc2 ); + } +} diff --git a/forester/java/src/org/forester/surfacing/DomainCountsDifferenceUtil.java b/forester/java/src/org/forester/surfacing/DomainCountsDifferenceUtil.java new file mode 100644 index 0000000..a36aa12 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/DomainCountsDifferenceUtil.java @@ -0,0 +1,825 @@ +// $Id: +// $ +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.Writer; +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.TreeMap; +import java.util.TreeSet; + +import org.forester.go.GoId; +import org.forester.go.GoTerm; +import org.forester.util.BasicDescriptiveStatistics; +import org.forester.util.DescriptiveStatistics; +import org.forester.util.ForesterUtil; + +/* + * Poorly designed static class which essential has one method: + * calculateCopyNumberDifferences. + */ +public final class DomainCountsDifferenceUtil { + + private final static NumberFormat FORMATTER = new DecimalFormat( "0.0E0" ); + private static final COPY_CALCULATION_MODE COPY_CALC_MODE_FOR_HIGH_COPY_TARGET_SPECIES = COPY_CALCULATION_MODE.MIN; + private static final COPY_CALCULATION_MODE COPY_CALC_MODE_FOR_HIGH_COPY_BASE_SPECIES = COPY_CALCULATION_MODE.MIN; + private static final COPY_CALCULATION_MODE COPY_CALC_MODE_FOR_LOW_COPY_SPECIES = COPY_CALCULATION_MODE.MAX; + private static final String PLUS_MINUS_PROTEINS_FILE_DOM_SUFFIX = ".prot"; + + //FIXME really needs to be tested! + private static void addCounts( final SortedMap> copy_counts, + final BinaryDomainCombination dc, + final GenomeWideCombinableDomains genome, + final Set bdc ) { + if ( !copy_counts.containsKey( dc ) ) { + copy_counts.put( dc, new ArrayList() ); + } + if ( bdc.contains( dc ) + && ( ( ( BasicCombinableDomains ) genome.get( dc.getId0() ) ).getCombiningDomains().get( dc.getId1() ) != null ) ) { + final int count = ( ( BasicCombinableDomains ) genome.get( dc.getId0() ) ).getCombiningDomains().get( dc + .getId1() ); + copy_counts.get( dc ).add( count ); + } + else { + copy_counts.get( dc ).add( 0 ); + } + } + + private static void addCounts( final SortedMap> copy_counts, + final DomainId domain, + final GenomeWideCombinableDomains genome ) { + if ( !copy_counts.containsKey( domain ) ) { + copy_counts.put( domain, new ArrayList() ); + } + if ( genome.contains( domain ) ) { + copy_counts.get( domain ).add( genome.get( domain ).getKeyDomainProteinsCount() ); + } + else { + copy_counts.get( domain ).add( 0 ); + } + } + + private static StringBuilder addGoInformation( final DomainId d, + final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map ) { + final StringBuilder sb = new StringBuilder(); + if ( ( domain_id_to_go_ids_map == null ) || domain_id_to_go_ids_map.isEmpty() + || !domain_id_to_go_ids_map.containsKey( d ) ) { + return sb; + } + final List go_ids = domain_id_to_go_ids_map.get( d ); + for( int i = 0; i < go_ids.size(); ++i ) { + final GoId go_id = go_ids.get( i ); + if ( go_id_to_term_map.containsKey( go_id ) ) { + appendGoTerm( sb, go_id_to_term_map.get( go_id ) ); + sb.append( "
    " ); + } + else { + sb.append( "go id \"" + go_id + "\" not found [" + d.getId() + "]" ); + } + } + return sb; + } + + private static void appendGoTerm( final StringBuilder sb, final GoTerm go_term ) { + final GoId go_id = go_term.getGoId(); + sb.append( "" + go_id + + "" ); + sb.append( ":" ); + sb.append( go_term.getName() ); + sb.append( " [" ); + sb.append( go_term.getGoNameSpace().toShortString() ); + sb.append( "]" ); + } + + public static void calculateCopyNumberDifferences( final List genomes, + final SortedMap> protein_lists_per_species, + final List high_copy_base_species, + final List high_copy_target_species, + final List low_copy_species, + final int min_diff, + final Double factor, + final File plain_output_dom, + final File html_output_dom, + final File html_output_dc, + final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map, + final File all_domains_go_ids_out_dom, + final File passing_domains_go_ids_out_dom, + final File proteins_file_base ) throws IOException { + if ( genomes.size() < 1 ) { + throw new IllegalArgumentException( "attempt to use empty list of genomes for domain difference calculation" ); + } + if ( ( high_copy_base_species.size() < 1 ) || ( low_copy_species.size() < 1 ) ) { + throw new IllegalArgumentException( "attempt to use empty list of species for domain difference calculation" ); + } + if ( high_copy_base_species.contains( high_copy_target_species ) + || low_copy_species.contains( high_copy_target_species ) ) { + throw new IllegalArgumentException( "species [" + high_copy_target_species + + "] appears in other list as well" ); + } + if ( min_diff < 0 ) { + throw new IllegalArgumentException( "attempt to use negative addition [" + min_diff + "]" ); + } + if ( factor <= 0.0 ) { + throw new IllegalArgumentException( "attempt to use factor equal or smaller than 0.0 [" + factor + "]" ); + } + SurfacingUtil.checkForOutputFileWriteability( plain_output_dom ); + SurfacingUtil.checkForOutputFileWriteability( html_output_dom ); + SurfacingUtil.checkForOutputFileWriteability( html_output_dc ); + SurfacingUtil.checkForOutputFileWriteability( all_domains_go_ids_out_dom ); + SurfacingUtil.checkForOutputFileWriteability( passing_domains_go_ids_out_dom ); + final Writer plain_writer = new BufferedWriter( new FileWriter( plain_output_dom ) ); + final Writer html_writer = new BufferedWriter( new FileWriter( html_output_dom ) ); + final Writer html_writer_dc = new BufferedWriter( new FileWriter( html_output_dc ) ); + final Writer all_gos_writer = new BufferedWriter( new FileWriter( all_domains_go_ids_out_dom ) ); + final Writer passing_gos_writer = new BufferedWriter( new FileWriter( passing_domains_go_ids_out_dom ) ); + final SortedMap high_copy_base_values = new TreeMap(); + final SortedMap high_copy_target_values = new TreeMap(); + final SortedMap low_copy_values = new TreeMap(); + final SortedMap> high_copy_base_copy_counts = new TreeMap>(); + final SortedMap> high_copy_target_copy_counts = new TreeMap>(); + final SortedMap> low_copy_copy_counts = new TreeMap>(); + final SortedSet all_domains = new TreeSet(); + final SortedMap high_copy_base_values_dc = new TreeMap(); + final SortedMap high_copy_target_values_dc = new TreeMap(); + final SortedMap low_copy_values_dc = new TreeMap(); + final SortedMap> high_copy_base_copy_counts_dc = new TreeMap>(); + final SortedMap> high_copy_target_copy_counts_dc = new TreeMap>(); + final SortedMap> low_copy_copy_counts_dc = new TreeMap>(); + final SortedSet all_dcs = new TreeSet(); + final Map> bdcs_per_genome = new 
HashMap>(); + final SortedSet go_ids_of_passing_domains = new TreeSet(); + final SortedSet go_ids_all = new TreeSet(); + for( final GenomeWideCombinableDomains genome : genomes ) { + final SortedSet domains = genome.getAllDomainIds(); + final SortedSet dcs = genome.toBinaryDomainCombinations(); + final String species = genome.getSpecies().getSpeciesId(); + bdcs_per_genome.put( species, genome.toBinaryDomainCombinations() ); + for( final DomainId d : domains ) { + all_domains.add( d ); + if ( domain_id_to_go_ids_map.containsKey( d ) ) { + go_ids_all.addAll( domain_id_to_go_ids_map.get( d ) ); + } + } + for( final BinaryDomainCombination dc : dcs ) { + all_dcs.add( dc ); + } + } + for( final DomainId domain : all_domains ) { + for( final GenomeWideCombinableDomains genome : genomes ) { + final String species = genome.getSpecies().getSpeciesId(); + if ( high_copy_base_species.contains( species ) ) { + DomainCountsDifferenceUtil.addCounts( high_copy_base_copy_counts, domain, genome ); + } + if ( high_copy_target_species.contains( species ) ) { + DomainCountsDifferenceUtil.addCounts( high_copy_target_copy_counts, domain, genome ); + } + if ( low_copy_species.contains( species ) ) { + DomainCountsDifferenceUtil.addCounts( low_copy_copy_counts, domain, genome ); + } + } + } + for( final BinaryDomainCombination dc : all_dcs ) { + for( final GenomeWideCombinableDomains genome : genomes ) { + final String species = genome.getSpecies().getSpeciesId(); + if ( high_copy_base_species.contains( species ) ) { + DomainCountsDifferenceUtil.addCounts( high_copy_base_copy_counts_dc, dc, genome, bdcs_per_genome + .get( species ) ); + } + if ( high_copy_target_species.contains( species ) ) { + DomainCountsDifferenceUtil.addCounts( high_copy_target_copy_counts_dc, dc, genome, bdcs_per_genome + .get( species ) ); + } + if ( low_copy_species.contains( species ) ) { + DomainCountsDifferenceUtil.addCounts( low_copy_copy_counts_dc, dc, genome, bdcs_per_genome + .get( species ) ); + } + } + } + for( final DomainId domain : all_domains ) { + calculateDomainCountsBasedValue( high_copy_target_values, + high_copy_target_copy_counts, + domain, + COPY_CALC_MODE_FOR_HIGH_COPY_TARGET_SPECIES ); + calculateDomainCountsBasedValue( high_copy_base_values, + high_copy_base_copy_counts, + domain, + COPY_CALC_MODE_FOR_HIGH_COPY_BASE_SPECIES ); + calculateDomainCountsBasedValue( low_copy_values, + low_copy_copy_counts, + domain, + COPY_CALC_MODE_FOR_LOW_COPY_SPECIES ); + } + for( final BinaryDomainCombination dc : all_dcs ) { + calculateDomainCountsBasedValue( high_copy_target_values_dc, + high_copy_target_copy_counts_dc, + dc, + COPY_CALC_MODE_FOR_HIGH_COPY_TARGET_SPECIES ); + calculateDomainCountsBasedValue( high_copy_base_values_dc, + high_copy_base_copy_counts_dc, + dc, + COPY_CALC_MODE_FOR_HIGH_COPY_BASE_SPECIES ); + calculateDomainCountsBasedValue( low_copy_values_dc, + low_copy_copy_counts_dc, + dc, + COPY_CALC_MODE_FOR_LOW_COPY_SPECIES ); + } + writeDomainValuesToFiles( genomes, + high_copy_base_species, + high_copy_target_species, + low_copy_species, + min_diff, + factor, + domain_id_to_go_ids_map, + go_id_to_term_map, + plain_writer, + html_writer, + proteins_file_base, + high_copy_base_values, + high_copy_target_values, + low_copy_values, + all_domains, + go_ids_of_passing_domains, + protein_lists_per_species ); + writeDomainCombinationValuesToFiles( genomes, + high_copy_base_species, + high_copy_target_species, + low_copy_species, + min_diff, + factor, + html_writer_dc, + high_copy_base_values_dc, + 
high_copy_target_values_dc, + low_copy_values_dc, + all_dcs, + bdcs_per_genome ); + writeGoIdsToFile( all_gos_writer, go_ids_all ); + writeGoIdsToFile( passing_gos_writer, go_ids_of_passing_domains ); + } + + private static void calculateDomainCountsBasedValue( final SortedMap copy_values, + final SortedMap> copy_counts, + final BinaryDomainCombination bdc, + final COPY_CALCULATION_MODE copy_calc_mode ) { + if ( copy_counts.containsKey( bdc ) ) { + switch ( copy_calc_mode ) { + case MAX: + DomainCountsDifferenceUtil.calculateMaxCount( copy_values, copy_counts, bdc ); + break; + case MIN: + DomainCountsDifferenceUtil.calculateMinCount( copy_values, copy_counts, bdc ); + break; + case MEAN: + DomainCountsDifferenceUtil.calculateMeanCount( copy_values, copy_counts, bdc ); + break; + case MEDIAN: + DomainCountsDifferenceUtil.calculateMedianCount( copy_values, copy_counts, bdc ); + break; + default: + throw new IllegalArgumentException(); + } + } + else { + copy_values.put( bdc, Double.valueOf( 0.0 ) ); + } + } + + private static void calculateDomainCountsBasedValue( final SortedMap copy_values, + final SortedMap> copy_counts, + final DomainId domain, + final COPY_CALCULATION_MODE copy_calc_mode ) { + if ( copy_counts.containsKey( domain ) ) { + switch ( copy_calc_mode ) { + case MAX: + DomainCountsDifferenceUtil.calculateMaxCount( copy_values, copy_counts, domain ); + break; + case MIN: + DomainCountsDifferenceUtil.calculateMinCount( copy_values, copy_counts, domain ); + break; + case MEAN: + DomainCountsDifferenceUtil.calculateMeanCount( copy_values, copy_counts, domain ); + break; + case MEDIAN: + DomainCountsDifferenceUtil.calculateMedianCount( copy_values, copy_counts, domain ); + break; + default: + throw new IllegalArgumentException(); + } + } + else { + copy_values.put( domain, Double.valueOf( 0.0 ) ); + } + } + + private static void calculateMaxCount( final SortedMap results, + final SortedMap> copy_counts, + final BinaryDomainCombination bdc ) { + final List counts = copy_counts.get( bdc ); + int max = 0; + for( final Integer count : counts ) { + if ( count > max ) { + max = count; + } + } + results.put( bdc, ( double ) max ); + } + + private static void calculateMaxCount( final SortedMap results, + final SortedMap> copy_counts, + final DomainId domain ) { + final List counts = copy_counts.get( domain ); + int max = 0; + for( final Integer count : counts ) { + if ( count > max ) { + max = count; + } + } + results.put( domain, ( double ) max ); + } + + private static void calculateMeanCount( final SortedMap results, + final SortedMap> copy_counts, + final BinaryDomainCombination bdc ) { + final List counts = copy_counts.get( bdc ); + int sum = 0; + for( final Integer count : counts ) { + sum += count; + } + results.put( bdc, ( ( double ) sum ) / ( ( double ) counts.size() ) ); + } + + private static void calculateMeanCount( final SortedMap results, + final SortedMap> copy_counts, + final DomainId domain ) { + final List counts = copy_counts.get( domain ); + int sum = 0; + for( final Integer count : counts ) { + sum += count; + } + results.put( domain, ( ( double ) sum ) / ( ( double ) counts.size() ) ); + } + + private static void calculateMedianCount( final SortedMap results, + final SortedMap> copy_counts, + final BinaryDomainCombination bdc ) { + final List counts = copy_counts.get( bdc ); + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( final Integer count : counts ) { + stats.addValue( count ); + } + results.put( bdc, stats.median() ); + } + + private 
static void calculateMedianCount( final SortedMap results, + final SortedMap> copy_counts, + final DomainId domain ) { + final List counts = copy_counts.get( domain ); + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( final Integer count : counts ) { + stats.addValue( count ); + } + results.put( domain, stats.median() ); + } + + private static void calculateMinCount( final SortedMap results, + final SortedMap> copy_counts, + final BinaryDomainCombination bdc ) { + final List counts = copy_counts.get( bdc ); + int min = Integer.MAX_VALUE; + for( final Integer count : counts ) { + if ( count < min ) { + min = count; + } + } + results.put( bdc, ( double ) min ); + } + + private static void calculateMinCount( final SortedMap results, + final SortedMap> copy_counts, + final DomainId domain ) { + final List counts = copy_counts.get( domain ); + int min = Integer.MAX_VALUE; + for( final Integer count : counts ) { + if ( count < min ) { + min = count; + } + } + results.put( domain, ( double ) min ); + } + + private static String combinableDomaindToString( final CombinableDomains cd ) { + final StringBuilder sb = new StringBuilder(); + sb.append( cd.getKeyDomainProteinsCount() ); + sb.append( "\t[" ); + sb.append( FORMATTER.format( cd.getKeyDomainConfidenceDescriptiveStatistics().median() ) ); + sb.append( "]" ); + return sb.toString(); + } + + private static String combinableDomaindToStringHtml( final CombinableDomains cd ) { + final StringBuilder sb = new StringBuilder(); + sb.append( "[" ); + sb.append( cd.getKeyDomainCount() ); + sb.append( ", " ); + sb.append( cd.getKeyDomainProteinsCount() ); + sb.append( ", " ); + sb.append( cd.getNumberOfCombinableDomains() ); + sb.append( "]

  • " ); + } + + private static void writeCopyNumberValues( final SortedMap copy_means, + final DomainId domain, + final GenomeWideCombinableDomains genome, + final String species, + final Writer plain_writer, + final Writer html_writer, + final String color ) throws IOException { + plain_writer.write( " " + species + "\t" ); + html_writer.write( "" ); + plain_writer.write( SurfacingConstants.NL ); + } + + private static void writeDomainCombinationValuesToFiles( final List genomes, + final List high_copy_base_species, + final List high_copy_target_species, + final List low_copy_species, + final int min_diff, + final Double factor, + final Writer html_writer, + final SortedMap high_copy_base_values, + final SortedMap high_copy_target_values, + final SortedMap low_copy_values, + final SortedSet all_bdcs, + final Map> bdcs_per_genome ) + throws IOException { + int counter = 0; + int total_absense_counter = 0; + int not_total_absense_counter = 0; + SurfacingUtil.addHtmlHead( html_writer, "Binary Domain Combination Copy Differences" ); + html_writer.write( "
    " ); + w.write( "GO term name" ); + w.write( "" ); + w.write( "GO id" ); + w.write( "" ); + w.write( "P adjusted" ); + w.write( "" ); + w.write( "P" ); + w.write( "" ); + w.write( "Pop total" ); + w.write( "" ); + w.write( "Pop term" ); + w.write( "" ); + w.write( "Study total" ); + w.write( "" ); + w.write( "Study term" ); + w.write( "" ); + w.write( "Domains" ); + w.write( "" ); + w.write( "trivial?" ); + w.write( "

    " ); + writer.write( species ); + SurfacingUtil.writeTaxonomyLinks( writer, species ); + writer.write( "

    " ); + writer.write( "" ); + writer.write( go_term.getName() ); + writer.write( "" ); + writer.write( "" ); + writer.write( "" + ontologizer_result.getGoId().getId() + "" ); + writer.write( "" ); + writer.write( "" ); + writer.write( FORMATER.format( ontologizer_result.getPAdjusted() ) ); + writer.write( "" ); + writer.write( "" ); + writer.write( "" ); + writer.write( FORMATER.format( ontologizer_result.getP() ) ); + writer.write( "" ); + writer.write( "" ); + writer.write( String.valueOf( ontologizer_result.getPopTotal() ) ); + writer.write( "" ); + writer.write( String.valueOf( ontologizer_result.getPopTerm() ) ); + writer.write( "" ); + writer.write( String.valueOf( ontologizer_result.getStudyTotal() ) ); + writer.write( "" ); + writer.write( String.valueOf( ontologizer_result.getStudyTerm() ) ); + writer.write( "" ); + if ( domains_per_species != null ) { + final StringBuilder sb = obtainDomainsForGoId( pfam_to_go, domains_per_species, go_id_to_terms, go_term + .getGoId(), domain_ids_with_go_annot ); + writer.write( sb.toString() ); + } + else { + writer.write( " " ); + } + writer.write( "" ); + if ( ontologizer_result.isTrivial() ) { + writer.write( "trivial" ); + } + else { + writer.write( " " ); + } + writer.write( "
    [" ); + sb.append( FORMATTER.format( cd.getKeyDomainConfidenceDescriptiveStatistics().median() ) ); + sb.append( "]" ); + sb.append( cd.getCombiningDomainIdsAsStringBuilder() ); + return sb.toString(); + } + + private static void writeCopyNumberValues( final SortedMap copy_means, + final BinaryDomainCombination bdc, + final GenomeWideCombinableDomains genome, + final Map> bdcs_per_genome, + final String species, + final Writer html_writer, + final String color ) throws IOException { + html_writer.write( " " ); + if ( !ForesterUtil.isEmpty( color ) ) { + html_writer.write( "" ); + } + html_writer.write( "" + species + ": " ); + if ( !ForesterUtil.isEmpty( color ) ) { + html_writer.write( "" ); + } + html_writer.write( "" ); + if ( bdcs_per_genome.get( species ).contains( bdc ) && ( copy_means.get( bdc ) > 0 ) ) { + final int count = ( ( BasicCombinableDomains ) genome.get( bdc.getId0() ) ).getCombiningDomains().get( bdc + .getId1() ); + html_writer.write( count + "" ); + } + else { + html_writer.write( "0" ); + } + html_writer.write( " " ); + if ( !ForesterUtil.isEmpty( color ) ) { + html_writer.write( "" ); + } + html_writer.write( "" + species + ": " ); + if ( !ForesterUtil.isEmpty( color ) ) { + html_writer.write( "" ); + } + html_writer.write( "" ); + if ( genome.contains( domain ) && ( copy_means.get( domain ) > 0 ) ) { + plain_writer.write( DomainCountsDifferenceUtil.combinableDomaindToString( genome.get( domain ) ) ); + html_writer.write( DomainCountsDifferenceUtil.combinableDomaindToStringHtml( genome.get( domain ) ) ); + } + else { + plain_writer.write( "0" ); + html_writer.write( "0" ); + } + html_writer.write( "
    " ); + for( final BinaryDomainCombination bdc : all_bdcs ) { + if ( ( high_copy_base_values.get( bdc ) > 0 ) && ( high_copy_target_values.get( bdc ) > 0 ) + && ( high_copy_base_values.get( bdc ) >= low_copy_values.get( bdc ) ) ) { + if ( high_copy_target_values.get( bdc ) >= min_diff + ( factor * low_copy_values.get( bdc ) ) ) { + if ( low_copy_values.get( bdc ) <= 0.0 ) { + ++total_absense_counter; + } + else { + ++not_total_absense_counter; + } + ++counter; + html_writer.write( "" ); + html_writer.write( SurfacingConstants.NL ); + } + } + } + html_writer.write( "
    " + bdc.getId0() + " = " + bdc.getId1() + "" ); + html_writer.write( "" ); + html_writer.write( "" ); + for( final GenomeWideCombinableDomains genome : genomes ) { + final String species = genome.getSpecies().getSpeciesId(); + if ( high_copy_target_species.contains( species ) ) { + html_writer.write( "" ); + writeCopyNumberValues( high_copy_target_values, + bdc, + genome, + bdcs_per_genome, + species, + html_writer, + "#0000FF" ); + html_writer.write( "" ); + } + else if ( low_copy_species.contains( species ) ) { + html_writer.write( "" ); + writeCopyNumberValues( low_copy_values, + bdc, + genome, + bdcs_per_genome, + species, + html_writer, + "#A0A0A0" ); + html_writer.write( "" ); + } + else if ( high_copy_base_species.contains( species ) ) { + html_writer.write( "" ); + writeCopyNumberValues( high_copy_base_values, + bdc, + genome, + bdcs_per_genome, + species, + html_writer, + "#404040" ); + html_writer.write( "" ); + } + } + html_writer.write( "
    " ); + html_writer.write( "
    " ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "
    " ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "Rule 1: high-copy-base > 0 && high-copy-target > 0 && high-copy-base >= low-copy" ); + html_writer.write( "
    " ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "Rule 2: high-copy-target >= minimal-difference + ( factor * low-copy )" ); + html_writer.write( "
    " ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "Calculation mode for high copy target : " + COPY_CALC_MODE_FOR_HIGH_COPY_TARGET_SPECIES ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "
    " ); + html_writer.write( "Calculation mode for high copy base : " + COPY_CALC_MODE_FOR_HIGH_COPY_BASE_SPECIES ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "
    " ); + html_writer.write( "Calculation mode for low copy : " + COPY_CALC_MODE_FOR_LOW_COPY_SPECIES ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "
    " ); + html_writer.write( "Minimal difference : " + min_diff ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "
    " ); + html_writer.write( "Factor : " + factor ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "
    " ); + html_writer.write( "Lower copy binary domain combinations : " + counter ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "
    " ); + html_writer.write( "Total absence : " + total_absense_counter ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "
    " ); + html_writer.write( "Not total absence : " + not_total_absense_counter ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "
    " ); + html_writer.write( "Total binary domain combinations : " + all_bdcs.size() ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "
    " ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "" ); + html_writer.write( SurfacingConstants.NL ); + html_writer.close(); + } + + private static void writeDomainValuesToFiles( final List genomes, + final List high_copy_base_species, + final List high_copy_target_species, + final List low_copy_species, + final int min_diff, + final Double factor, + final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map, + final Writer plain_writer, + final Writer html_writer, + final File proteins_file_base, + final SortedMap high_copy_base_values, + final SortedMap high_copy_target_values, + final SortedMap low_copy_values, + final SortedSet all_domains, + final SortedSet go_ids_of_passing_domains, + final SortedMap> protein_lists_per_species ) + throws IOException { + int counter = 0; + int total_absense_counter = 0; + int not_total_absense_counter = 0; + SurfacingUtil.addHtmlHead( html_writer, "Domain Copy Differences" ); + html_writer.write( "" ); + for( final DomainId domain_id : all_domains ) { + if ( ( high_copy_base_values.get( domain_id ) > 0 ) && ( high_copy_target_values.get( domain_id ) > 0 ) + && ( high_copy_base_values.get( domain_id ) >= low_copy_values.get( domain_id ) ) ) { + if ( high_copy_target_values.get( domain_id ) >= min_diff + + ( factor * low_copy_values.get( domain_id ) ) ) { + if ( low_copy_values.get( domain_id ) <= 0.0 ) { + ++total_absense_counter; + } + else { + ++not_total_absense_counter; + } + ++counter; + writeProteinsToFile( proteins_file_base, protein_lists_per_species, domain_id ); + if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) { + go_ids_of_passing_domains.addAll( domain_id_to_go_ids_map.get( domain_id ) ); + } + plain_writer.write( domain_id.getId() ); + plain_writer.write( SurfacingConstants.NL ); + html_writer.write( "" ); + html_writer.write( SurfacingConstants.NL ); + plain_writer.write( SurfacingConstants.NL ); + } + } + } + html_writer.write( "
    " + domain_id.getId() + "" ); + html_writer.write( addGoInformation( domain_id, domain_id_to_go_ids_map, go_id_to_term_map ) + .toString() ); + html_writer.write( "" ); + html_writer.write( "" ); + for( final GenomeWideCombinableDomains genome : genomes ) { + final String species = genome.getSpecies().getSpeciesId(); + if ( high_copy_target_species.contains( species ) ) { + html_writer.write( "" ); + writeCopyNumberValues( high_copy_target_values, + domain_id, + genome, + species, + plain_writer, + html_writer, + "#0000FF" ); + html_writer.write( "" ); + } + else if ( low_copy_species.contains( species ) ) { + html_writer.write( "" ); + writeCopyNumberValues( low_copy_values, + domain_id, + genome, + species, + plain_writer, + html_writer, + "#A0A0A0" ); + html_writer.write( "" ); + } + else if ( high_copy_base_species.contains( species ) ) { + html_writer.write( "" ); + writeCopyNumberValues( high_copy_base_values, + domain_id, + genome, + species, + plain_writer, + html_writer, + "#404040" ); + html_writer.write( "" ); + } + } + html_writer.write( "
    " ); + html_writer.write( "
    " ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "
    " ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "Rule 1: high-copy-base > 0 && high-copy-target > 0 && high-copy-base >= low-copy" ); + html_writer.write( "
    " ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "Rule 2: high-copy-target >= minimal-difference + ( factor * low-copy )" ); + html_writer.write( "
    " ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "Calculation mode for high copy target : " + COPY_CALC_MODE_FOR_HIGH_COPY_TARGET_SPECIES ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "
    " ); + html_writer.write( "Calculation mode for high copy base : " + COPY_CALC_MODE_FOR_HIGH_COPY_BASE_SPECIES ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "
    " ); + html_writer.write( "Calculation mode for low copy : " + COPY_CALC_MODE_FOR_LOW_COPY_SPECIES ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "
    " ); + html_writer.write( "Minimal difference : " + min_diff ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "
    " ); + html_writer.write( "Factor : " + factor ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "
    " ); + html_writer.write( "Lower copy domains : " + counter ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "
    " ); + html_writer.write( "Total absence : " + total_absense_counter ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "
    " ); + html_writer.write( "Not total absence : " + not_total_absense_counter ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "
    " ); + html_writer.write( "Total domains : " + all_domains.size() ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "
    " ); + html_writer.write( SurfacingConstants.NL ); + html_writer.write( "" ); + html_writer.write( SurfacingConstants.NL ); + html_writer.close(); + plain_writer.write( "# Rule 1: high-copy-base > 0 && high-copy-target > 0 && high-copy-base >= low-copy" ); + plain_writer.write( SurfacingConstants.NL ); + plain_writer.write( "# Rule 2: high-copy-target >= minimal-difference + ( factor * low-copy )" ); + plain_writer.write( SurfacingConstants.NL ); + plain_writer.write( "# Calculation mode for high copy target: " + COPY_CALC_MODE_FOR_HIGH_COPY_TARGET_SPECIES ); + plain_writer.write( SurfacingConstants.NL ); + plain_writer.write( "# Calculation mode for high copy base : " + COPY_CALC_MODE_FOR_HIGH_COPY_BASE_SPECIES ); + plain_writer.write( SurfacingConstants.NL ); + plain_writer.write( "# Calculation mode for low copy : " + COPY_CALC_MODE_FOR_LOW_COPY_SPECIES ); + plain_writer.write( SurfacingConstants.NL ); + plain_writer.write( "# Minimal difference: " + min_diff ); + plain_writer.write( SurfacingConstants.NL ); + plain_writer.write( "# Factor : " + factor ); + plain_writer.write( SurfacingConstants.NL ); + plain_writer.write( "# Lower copy domains: " + counter ); + plain_writer.write( SurfacingConstants.NL ); + plain_writer.write( "# Total absence : " + total_absense_counter ); + plain_writer.write( SurfacingConstants.NL ); + plain_writer.write( "# Not total absence : " + not_total_absense_counter ); + plain_writer.write( SurfacingConstants.NL ); + plain_writer.write( "# Total domains : " + all_domains.size() ); + plain_writer.write( SurfacingConstants.NL ); + plain_writer.close(); + } + + private static void writeGoIdsToFile( final Writer writer, final SortedSet gos ) throws IOException { + for( final GoId go_id : gos ) { + writer.write( go_id.toString() ); + writer.write( SurfacingConstants.NL ); + } + writer.close(); + } + + private static void writeProteinsToFile( final File proteins_file_base, + final SortedMap> protein_lists_per_species, + final DomainId domain_id ) throws IOException { + final File my_proteins_file = new File( proteins_file_base.getParentFile() + ForesterUtil.FILE_SEPARATOR + + domain_id + PLUS_MINUS_PROTEINS_FILE_DOM_SUFFIX ); + SurfacingUtil.checkForOutputFileWriteability( my_proteins_file ); + final Writer proteins_file_writer = new BufferedWriter( new FileWriter( my_proteins_file ) ); + SurfacingUtil.extractProteinNames( protein_lists_per_species, domain_id, proteins_file_writer, "\t" ); + proteins_file_writer.close(); + System.out.println( "Wrote proteins list to \"" + my_proteins_file + "\"" ); + } + + public static enum COPY_CALCULATION_MODE { + MEAN, MEDIAN, MAX, MIN + } +} diff --git a/forester/java/src/org/forester/surfacing/DomainId.java b/forester/java/src/org/forester/surfacing/DomainId.java new file mode 100644 index 0000000..18a4a4d --- /dev/null +++ b/forester/java/src/org/forester/surfacing/DomainId.java @@ -0,0 +1,131 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. 
+// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.util.ArrayList; +import java.util.List; + +import org.forester.go.GoId; +import org.forester.util.ForesterUtil; + +public class DomainId implements Comparable<DomainId> { + + final private String _id; + private List<GoId> _go_ids; + + public DomainId( final String id ) { + if ( ForesterUtil.isEmpty( id ) ) { + throw new IllegalArgumentException( "attempt to create domain id from empty or null string" ); + } + _id = id.trim(); + if ( _id.indexOf( ' ' ) > -1 ) { + throw new IllegalArgumentException( "attempt to create domain id from string containing one or more spaces [" + + _id + "]" ); + } + else if ( _id.indexOf( BinaryDomainCombination.SEPARATOR ) > -1 ) { + throw new IllegalArgumentException( "attempt to create domain id from string containing the separator character [" + + BinaryDomainCombination.SEPARATOR + "] for domain combinations [" + _id + "]" ); + } + setGoIds( null ); + } + + public void addGoId( final GoId go_id ) { + if ( getGoIds() == null ) { + setGoIds( new ArrayList<GoId>() ); + } + getGoIds().add( go_id ); + } + + @Override + public int compareTo( final DomainId domain_id ) { + if ( this == domain_id ) { + return 0; + } + return getId().toLowerCase().compareTo( domain_id.getId().toLowerCase() ); + } + + @Override + public boolean equals( final Object o ) { + if ( this == o ) { + return true; + } + else if ( o == null ) { + throw new IllegalArgumentException( "attempt to check [" + this.getClass() + "] equality to null" ); + } + else if ( o.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to check [" + this.getClass() + "] equality to " + o + " [" + + o.getClass() + "]" ); + } + else { + return getId().equals( ( ( DomainId ) o ).getId() ); + } + } + + public GoId getGoId( final int i ) { + return getGoIds().get( i ); + } + + // Note. + // The fact that equals and compareTo do not behave the same in cases where ids only differ by their case + // is not ideal. From Sun regarding Interface SortedSet: + // "Note that the ordering maintained by a sorted set (whether or not an explicit comparator is provided) + // must be consistent with equals if the sorted set is to correctly implement the Set interface. + // (See the Comparable interface or Comparator interface for a precise definition of consistent + // with equals.) This is so because the Set interface is defined in terms of the equals operation, + // but a sorted set performs all element comparisons using its compareTo (or compare) method, + // so two elements that are deemed equal by this method are, from the standpoint of the sorted set, + // equal. The behavior of a sorted set is well-defined even if its ordering is inconsistent with equals; + // it just fails to obey the general contract of the Set interface." 
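+ // A minimal sketch of the practical consequence (the ids below are purely illustrative):
+ //
+ //   final java.util.Set<DomainId> hash_set = new java.util.HashSet<DomainId>();
+ //   hash_set.add( new DomainId( "Kinase" ) );
+ //   hash_set.add( new DomainId( "kinase" ) ); // kept, since equals() is case sensitive: hash_set.size() == 2
+ //
+ //   final java.util.SortedSet<DomainId> tree_set = new java.util.TreeSet<DomainId>();
+ //   tree_set.add( new DomainId( "Kinase" ) );
+ //   tree_set.add( new DomainId( "kinase" ) ); // collapsed, since compareTo() lower-cases both ids: tree_set.size() == 1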
+ List getGoIds() { + return _go_ids; + } + + public String getId() { + return _id; + } + + public int getNumberOfGoIds() { + if ( getGoIds() == null ) { + return 0; + } + return getGoIds().size(); + } + + @Override + public int hashCode() { + return getId().hashCode(); + } + + private void setGoIds( final List go_ids ) { + _go_ids = go_ids; + } + + @Override + public String toString() { + return getId(); + } +} diff --git a/forester/java/src/org/forester/surfacing/DomainLengths.java b/forester/java/src/org/forester/surfacing/DomainLengths.java new file mode 100644 index 0000000..9e39b9e --- /dev/null +++ b/forester/java/src/org/forester/surfacing/DomainLengths.java @@ -0,0 +1,143 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2010 Christian M. Zmasek +// Copyright (C) 2008-2010 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.util.ArrayList; +import java.util.List; +import java.util.SortedMap; +import java.util.TreeMap; + +import org.forester.util.BasicDescriptiveStatistics; +import org.forester.util.DescriptiveStatistics; + +public class DomainLengths { + + final DomainId _domain_id; + final SortedMap _length_statistics; + + public DomainLengths( final DomainId domain_id ) { + _domain_id = domain_id; + _length_statistics = new TreeMap(); + } + + public void addLength( final Species species, final int domain_length ) { + if ( !getLengthStatistics().containsKey( species ) ) { + addLengthStatistics( species, new BasicDescriptiveStatistics() ); + } + getLengthStatistic( species ).addValue( domain_length ); + } + + private void addLengthStatistics( final Species species, final DescriptiveStatistics length_statistic ) { + if ( getLengthStatistics().containsKey( species ) ) { + throw new IllegalArgumentException( "length statistics for [" + species.getSpeciesId() + "] already added" ); + } + getLengthStatistics().put( species, length_statistic ); + } + + /** + * Returns descriptive statistics based on the arithmetic means + * for each species. + * + * + * @return + */ + public DescriptiveStatistics calculateMeanBasedStatistics() { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( final DescriptiveStatistics s : getLengthStatisticsList() ) { + stats.addValue( s.arithmeticMean() ); + } + return stats; + } + + /** + * + * Note. This is not technically a Z-score since the distribution + * of means is unknown (and not normal). 
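+ * What is computed is the difference between the mean domain length in the given
+ * species and the mean of the per-species means (calculateMeanBasedStatistics()),
+ * divided by the sample standard deviation of those per-species means:
+ *
+ *   ( getLengthStatistic( species ).arithmeticMean() - calculateMeanBasedStatistics().arithmeticMean() )
+ *       / calculateMeanBasedStatistics().sampleStandardDeviation()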
+ * + * @param species + * @return + */ + public double calculateZScoreForSpecies( final Species species ) { + final double species_mean = getLengthStatistic( species ).arithmeticMean(); + final DescriptiveStatistics domain_stats = calculateMeanBasedStatistics(); + final double population_sd = domain_stats.sampleStandardDeviation(); + final double population_mean = domain_stats.arithmeticMean(); + return ( species_mean - population_mean ) / population_sd; + } + + public DomainId getDomainId() { + return _domain_id; + } + + public DescriptiveStatistics getLengthStatistic( final Species species ) { + return getLengthStatistics().get( species ); + } + + private SortedMap getLengthStatistics() { + return _length_statistics; + } + + public List getLengthStatisticsList() { + final List list = new ArrayList(); + for( final DescriptiveStatistics stats : _length_statistics.values() ) { + list.add( stats ); + } + return list; + } + + public List getMeanBasedOutlierSpecies( final double z_score_limit ) { + final List species = new ArrayList(); + if ( getSpeciesList().size() > 1 ) { + for( final Species s : getSpeciesList() ) { + final double z = calculateZScoreForSpecies( s ); + if ( z_score_limit < 0 ) { + if ( z <= z_score_limit ) { + species.add( s ); + } + } + else if ( z_score_limit > 0 ) { + if ( z >= z_score_limit ) { + species.add( s ); + } + } + } + } + return species; + } + + public List getSpeciesList() { + final List list = new ArrayList(); + for( final Species s : _length_statistics.keySet() ) { + list.add( s ); + } + return list; + } + + public boolean isHasLengthStatistic( final Species species ) { + return getLengthStatistics().containsKey( species ); + } +} diff --git a/forester/java/src/org/forester/surfacing/DomainLengthsTable.java b/forester/java/src/org/forester/surfacing/DomainLengthsTable.java new file mode 100644 index 0000000..4b6ca22 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/DomainLengthsTable.java @@ -0,0 +1,165 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2010 Christian M. Zmasek +// Copyright (C) 2008-2010 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.List; +import java.util.SortedMap; +import java.util.TreeMap; + +import org.forester.util.BasicDescriptiveStatistics; +import org.forester.util.DescriptiveStatistics; +import org.forester.util.ForesterUtil; + +public class DomainLengthsTable { + + private final static DecimalFormat DF = new DecimalFormat( "#.0" ); + final SortedMap _domain_lengths; + final List _species; + + public DomainLengthsTable() { + _domain_lengths = new TreeMap(); + _species = new ArrayList(); + } + + private void addDomainLengths( final DomainLengths domain_lengths ) { + if ( getDomainLengths().containsKey( domain_lengths.getDomainId() ) ) { + throw new IllegalArgumentException( "domain lengths for [" + domain_lengths.getDomainId() + + "] already added" ); + } + getDomainLengths().put( domain_lengths.getDomainId(), domain_lengths ); + } + + private void addLength( final DomainId domain_id, final Species species, final int domain_length ) { + if ( !getDomainLengths().containsKey( domain_id ) ) { + addDomainLengths( new DomainLengths( domain_id ) ); + } + getDomainLengths().get( domain_id ).addLength( species, domain_length ); + } + + public void addLengths( final List protein_list ) { + for( final Protein protein : protein_list ) { + final Species species = protein.getSpecies(); + if ( !_species.contains( species ) ) { + _species.add( species ); + } + for( final Domain domain : protein.getProteinDomains() ) { + addLength( domain.getDomainId(), species, ( domain.getTo() - domain.getFrom() ) + 1 ); + } + } + } + + public DescriptiveStatistics calculateMeanBasedStatisticsForAllSpecies() { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( final Species species : getSpecies() ) { + final DescriptiveStatistics stats_per_species = calculateMeanBasedStatisticsForSpecies( species ); + stats.addValue( stats_per_species.arithmeticMean() ); + } + return stats; + } + + public DescriptiveStatistics calculateMeanBasedStatisticsForDomain( final DomainId domain_id ) { + return getDomainLengths( domain_id ).calculateMeanBasedStatistics(); + } + + public DescriptiveStatistics calculateMeanBasedStatisticsForSpecies( final Species species ) { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( final DomainLengths l : getDomainLengths().values() ) { + if ( l.isHasLengthStatistic( species ) ) { + stats.addValue( l.getLengthStatistic( species ).arithmeticMean() ); + } + } + return stats; + } + + public StringBuilder createMeanBasedStatisticsPerSpeciesTable() { + final StringBuilder sb = new StringBuilder(); + sb.append( "SPECIES" ); + sb.append( "\t" ); + sb.append( "MEAN" ); + sb.append( "\t" ); + sb.append( "SD" ); + sb.append( "\t" ); + sb.append( "MIN" ); + sb.append( "\t" ); + sb.append( "MAX" ); + sb.append( "\t" ); + sb.append( "MEDIAN" ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + for( final Species species : getSpecies() ) { + final DescriptiveStatistics stats = calculateMeanBasedStatisticsForSpecies( species ); + sb.append( species ); + sb.append( "\t" ); + sb.append( DF.format( stats.arithmeticMean() ) ); + sb.append( "\t" ); + try { + sb.append( DF.format( stats.sampleStandardDeviation() ) ); + } + catch ( final ArithmeticException e ) { + sb.append( "" ); + } + sb.append( "\t" ); + sb.append( DF.format( stats.getMin() ) ); + sb.append( "\t" ); + sb.append( DF.format( stats.getMax() ) ); + sb.append( "\t" ); + try 
{ + sb.append( DF.format( stats.median() ) ); + } + catch ( final ArithmeticException e ) { + sb.append( "" ); + } + sb.append( ForesterUtil.LINE_SEPARATOR ); + } + return sb; + } + + private SortedMap getDomainLengths() { + return _domain_lengths; + } + + public DomainLengths getDomainLengths( final DomainId domain_id ) { + return getDomainLengths().get( domain_id ); + } + + public List getDomainLengthsList() { + final List list = new ArrayList(); + for( final DomainLengths l : getDomainLengths().values() ) { + list.add( l ); + } + return list; + } + + public DescriptiveStatistics getLengthStatistic( final DomainId domain_id, final Species species ) { + return getDomainLengths( domain_id ).getLengthStatistic( species ); + } + + public List getSpecies() { + return _species; + } +} diff --git a/forester/java/src/org/forester/surfacing/DomainParsimonyCalculator.java b/forester/java/src/org/forester/surfacing/DomainParsimonyCalculator.java new file mode 100644 index 0000000..82de445 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/DomainParsimonyCalculator.java @@ -0,0 +1,744 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix; +import org.forester.evoinference.matrix.character.CharacterStateMatrix; +import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates; +import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates; +import org.forester.evoinference.parsimony.DolloParsimony; +import org.forester.evoinference.parsimony.FitchParsimony; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.BinaryCharacters; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.surfacing.BinaryDomainCombination.DomainCombinationType; +import org.forester.util.ForesterUtil; + +public final class DomainParsimonyCalculator { + + private static final String TYPE_FORBINARY_CHARACTERS = "parsimony inferred"; + private CharacterStateMatrix _gain_loss_matrix; + private CharacterStateMatrix _binary_internal_states_matrix; + private final List _gwcd_list; + private final Phylogeny _phylogeny; + private int _total_losses; + private int _total_gains; + private int _total_unchanged; + private int _cost; + private Map> _domain_id_to_secondary_features_map; + private SortedSet _positive_filter; + + private DomainParsimonyCalculator( final Phylogeny phylogeny ) { + init(); + _phylogeny = phylogeny; + _gwcd_list = null; + } + + private DomainParsimonyCalculator( final Phylogeny phylogeny, final List gwcd_list ) { + init(); + _phylogeny = phylogeny; + _gwcd_list = gwcd_list; + } + + private DomainParsimonyCalculator( final Phylogeny phylogeny, + final List gwcd_list, + final Map> domain_id_to_secondary_features_map ) { + init(); + _phylogeny = phylogeny; + _gwcd_list = gwcd_list; + setDomainIdToSecondaryFeaturesMap( domain_id_to_secondary_features_map ); + } + + int calculateNumberOfBinaryDomainCombination() { + if ( getGenomeWideCombinableDomainsList().isEmpty() ) { + throw new IllegalArgumentException( "genome wide combinable domains list is empty" ); + } + final Set all_binary_combinations = new HashSet(); + for( final GenomeWideCombinableDomains gwcd : getGenomeWideCombinableDomainsList() ) { + for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) { + all_binary_combinations.add( bc ); + } + } + return all_binary_combinations.size(); + } + + CharacterStateMatrix createMatrixOfBinaryDomainCombinationPresenceOrAbsence() { + return createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() ); + } + + CharacterStateMatrix createMatrixOfDomainPresenceOrAbsence() { + return createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList(), getPositiveFilter() ); + } + + CharacterStateMatrix createMatrixOfSecondaryFeaturePresenceOrAbsence( final Map mapping_results_map ) { + return createMatrixOfSecondaryFeaturePresenceOrAbsence( getGenomeWideCombinableDomainsList(), + getDomainIdToSecondaryFeaturesMap(), + mapping_results_map ); + } + + Phylogeny decoratePhylogenyWithDomains( final Phylogeny phylogeny ) { + for( final PhylogenyNodeIterator it = phylogeny.iteratorPostorder(); it.hasNext(); ) { + final PhylogenyNode node = it.next(); + final String node_identifier = node.getName(); + final BinaryCharacters bc = new BinaryCharacters( 
getUnitsOnNode( node_identifier ), + getUnitsGainedOnNode( node_identifier ), + getUnitsLostOnNode( node_identifier ), + TYPE_FORBINARY_CHARACTERS, + getSumOfPresentOnNode( node_identifier ), + getSumOfGainsOnNode( node_identifier ), + getSumOfLossesOnNode( node_identifier ) ); + node.getNodeData().setBinaryCharacters( bc ); + } + return phylogeny; + } + + private void executeDolloParsimony( final boolean on_domain_presence ) { + reset(); + final DolloParsimony dollo = DolloParsimony.createInstance(); + dollo.setReturnGainLossMatrix( true ); + dollo.setReturnInternalStates( true ); + CharacterStateMatrix states = null; + if ( on_domain_presence ) { + states = createMatrixOfDomainPresenceOrAbsence(); + } + else { + states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence(); + } + dollo.execute( getPhylogeny(), states ); + setGainLossMatrix( dollo.getGainLossMatrix() ); + setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() ); + setCost( dollo.getCost() ); + setTotalGains( dollo.getTotalGains() ); + setTotalLosses( dollo.getTotalLosses() ); + setTotalUnchanged( dollo.getTotalUnchanged() ); + } + + public void executeDolloParsimonyOnBinaryDomainCombintionPresence() { + executeDolloParsimony( false ); + } + + public void executeDolloParsimonyOnDomainPresence() { + executeDolloParsimony( true ); + } + + public void executeDolloParsimonyOnDomainPresence( final SortedSet positive_filter ) { + setPositiveFilter( positive_filter ); + executeDolloParsimony( true ); + setPositiveFilter( null ); + } + + public void executeDolloParsimonyOnSecondaryFeatures( final Map mapping_results_map ) { + if ( getDomainIdToSecondaryFeaturesMap() == null ) { + throw new RuntimeException( "Domain id to secondary features map has apparently not been set" ); + } + reset(); + final DolloParsimony dollo = DolloParsimony.createInstance(); + dollo.setReturnGainLossMatrix( true ); + dollo.setReturnInternalStates( true ); + final CharacterStateMatrix states = createMatrixOfSecondaryFeaturePresenceOrAbsence( mapping_results_map ); + dollo.execute( getPhylogeny(), states ); + setGainLossMatrix( dollo.getGainLossMatrix() ); + setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() ); + setCost( dollo.getCost() ); + setTotalGains( dollo.getTotalGains() ); + setTotalLosses( dollo.getTotalLosses() ); + setTotalUnchanged( dollo.getTotalUnchanged() ); + } + + private void executeFitchParsimony( final boolean on_domain_presence, + final boolean use_last, + final boolean randomize, + final long random_number_seed ) { + reset(); + if ( use_last ) { + System.out.println( " Fitch parsimony: use_last = true" ); + } + final FitchParsimony fitch = new FitchParsimony(); + fitch.setRandomize( randomize ); + if ( randomize ) { + fitch.setRandomNumberSeed( random_number_seed ); + } + fitch.setUseLast( use_last ); + fitch.setReturnGainLossMatrix( true ); + fitch.setReturnInternalStates( true ); + CharacterStateMatrix states = null; + if ( on_domain_presence ) { + states = createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList() ); + } + else { + states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() ); + } + fitch.execute( getPhylogeny(), states ); + setGainLossMatrix( fitch.getGainLossMatrix() ); + setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() ); + setCost( fitch.getCost() ); + setTotalGains( fitch.getTotalGains() ); + setTotalLosses( fitch.getTotalLosses() ); + setTotalUnchanged( fitch.getTotalUnchanged() ); + } + + public void 
executeFitchParsimonyOnBinaryDomainCombintion( final boolean use_last ) { + executeFitchParsimony( false, use_last, false, 0 ); + } + + public void executeFitchParsimonyOnBinaryDomainCombintion( final long random_number_seed ) { + executeFitchParsimony( false, false, true, random_number_seed ); + } + + public void executeFitchParsimonyOnDomainPresence( final boolean use_last ) { + executeFitchParsimony( true, use_last, false, 0 ); + } + + public void executeFitchParsimonyOnDomainPresence( final long random_number_seed ) { + executeFitchParsimony( true, false, true, random_number_seed ); + } + + public void executeOnGivenBinaryStatesMatrix( final CharacterStateMatrix binary_states_matrix, + final String[] character_labels ) { + reset(); + if ( binary_states_matrix.getNumberOfCharacters() != character_labels.length ) { + throw new IllegalArgumentException( "binary states matrix number of characters is not equal to the number of character labels provided" ); + } + if ( binary_states_matrix.getNumberOfIdentifiers() != getPhylogeny().getNumberOfBranches() ) { + throw new IllegalArgumentException( "binary states matrix number of identifiers is not equal to the number of tree nodes provided" ); + } + final CharacterStateMatrix gl_matrix = new BasicCharacterStateMatrix( binary_states_matrix + .getNumberOfIdentifiers(), + binary_states_matrix + .getNumberOfCharacters() ); + int total_gains = 0; + int total_losses = 0; + int total_unchanged = 0; + int i = 0; + for( final PhylogenyNodeIterator it = getPhylogeny().iteratorPostorder(); it.hasNext(); ) { + gl_matrix.setIdentifier( i++, it.next().getName() ); + } + for( int c = 0; c < character_labels.length; ++c ) { + gl_matrix.setCharacter( c, character_labels[ c ] ); + final PhylogenyNodeIterator it = getPhylogeny().iteratorPostorder(); + while ( it.hasNext() ) { + final PhylogenyNode node = it.next(); + final String name = node.getName(); + final BinaryStates bin_state = binary_states_matrix.getState( binary_states_matrix + .getIdentifierIndex( name ), c ); + final PhylogenyNode parent_node = getPhylogeny().getNode( name ).getParent(); + GainLossStates gl_state = null; + if ( node.isRoot() ) { + ++total_unchanged; + if ( bin_state == BinaryStates.ABSENT ) { + gl_state = GainLossStates.UNCHANGED_ABSENT; + } + else { + gl_state = GainLossStates.UNCHANGED_PRESENT; + } + } + else { + final BinaryStates parent_bin_state = binary_states_matrix.getState( binary_states_matrix + .getIdentifierIndex( parent_node.getName() ), c ); + if ( bin_state == BinaryStates.ABSENT ) { + if ( parent_bin_state == BinaryStates.ABSENT ) { + ++total_unchanged; + gl_state = GainLossStates.UNCHANGED_ABSENT; + } + else { + ++total_losses; + gl_state = GainLossStates.LOSS; + } + } + else { + if ( parent_bin_state == BinaryStates.ABSENT ) { + ++total_gains; + gl_state = GainLossStates.GAIN; + } + else { + ++total_unchanged; + gl_state = GainLossStates.UNCHANGED_PRESENT; + } + } + } + gl_matrix.setState( name, c, gl_state ); + } + } + setTotalGains( total_gains ); + setTotalLosses( total_losses ); + setTotalUnchanged( total_unchanged ); + setCost( total_gains + total_losses ); + setGainLossMatrix( gl_matrix ); + } + + public int getCost() { + return _cost; + } + + private Map> getDomainIdToSecondaryFeaturesMap() { + return _domain_id_to_secondary_features_map; + } + + public CharacterStateMatrix getGainLossCountsMatrix() { + final CharacterStateMatrix matrix = new BasicCharacterStateMatrix( getGainLossMatrix() + .getNumberOfIdentifiers(), 3 ); + for( int i = 0; i < 
getGainLossMatrix().getNumberOfIdentifiers(); ++i ) { + matrix.setIdentifier( i, getGainLossMatrix().getIdentifier( i ) ); + } + matrix.setCharacter( 0, "GAINS" ); + matrix.setCharacter( 1, "LOSSES" ); + matrix.setCharacter( 2, "NET" ); + for( int i = 0; i < getGainLossMatrix().getNumberOfIdentifiers(); ++i ) { + int gains = 0; + int losses = 0; + for( int c = 0; c < getGainLossMatrix().getNumberOfCharacters(); ++c ) { + final GainLossStates s = getGainLossMatrix().getState( i, c ); + if ( s == GainLossStates.GAIN ) { + ++gains; + } + else if ( s == GainLossStates.LOSS ) { + ++losses; + } + } + matrix.setState( i, 0, gains ); + matrix.setState( i, 1, losses ); + matrix.setState( i, 2, gains - losses ); + } + return matrix; + } + + public CharacterStateMatrix getGainLossMatrix() { + return _gain_loss_matrix; + } + + private List getGenomeWideCombinableDomainsList() { + return _gwcd_list; + } + + public CharacterStateMatrix getInternalStatesMatrix() { + return _binary_internal_states_matrix; + } + + public int getNetGainsOnNode( final String node_identifier ) { + if ( getGainLossMatrix() == null ) { + throw new RuntimeException( "no gain loss matrix has been calculated" ); + } + int net = 0; + final int id_index = getGainLossMatrix().getIdentifierIndex( node_identifier ); + for( int c = 0; c < getGainLossMatrix().getNumberOfCharacters(); ++c ) { + if ( getGainLossMatrix().getState( id_index, c ) == GainLossStates.GAIN ) { + ++net; + } + else if ( getGainLossMatrix().getState( id_index, c ) == GainLossStates.LOSS ) { + --net; + } + } + return net; + } + + private Phylogeny getPhylogeny() { + return _phylogeny; + } + + private SortedSet getPositiveFilter() { + return _positive_filter; + } + + public int getSumOfGainsOnNode( final String node_identifier ) { + return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN ); + } + + public int getSumOfLossesOnNode( final String node_identifier ) { + return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS ); + } + + public int getSumOfPresentOnNode( final String node_identifier ) { + return getSumOfGainsOnNode( node_identifier ) + getSumOfUnchangedPresentOnNode( node_identifier ); + } + + int getSumOfUnchangedAbsentOnNode( final String node_identifier ) { + return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT ); + } + + int getSumOfUnchangedOnNode( final String node_identifier ) { + return getSumOfUnchangedPresentOnNode( node_identifier ) + getSumOfUnchangedAbsentOnNode( node_identifier ); + } + + int getSumOfUnchangedPresentOnNode( final String node_identifier ) { + return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT ); + } + + public int getTotalGains() { + return _total_gains; + } + + public int getTotalLosses() { + return _total_losses; + } + + public int getTotalUnchanged() { + return _total_unchanged; + } + + public SortedSet getUnitsGainedOnNode( final String node_identifier ) { + return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN ); + } + + public SortedSet getUnitsLostOnNode( final String node_identifier ) { + return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS ); + } + + public SortedSet getUnitsOnNode( final String node_identifier ) { + final SortedSet present = getUnitsGainedOnNode( node_identifier ); + present.addAll( getUnitsUnchangedPresentOnNode( node_identifier ) ); + return present; + } + + SortedSet 
getUnitsUnchangedAbsentOnNode( final String node_identifier ) { + return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT ); + } + + SortedSet getUnitsUnchangedPresentOnNode( final String node_identifier ) { + return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT ); + } + + private void init() { + setDomainIdToSecondaryFeaturesMap( null ); + setPositiveFilter( null ); + reset(); + } + + private void reset() { + setGainLossMatrix( null ); + setBinaryInternalStatesMatrix( null ); + setCost( -1 ); + setTotalGains( -1 ); + setTotalLosses( -1 ); + setTotalUnchanged( -1 ); + } + + private void setBinaryInternalStatesMatrix( final CharacterStateMatrix binary_states_matrix ) { + _binary_internal_states_matrix = binary_states_matrix; + } + + private void setCost( final int cost ) { + _cost = cost; + } + + private void setDomainIdToSecondaryFeaturesMap( final Map> domain_id_to_secondary_features_map ) { + _domain_id_to_secondary_features_map = domain_id_to_secondary_features_map; + } + + private void setGainLossMatrix( final CharacterStateMatrix gain_loss_matrix ) { + _gain_loss_matrix = gain_loss_matrix; + } + + private void setPositiveFilter( final SortedSet positive_filter ) { + _positive_filter = positive_filter; + } + + private void setTotalGains( final int total_gains ) { + _total_gains = total_gains; + } + + private void setTotalLosses( final int total_losses ) { + _total_losses = total_losses; + } + + private void setTotalUnchanged( final int total_unchanged ) { + _total_unchanged = total_unchanged; + } + + public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny ) { + return new DomainParsimonyCalculator( phylogeny ); + } + + public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny, + final List gwcd_list ) { + if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) { + throw new IllegalArgumentException( "number of external nodes [" + phylogeny.getNumberOfExternalNodes() + + "] does not equal size of genome wide combinable domains list [" + gwcd_list.size() + "]" ); + } + return new DomainParsimonyCalculator( phylogeny, gwcd_list ); + } + + public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny, + final List gwcd_list, + final Map> domain_id_to_secondary_features_map ) { + if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) { + throw new IllegalArgumentException( "size of external nodes does not equal size of genome wide combinable domains list" ); + } + return new DomainParsimonyCalculator( phylogeny, gwcd_list, domain_id_to_secondary_features_map ); + } + + public static CharacterStateMatrix createMatrixOfBinaryDomainCombinationPresenceOrAbsence( final List gwcd_list ) { + if ( gwcd_list.isEmpty() ) { + throw new IllegalArgumentException( "genome wide combinable domains list is empty" ); + } + final int number_of_identifiers = gwcd_list.size(); + final SortedSet all_binary_combinations = new TreeSet(); + final Set[] binary_combinations_per_genome = new HashSet[ number_of_identifiers ]; + int identifier_index = 0; + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + binary_combinations_per_genome[ identifier_index ] = new HashSet(); + for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) { + all_binary_combinations.add( bc ); + binary_combinations_per_genome[ identifier_index ].add( bc ); + } + ++identifier_index; + } + final int number_of_characters = 
all_binary_combinations.size(); + final CharacterStateMatrix matrix = new BasicCharacterStateMatrix( number_of_identifiers, + number_of_characters ); + int character_index = 0; + for( final BinaryDomainCombination bc : all_binary_combinations ) { + matrix.setCharacter( character_index++, bc.toString() ); + } + identifier_index = 0; + final Set all_identifiers = new HashSet(); + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + final String species_id = gwcd.getSpecies().getSpeciesId(); + if ( all_identifiers.contains( species_id ) ) { + throw new AssertionError( "species [" + species_id + "] is not unique" ); + } + all_identifiers.add( species_id ); + matrix.setIdentifier( identifier_index, species_id ); + for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) { + BinaryDomainCombination bc = null; + if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) { + bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) ); + } + else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) { + bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) ); + } + else { + bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) ); + } + if ( binary_combinations_per_genome[ identifier_index ].contains( bc ) ) { + matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT ); + } + else { + matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT ); + } + } + ++identifier_index; + } + return matrix; + } + + static CharacterStateMatrix createMatrixOfDomainPresenceOrAbsence( final List gwcd_list ) { + return createMatrixOfDomainPresenceOrAbsence( gwcd_list, null ); + } + + public static CharacterStateMatrix createMatrixOfDomainPresenceOrAbsence( final List gwcd_list, + final SortedSet positive_filter ) { + if ( gwcd_list.isEmpty() ) { + throw new IllegalArgumentException( "genome wide combinable domains list is empty" ); + } + if ( ( positive_filter != null ) && ( positive_filter.size() < 1 ) ) { + throw new IllegalArgumentException( "positive filter is empty" ); + } + final int number_of_identifiers = gwcd_list.size(); + final SortedSet all_domain_ids = new TreeSet(); + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + for( final DomainId domain : gwcd.getAllDomainIds() ) { + all_domain_ids.add( domain ); + } + } + int number_of_characters = all_domain_ids.size(); + if ( positive_filter != null ) { + //number_of_characters = positive_filter.size(); -- bad if doms in filter but not in genomes + number_of_characters = 0; + for( final DomainId id : all_domain_ids ) { + if ( positive_filter.contains( id ) ) { + number_of_characters++; + } + } + } + final CharacterStateMatrix matrix = new BasicCharacterStateMatrix( number_of_identifiers, + number_of_characters ); + int character_index = 0; + for( final DomainId id : all_domain_ids ) { + if ( positive_filter == null ) { + matrix.setCharacter( character_index++, id.getId() ); + } + else { + if ( positive_filter.contains( id ) ) { + matrix.setCharacter( character_index++, id.getId() ); + } + } + } + int identifier_index = 0; + final Set all_identifiers = new HashSet(); + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + final String species_id = gwcd.getSpecies().getSpeciesId(); + if ( all_identifiers.contains( species_id ) ) { + throw new IllegalArgumentException( "species [" + species_id + "] is not unique" ); + } + all_identifiers.add( species_id ); + 
matrix.setIdentifier( identifier_index, species_id ); + for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) { + if ( ForesterUtil.isEmpty( matrix.getCharacter( ci ) ) ) { + throw new RuntimeException( "this should not have happened: problem with character #" + ci ); + } + final DomainId id = new DomainId( matrix.getCharacter( ci ) ); + if ( gwcd.contains( id ) ) { + matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT ); + } + else { + matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT ); + } + } + ++identifier_index; + } + return matrix; + } + + /** + * For folds instead of Pfam-domains, for example + * + * + * @param gwcd_list + * @return + */ + static CharacterStateMatrix createMatrixOfSecondaryFeaturePresenceOrAbsence( final List gwcd_list, + final Map> domain_id_to_second_features_map, + final Map mapping_results_map ) { + if ( gwcd_list.isEmpty() ) { + throw new IllegalArgumentException( "genome wide combinable domains list is empty" ); + } + if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) { + throw new IllegalArgumentException( "domain id to secondary features map is null or empty" ); + } + final int number_of_identifiers = gwcd_list.size(); + final SortedSet all_secondary_features = new TreeSet(); + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + int mapped = 0; + int not_mapped = 0; + for( final DomainId domain : gwcd.getAllDomainIds() ) { + if ( domain_id_to_second_features_map.containsKey( domain ) ) { + all_secondary_features.addAll( domain_id_to_second_features_map.get( domain ) ); + mapped++; + } + else { + not_mapped++; + } + } + if ( mapping_results_map != null ) { + final MappingResults mr = new MappingResults(); + mr.setDescription( gwcd.getSpecies().getSpeciesId() ); + mr.setSumOfSuccesses( mapped ); + mr.setSumOfFailures( not_mapped ); + mapping_results_map.put( gwcd.getSpecies(), mr ); + } + } + final int number_of_characters = all_secondary_features.size(); + final CharacterStateMatrix matrix = new BasicCharacterStateMatrix( number_of_identifiers, + number_of_characters ); + int character_index = 0; + for( final String second_id : all_secondary_features ) { + matrix.setCharacter( character_index++, second_id ); + } + int identifier_index = 0; + final Set all_identifiers = new HashSet(); + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + final String species_id = gwcd.getSpecies().getSpeciesId(); + if ( all_identifiers.contains( species_id ) ) { + throw new IllegalArgumentException( "species [" + species_id + "] is not unique" ); + } + all_identifiers.add( species_id ); + matrix.setIdentifier( identifier_index, species_id ); + final Set all_second_per_gwcd = new HashSet(); + for( final DomainId domain : gwcd.getAllDomainIds() ) { + if ( domain_id_to_second_features_map.containsKey( domain ) ) { + all_second_per_gwcd.addAll( domain_id_to_second_features_map.get( domain ) ); + } + } + for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) { + if ( all_second_per_gwcd.contains( matrix.getCharacter( ci ) ) ) { + matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT ); + } + else { + matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT ); + } + } + ++identifier_index; + } + return matrix; + } + + private static int getStateSumDeltaOnNode( final String node_identifier, + final CharacterStateMatrix gain_loss_matrix, + final GainLossStates state ) { + if ( gain_loss_matrix == 
null ) { + throw new RuntimeException( "no gain loss matrix has been calculated" ); + } + if ( ForesterUtil.isEmpty( node_identifier ) ) { + throw new IllegalArgumentException( "node identifier must not be empty" ); + } + if ( gain_loss_matrix.isEmpty() ) { + throw new RuntimeException( "gain loss matrix is empty" ); + } + int sum = 0; + final int id_index = gain_loss_matrix.getIdentifierIndex( node_identifier ); + for( int c = 0; c < gain_loss_matrix.getNumberOfCharacters(); ++c ) { + if ( gain_loss_matrix.getState( id_index, c ) == state ) { + ++sum; + } + } + return sum; + } + + private static SortedSet getUnitsDeltaOnNode( final String node_identifier, + final CharacterStateMatrix gain_loss_matrix, + final GainLossStates state ) { + if ( gain_loss_matrix == null ) { + throw new RuntimeException( "no gain loss matrix has been calculated" ); + } + if ( ForesterUtil.isEmpty( node_identifier ) ) { + throw new IllegalArgumentException( "node identifier must not be empty" ); + } + if ( gain_loss_matrix.isEmpty() ) { + throw new RuntimeException( "gain loss matrix is empty" ); + } + final SortedSet d = new TreeSet(); + final int id_index = gain_loss_matrix.getIdentifierIndex( node_identifier ); + for( int c = 0; c < gain_loss_matrix.getNumberOfCharacters(); ++c ) { + if ( gain_loss_matrix.getState( id_index, c ) == state ) { + if ( d.contains( gain_loss_matrix.getCharacter( c ) ) ) { + throw new AssertionError( "this should not have happended: character [" + + gain_loss_matrix.getCharacter( c ) + "] already in set" ); + } + d.add( gain_loss_matrix.getCharacter( c ) ); + } + } + return d; + } +} diff --git a/forester/java/src/org/forester/surfacing/DomainSimilarity.java b/forester/java/src/org/forester/surfacing/DomainSimilarity.java new file mode 100644 index 0000000..bdf227c --- /dev/null +++ b/forester/java/src/org/forester/surfacing/DomainSimilarity.java @@ -0,0 +1,101 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.util.SortedMap; +import java.util.SortedSet; + +/* + * This is to represent a measure of similarity between two or more domains from + * different genomes. 
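+ *
+ * Illustrative usage sketch (the variable "similarities" is a hypothetical
+ * example, not part of this API): given a SortedSet of DomainSimilarity
+ * objects, as produced by a DomainSimilarityCalculator, per-domain summary
+ * values could be printed like this:
+ *
+ *   for( final DomainSimilarity sim : similarities ) {
+ *       System.out.println( sim.getDomainId() + " mean="
+ *               + sim.getMeanSimilarityScore() + " N=" + sim.getN() );
+ *   }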
+ */ +public interface DomainSimilarity extends Comparable { + + public SortedSet getCombinableDomainIds( final Species species_of_combinable_domain );; + + public DomainId getDomainId(); + + /** + * For pairwise similarities, this should return the "difference"; for example the difference in counts + * for copy number based features (the same as getMaximalDifferenceInCounts(), or the number + * of actually different domain combinations. + * For pairwise similarities, this should return the difference, + * while for comparisons of more than two domains, this should return the maximal difference + * + * + * + * @return + */ + public int getMaximalDifference(); + + /** + * For pairwise similarities, this should return the difference in counts, + * while for comparisons of more than two domains, this should return the maximal difference + * in counts + * + * + * @return the (maximal) difference in counts + */ + public int getMaximalDifferenceInCounts(); + + public double getMaximalSimilarityScore(); + + public double getMeanSimilarityScore(); + + public double getMinimalSimilarityScore(); + + /** + * This should return the number of pairwise distances used to calculate + * this similarity score + * + * @return the number of pairwise distances + */ + public int getN(); + + public SortedSet getSpecies(); + + /** + * This should return a map, which maps species names to + * SpeciesSpecificDomainSimilariyData + * + * + * @return SortedMap + */ + public SortedMap getSpeciesData(); + + public double getStandardDeviationOfSimilarityScore(); + + public StringBuffer toStringBuffer( final PrintableDomainSimilarity.PRINT_OPTION print_option ); + + static public enum DomainSimilarityScoring { + DOMAINS, PROTEINS, COMBINATIONS; + } + + public static enum DomainSimilaritySortField { + MIN, MAX, SD, MEAN, ABS_MAX_COUNTS_DIFFERENCE, MAX_COUNTS_DIFFERENCE, MAX_DIFFERENCE, SPECIES_COUNT, DOMAIN_ID, + } +} diff --git a/forester/java/src/org/forester/surfacing/DomainSimilarityCalculator.java b/forester/java/src/org/forester/surfacing/DomainSimilarityCalculator.java new file mode 100644 index 0000000..4a4c91f --- /dev/null +++ b/forester/java/src/org/forester/surfacing/DomainSimilarityCalculator.java @@ -0,0 +1,47 @@ +// $Id: +// $ +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.util.List; +import java.util.SortedSet; + +public interface DomainSimilarityCalculator { + + public SortedSet calculateSimilarities( final PairwiseDomainSimilarityCalculator pairwise_calculator, + final List cdc_list, + final boolean ignore_domains_without_combinations_in_any_genome, + final boolean ignore_domains_specific_to_one_genome );; + + public static enum Detailedness { + BASIC, LIST_COMBINING_DOMAIN_FOR_EACH_SPECIES, PUNCTILIOUS + } + + public static enum GoAnnotationOutput { + NONE, ALL + } +} diff --git a/forester/java/src/org/forester/surfacing/GenomeWideCombinableDomains.java b/forester/java/src/org/forester/surfacing/GenomeWideCombinableDomains.java new file mode 100644 index 0000000..e1c6cc6 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/GenomeWideCombinableDomains.java @@ -0,0 +1,79 @@ +// $Id: +// $ +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.util.SortedMap; +import java.util.SortedSet; + +import org.forester.surfacing.BinaryDomainCombination.DomainCombinationType; +import org.forester.util.DescriptiveStatistics; + +public interface GenomeWideCombinableDomains { + + public boolean contains( DomainId key_id ); + + public CombinableDomains get( DomainId key_id ); + + public SortedMap getAllCombinableDomainsIds(); + + /** + * This should return all domains ids present in the genome. + * + * @return a sorted set of domains ids + */ + public SortedSet getAllDomainIds(); + + public DomainCombinationType getDomainCombinationType(); + + SortedSet getMostPromiscuosDomain(); + + /** + * This should return a statistic for per domain + * promiscuity in a genome. + * + * @return descriptive statistics for per domain promiscuity in a genome + */ + public DescriptiveStatistics getPerGenomeDomainPromiscuityStatistics(); + + public int getSize(); + + public Species getSpecies(); + + /** + * This should return all binary domain combinations present in the genome. 
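+ *
+ * For illustration (hypothetical domains): a protein with the domain
+ * architecture A-B-C gives rise to the pairwise combinations A-B, A-C and
+ * B-C; which of these are reported, and whether their order matters,
+ * depends on the DomainCombinationType in use (e.g. DIRECTED or
+ * DIRECTED_ADJACTANT).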
+ * + * @return a sorted set of binary domain combinations + */ + public SortedSet toBinaryDomainCombinations(); + + public StringBuilder toStringBuilder( GenomeWideCombinableDomainsSortOrder order ); + + public static enum GenomeWideCombinableDomainsSortOrder { + ALPHABETICAL_KEY_ID, KEY_DOMAIN_PROTEINS_COUNT, KEY_DOMAIN_COUNT, COMBINATIONS_COUNT + } +} diff --git a/forester/java/src/org/forester/surfacing/MappingResults.java b/forester/java/src/org/forester/surfacing/MappingResults.java new file mode 100644 index 0000000..8204dbc --- /dev/null +++ b/forester/java/src/org/forester/surfacing/MappingResults.java @@ -0,0 +1,58 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org + +package org.forester.surfacing; + +public class MappingResults { + + private String _description; + private int _sum_of_successes; + private int _sum_of_failures; + + public String getDescription() { + return _description; + } + + public int getSumOfFailures() { + return _sum_of_failures; + } + + public int getSumOfSuccesses() { + return _sum_of_successes; + } + + public void setDescription( final String description ) { + _description = description; + } + + public void setSumOfFailures( final int sum_of_failures ) { + _sum_of_failures = sum_of_failures; + } + + public void setSumOfSuccesses( final int sum_of_successes ) { + _sum_of_successes = sum_of_successes; + } +} diff --git a/forester/java/src/org/forester/surfacing/PairwiseDomainSimilarity.java b/forester/java/src/org/forester/surfacing/PairwiseDomainSimilarity.java new file mode 100644 index 0000000..d1f67d0 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/PairwiseDomainSimilarity.java @@ -0,0 +1,41 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +public interface PairwiseDomainSimilarity { + + /** + * This should return the -- not normalized, not absolute -- difference in + * counts (for example domain counts) for the two domains. + * It is important that it is: (counts for domain 1) minus (counts for domain 2). + * + * @return the difference in counts + */ + public int getDifferenceInCounts(); + + public double getSimilarityScore(); +} diff --git a/forester/java/src/org/forester/surfacing/PairwiseDomainSimilarityCalculator.java b/forester/java/src/org/forester/surfacing/PairwiseDomainSimilarityCalculator.java new file mode 100644 index 0000000..f0e04ca --- /dev/null +++ b/forester/java/src/org/forester/surfacing/PairwiseDomainSimilarityCalculator.java @@ -0,0 +1,34 @@ +// $Id: +// cmzmasek Exp $ +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +public interface PairwiseDomainSimilarityCalculator { + + public PairwiseDomainSimilarity calculateSimilarity( final CombinableDomains domains_1, + final CombinableDomains domains_2 ); +} diff --git a/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java b/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java new file mode 100644 index 0000000..ae94f81 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java @@ -0,0 +1,353 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix; +import org.forester.evoinference.matrix.distance.DistanceMatrix; +import org.forester.go.GoId; +import org.forester.go.GoNameSpace; +import org.forester.go.GoTerm; +import org.forester.surfacing.DomainSimilarityCalculator.Detailedness; +import org.forester.util.DescriptiveStatistics; +import org.forester.util.ForesterUtil; + +public class PairwiseGenomeComparator { + + private List _domain_distance_scores_means; + private List _shared_domains_based_distances; + private List _shared_binary_combinations_based_distances; + + //private List _histogram_datas; + public PairwiseGenomeComparator() { + init(); + } + + public List getDomainDistanceScoresMeans() { + return _domain_distance_scores_means; + } + + //public List getHistogramDatas() { + // return _histogram_datas; + //} + public List getSharedBinaryCombinationsBasedDistances() { + return _shared_binary_combinations_based_distances; + } + + public List getSharedDomainsBasedDistances() { + return _shared_domains_based_distances; + } + + private void init() { + //_histogram_datas = new ArrayList(); + _domain_distance_scores_means = new ArrayList(); + _shared_domains_based_distances = new ArrayList(); + _shared_binary_combinations_based_distances = new ArrayList(); + } + + public void performPairwiseComparisons( final StringBuilder html_desc, + final boolean sort_by_species_count_first, + final Detailedness detailedness, + final boolean ignore_domains_without_combs_in_all_spec, + final boolean ignore_domains_specific_to_one_species, + final DomainSimilarity.DomainSimilaritySortField domain_similarity_sort_field, + final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option, + final DomainSimilarity.DomainSimilarityScoring scoring, + final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map, + final GoNameSpace go_namespace_limit, + final Species[] species, + final int number_of_genomes, + final List list_of_genome_wide_combinable_domains, + final PairwiseDomainSimilarityCalculator pw_calc, + final String automated_pairwise_comparison_suffix, + final boolean verbose, + final String automated_pairwise_comparison_prefix, + final String command_line_prg_name, + final boolean display_histograms, + final File out_dir, + final boolean write_pairwise_comparisons ) { + init(); + final BasicSymmetricalDistanceMatrix domain_distance_scores_means = new BasicSymmetricalDistanceMatrix( number_of_genomes ); + final BasicSymmetricalDistanceMatrix shared_domains_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes ); + final BasicSymmetricalDistanceMatrix shared_binary_combinations_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes ); + if ( verbose ) { + System.out.println(); + System.out.println( "Pairwise genome distances:" ); + System.out.print( "[species-i - species-j:" ); + 
System.out.print( " mean-score-based" ); + System.out.print( " (sd)" ); + System.out.print( " [N]" ); + System.out.print( " | shared-domains-based" ); + System.out.println( " | shared-binary-combinations-based]" ); + System.out.println(); + } + for( int i = 0; i < number_of_genomes; ++i ) { + final String species_i = species[ i ].getSpeciesId(); + domain_distance_scores_means.setIdentifier( i, species_i ); + shared_domains_based_distances.setIdentifier( i, species_i ); + shared_binary_combinations_based_distances.setIdentifier( i, species_i ); + if ( verbose ) { + System.out.println( ( i + 1 ) + "/" + number_of_genomes ); + } + for( int j = 0; j < i; ++j ) { + if ( ( list_of_genome_wide_combinable_domains.get( i ).getSize() < 1 ) + || ( list_of_genome_wide_combinable_domains.get( j ).getSize() < 1 ) ) { + domain_distance_scores_means + .setValue( i, j, DomainArchitectureBasedGenomeSimilarityCalculator.MAX_SIMILARITY_SCORE ); + shared_domains_based_distances + .setValue( i, j, DomainArchitectureBasedGenomeSimilarityCalculator.MAX_SIMILARITY_SCORE ); + shared_binary_combinations_based_distances + .setValue( i, j, DomainArchitectureBasedGenomeSimilarityCalculator.MAX_SIMILARITY_SCORE ); + continue; + } + final List genome_pair = new ArrayList( 2 ); + genome_pair.add( list_of_genome_wide_combinable_domains.get( i ) ); + genome_pair.add( list_of_genome_wide_combinable_domains.get( j ) ); + DomainSimilarityCalculator.GoAnnotationOutput go_annotation_output = DomainSimilarityCalculator.GoAnnotationOutput.NONE; + if ( domain_id_to_go_ids_map != null ) { + go_annotation_output = DomainSimilarityCalculator.GoAnnotationOutput.ALL; + } + final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( domain_similarity_sort_field, + sort_by_species_count_first, + true ); + final SortedSet similarities = calc + .calculateSimilarities( pw_calc, + genome_pair, + ignore_domains_without_combs_in_all_spec, + ignore_domains_specific_to_one_species ); + SurfacingUtil.decoratePrintableDomainSimilarities( similarities, + detailedness, + go_annotation_output, + go_id_to_term_map, + go_namespace_limit ); + final DescriptiveStatistics stats = SurfacingUtil + .calculateDescriptiveStatisticsForMeanValues( similarities ); + final String species_j = species[ j ].getSpeciesId(); + final DomainArchitectureBasedGenomeSimilarityCalculator genome_similarity_calculator = new DomainArchitectureBasedGenomeSimilarityCalculator( list_of_genome_wide_combinable_domains + .get( i ), + list_of_genome_wide_combinable_domains + .get( j ) ); + genome_similarity_calculator.setAllowDomainsToBeIgnored( false ); + // TODO make histos for these 5 values + double dissimilarity_score_mean; + if ( stats.getN() < 1 ) { + // No domains in common + dissimilarity_score_mean = 1.0; + } + else { + dissimilarity_score_mean = 1.0 - stats.arithmeticMean(); + } + final double shared_domains_based_genome_distance = 1.0 - genome_similarity_calculator + .calculateSharedDomainsBasedGenomeSimilarityScore(); + final double shared_binary_combinations_based_genome_distance = 1.0 - genome_similarity_calculator + .calculateSharedBinaryDomainCombinationBasedGenomeSimilarityScore(); + domain_distance_scores_means.setValue( i, j, dissimilarity_score_mean ); + shared_domains_based_distances.setValue( i, j, shared_domains_based_genome_distance ); + shared_binary_combinations_based_distances.setValue( i, + j, + shared_binary_combinations_based_genome_distance ); + if ( verbose ) { + System.out.print( species_i + "-" ); + System.out.print( species_j + ": " 
); + System.out.print( ForesterUtil.round( dissimilarity_score_mean, 2 ) ); + if ( stats.getN() > 1 ) { + System.out.print( " (" + ForesterUtil.round( stats.sampleStandardDeviation(), 2 ) + ")" ); + } + else { + System.out.print( " (n/a)" ); + } + System.out.print( " [" + stats.getN() + "]" ); + System.out.print( " | " ); + System.out.print( ForesterUtil.round( shared_domains_based_genome_distance, 2 ) ); + System.out.print( " | " ); + System.out.println( ForesterUtil.round( shared_binary_combinations_based_genome_distance, 2 ) ); + } + String pairwise_similarities_output_file_str = automated_pairwise_comparison_prefix + species_i + "_" + + species_j + automated_pairwise_comparison_suffix; + switch ( domain_similarity_print_option ) { + case HTML: + if ( !pairwise_similarities_output_file_str.endsWith( ".html" ) ) { + pairwise_similarities_output_file_str += ".html"; + } + break; + } + DescriptiveStatistics pw_stats = null; + if ( write_pairwise_comparisons ) { + try { + final Writer writer = new BufferedWriter( new FileWriter( out_dir == null ? pairwise_similarities_output_file_str + : out_dir + ForesterUtil.FILE_SEPARATOR + pairwise_similarities_output_file_str ) ); + pw_stats = SurfacingUtil.writeDomainSimilaritiesToFile( html_desc, + new StringBuilder( species_i + "-" + + species_j ), + writer, + similarities, + true, + null, + domain_similarity_print_option, + domain_similarity_sort_field, + scoring, + false ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( command_line_prg_name, "Failed to write similarites to: \"" + + pairwise_similarities_output_file_str + "\" [" + e.getMessage() + "]" ); + } + } + // pairwise_matrix.setValue( i, j, cdc_list.get( cdc_list.size() + // - 1 ) ); + if ( pw_stats != null ) { + if ( pw_stats.getMin() >= pw_stats.getMax() ) { + ForesterUtil.printWarningMessage( command_line_prg_name, "for [" + species_i + "-" + species_j + + "] score minimum is [" + pw_stats.getMin() + "] while score maximum is [" + + pw_stats.getMax() + "], possibly indicating that a genome is compared to itself" ); + } + if ( display_histograms && ( pw_stats.getMin() < pw_stats.getMax() ) ) { + //final double[] values = pw_stats.getDataAsDoubleArray(); + // List data_items = new + // ArrayList( values.length ); + // for( int n = 0; n < values.length; i++ ) { + // data_items.add( new BasicHistogramDataItem( "", values[ n ] ) + // ); + // } + //~ _histogram_datas.add( new HistogramData( species_i + "-" + species_j, values, null, 20 ) ); + } + } + } + } + getDomainDistanceScoresMeans().add( domain_distance_scores_means ); + getSharedDomainsBasedDistances().add( shared_domains_based_distances ); + getSharedBinaryCombinationsBasedDistances().add( shared_binary_combinations_based_distances ); + if ( verbose ) { + System.out.println(); + } + } + + public void performPairwiseComparisonsJacknifed( final Species[] species, + final int number_of_genomes, + final List list_of_genome_wide_combinable_domains, + final boolean verbose, + final int number_of_resamplings, + final double jacknife_ratio, + final long random_seed ) { + init(); + if ( number_of_resamplings < 2 ) { + throw new IllegalArgumentException( "attempt to perform jacknife resampling with less than 2 resamplings" ); + } + if ( jacknife_ratio <= 0.0 ) { + throw new IllegalArgumentException( "attempt to perform jacknife resampling with jacknife ratio of 0.0 or less" ); + } + else if ( jacknife_ratio >= 1.0 ) { + throw new IllegalArgumentException( "attempt to perform jacknife resampling with jacknife ratio 1.0 or more" 
); + } + final DomainId[] all_unique_domain_ids = getAllUniqueDomainIdAsArray( list_of_genome_wide_combinable_domains ); + if ( verbose ) { + System.out.println(); + System.out.println( "Jacknife: total of domains: " + all_unique_domain_ids.length ); + } + if ( verbose ) { + System.out.print( "resampling " ); + } + final Random generator = new Random( random_seed ); + for( int r = 0; r < number_of_resamplings; ++r ) { + if ( verbose ) { + System.out.print( " " + r ); + } + final SortedSet domain_ids_to_ignore = randomlyPickDomainIds( all_unique_domain_ids, + jacknife_ratio, + generator ); + final BasicSymmetricalDistanceMatrix shared_domains_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes ); + final BasicSymmetricalDistanceMatrix shared_binary_combinations_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes ); + for( int i = 0; i < number_of_genomes; ++i ) { + final String species_i = species[ i ].getSpeciesId(); + shared_domains_based_distances.setIdentifier( i, species_i ); + shared_binary_combinations_based_distances.setIdentifier( i, species_i ); + for( int j = 0; j < i; ++j ) { + final List genome_pair = new ArrayList( 2 ); + genome_pair.add( list_of_genome_wide_combinable_domains.get( i ) ); + genome_pair.add( list_of_genome_wide_combinable_domains.get( j ) ); + final DomainArchitectureBasedGenomeSimilarityCalculator genome_simiarity_calculator = new DomainArchitectureBasedGenomeSimilarityCalculator( list_of_genome_wide_combinable_domains + .get( i ), + list_of_genome_wide_combinable_domains + .get( j ) ); + genome_simiarity_calculator.setAllowDomainsToBeIgnored( true ); + genome_simiarity_calculator.setDomainIdsToIgnore( domain_ids_to_ignore ); + shared_domains_based_distances.setValue( i, j, 1.0 - genome_simiarity_calculator + .calculateSharedDomainsBasedGenomeSimilarityScore() ); + shared_binary_combinations_based_distances.setValue( i, j, 1.0 - genome_simiarity_calculator + .calculateSharedBinaryDomainCombinationBasedGenomeSimilarityScore() ); + } + } + getSharedDomainsBasedDistances().add( shared_domains_based_distances ); + getSharedBinaryCombinationsBasedDistances().add( shared_binary_combinations_based_distances ); + } + if ( verbose ) { + System.out.println(); + } + } + + static private DomainId[] getAllUniqueDomainIdAsArray( final List list_of_genome_wide_combinable_domains ) { + DomainId[] all_domain_ids_array; + final SortedSet all_domain_ids = new TreeSet(); + for( final GenomeWideCombinableDomains genome_wide_combinable_domains : list_of_genome_wide_combinable_domains ) { + final SortedSet all_domains = genome_wide_combinable_domains.getAllDomainIds(); + for( final DomainId domain : all_domains ) { + all_domain_ids.add( domain ); + } + } + all_domain_ids_array = new DomainId[ all_domain_ids.size() ]; + int n = 0; + for( final DomainId domain_id : all_domain_ids ) { + all_domain_ids_array[ n++ ] = domain_id; + } + return all_domain_ids_array; + } + + static private SortedSet randomlyPickDomainIds( final DomainId[] all_domain_ids_array, + final double jacknife_ratio, + final Random generator ) { + final int size = all_domain_ids_array.length; + final SortedSet random_domain_ids = new TreeSet(); + final int number_of_ids_pick = ForesterUtil.roundToInt( jacknife_ratio * size ); + while ( random_domain_ids.size() < number_of_ids_pick ) { + final int r = generator.nextInt( size ); + random_domain_ids.add( all_domain_ids_array[ r ] ); + } + return random_domain_ids; + } +} diff --git 
a/forester/java/src/org/forester/surfacing/PrintableDomainSimilarity.java b/forester/java/src/org/forester/surfacing/PrintableDomainSimilarity.java new file mode 100644 index 0000000..0fb075e --- /dev/null +++ b/forester/java/src/org/forester/surfacing/PrintableDomainSimilarity.java @@ -0,0 +1,717 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.util.List; +import java.util.Map; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.forester.go.GoId; +import org.forester.go.GoNameSpace; +import org.forester.go.GoTerm; +import org.forester.go.GoXRef; +import org.forester.surfacing.DomainSimilarityCalculator.Detailedness; +import org.forester.surfacing.DomainSimilarityCalculator.GoAnnotationOutput; +import org.forester.util.ForesterUtil; + +public class PrintableDomainSimilarity implements DomainSimilarity { + + final public static String SPECIES_SEPARATOR = " "; + final private static char TAB = '\t'; + final private static int BEFORE = -1; + final private static int EQUAL = 0; + final private static int AFTER = 1; + final private static String NO_SPECIES = " "; + final private double _min; + final private double _max; + final private double _mean; + final private double _sd; + final private int _n; + private final int _max_difference_in_counts; + private final int _max_difference; + private DomainSimilarityCalculator.GoAnnotationOutput _go_annotation_output; + final private CombinableDomains _combinable_domains; + final private SortedMap _species_data; + final private DomainSimilaritySortField _sort_field; + private List _species_order; + private final boolean _sort_by_species_count_first; + private DomainSimilarityCalculator.Detailedness _detailedness; + private Map _go_id_to_term_map; + private GoNameSpace _go_namespace_limit; + private final boolean _treat_as_binary_comparison; + + /** + * If go_id_to_term_map not null, detailed GO information is written, + * only GO ids otherwise. 
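+ *
+ * The constructor below also checks that n is consistent with the species
+ * data, i.e. that n equals s * ( s - 1 ) / 2 for s species (the number of
+ * pairwise comparisons); otherwise an IllegalArgumentException is thrown.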
+ * + * + */ + public PrintableDomainSimilarity( final CombinableDomains combinable_domains, + final double min, + final double max, + final double mean, + final double median, + final double sd, + final int n, + final int max_difference_in_counts, + final int max_difference, + final SortedMap species_data, + final DomainSimilaritySortField sort_field, + final boolean sort_by_species_count_first, + final boolean treat_as_binary_comparison ) { + if ( combinable_domains == null ) { + throw new IllegalArgumentException( "attempt to use null combinable domains" ); + } + if ( sort_field == null ) { + throw new IllegalArgumentException( "attempt to use null sorting" ); + } + if ( species_data == null ) { + throw new IllegalArgumentException( "attempt to use null species data" ); + } + if ( species_data.size() < 1 ) { + throw new IllegalArgumentException( "attempt to use empty species data" ); + } + if ( n < 0 ) { + throw new IllegalArgumentException( "attempt to use N less than 0" ); + } + if ( ( species_data.size() > 1 ) && ( n < 1 ) ) { + throw new IllegalArgumentException( "attempt to use N less than 1" ); + } + if ( sd < 0.0 ) { + throw new IllegalArgumentException( "attempt to use negative SD" ); + } + if ( max < min ) { + throw new IllegalArgumentException( "attempt to use max smaller than min" ); + } + init(); + _combinable_domains = combinable_domains; + _min = min; + _max = max; + _mean = mean; + _sd = sd; + _n = n; + _max_difference_in_counts = max_difference_in_counts; + _max_difference = max_difference; + _species_data = species_data; + _sort_field = sort_field; + _sort_by_species_count_first = sort_by_species_count_first; + _treat_as_binary_comparison = treat_as_binary_comparison; + final int s = species_data.size(); + if ( ( ( s * s ) - s ) != ( getN() * 2 ) ) { + throw new IllegalArgumentException( "illegal species count and n: species count:" + s + ", n:" + _n + + " for domain " + combinable_domains.getKeyDomain() ); + } + if ( s > 2 ) { + if ( getMaximalDifferenceInCounts() < 0 ) { + throw new IllegalArgumentException( "attempt to use negative max difference in counts with more than two species" ); + } + if ( getMaximalDifference() < 0 ) { + throw new IllegalArgumentException( "attempt to use negative max difference with more than two species" ); + } + } + } + + private void addGoInformation( final StringBuffer sb, final boolean for_table, final boolean html ) { + if ( !for_table ) { + sb.append( "<" ); + } + switch ( getGoAnnotationOutput() ) { + case ALL: { + final int go_ids = getCombinableDomains().getKeyDomain().getNumberOfGoIds(); + boolean first = true; + for( int i = 0; i < go_ids; ++i ) { + final GoId go_id = getCombinableDomains().getKeyDomain().getGoId( i ); + if ( getGoIdToTermMap() != null ) { + if ( getGoIdToTermMap().containsKey( go_id ) ) { + first = appendGoTerm( sb, getGoIdToTermMap().get( go_id ), first, html ); + } + else { + sb.append( "go id \"" + go_id + "\" not found [" + + getCombinableDomains().getKeyDomain().getId() + "]" ); + } + } + else { + if ( !first ) { + sb.append( ", " ); + } + if ( html ) { + sb.append( "" + go_id + "" ); + } + else { + sb.append( go_id ); + } + first = false; + } + } + break; + } + case NONE: { + break; + } + default: + throw new RuntimeException( "unknown " + getGoAnnotationOutput() ); + } + if ( !for_table ) { + sb.append( ">: " ); + } + } + + private void addSpeciesSpecificDomainData( final StringBuffer sb, final Species species, final boolean html ) { + if ( getDetaildness() != 
DomainSimilarityCalculator.Detailedness.BASIC ) { + sb.append( "[" ); + } + if ( html ) { + sb.append( "" ); + if ( ( SurfacingConstants.TAXONOMY_LINK != null ) && ( species.getSpeciesId().length() > 2 ) + && ( species.getSpeciesId().length() < 6 ) ) { + sb.append( "" + species.getSpeciesId() + "" ); + } + else { + sb.append( species.getSpeciesId() ); + } + sb.append( "" ); + } + else { + sb.append( species.getSpeciesId() ); + } + if ( getDetaildness() != DomainSimilarityCalculator.Detailedness.BASIC ) { + sb.append( ":" ); + sb.append( getSpeciesData().get( species ).toStringBuffer( getDetaildness(), html ) ); + sb.append( "]" ); + } + if ( html ) { + sb.append( "
    " ); + } + sb.append( PrintableDomainSimilarity.SPECIES_SEPARATOR ); + } + + private boolean appendGoTerm( final StringBuffer sb, final GoTerm go_term, final boolean first, final boolean html ) { + if ( ( getGoNamespaceLimit() == null ) || getGoNamespaceLimit().equals( go_term.getGoNameSpace() ) ) { + if ( !first ) { + sb.append( ", " ); + } + final GoId go_id = go_term.getGoId(); + if ( html ) { + sb.append( "" + go_id + + "" ); + } + else { + sb.append( go_id ); + } + sb.append( ":" ); + sb.append( go_term.getName() ); + if ( !html ) { + if ( getGoNamespaceLimit() == null ) { + sb.append( ":" ); + sb.append( go_term.getGoNameSpace().toString() ); + } + for( final GoXRef xref : go_term.getGoXRefs() ) { + sb.append( ":" ); + sb.append( xref.toString() ); + } + } + return false; + } + return true; + } + + private void boldEndIfSortedBy( final DomainSimilaritySortField sort_field, final StringBuffer sb ) { + if ( getSortField() == sort_field ) { + sb.append( "" ); + } + } + + private void boldStartIfSortedBy( final DomainSimilaritySortField sort_field, final StringBuffer sb ) { + if ( getSortField() == sort_field ) { + sb.append( "" ); + } + } + + private int compareByDomainId( final DomainSimilarity other ) { + return getDomainId().compareTo( other.getDomainId() ); + } + + private int compareBySpeciesCount( final DomainSimilarity domain_similarity ) { + final int s_this = getSpeciesData().size(); + final int s_other = domain_similarity.getSpeciesData().size(); + if ( s_this < s_other ) { + return PrintableDomainSimilarity.BEFORE; + } + else if ( s_this > s_other ) { + return PrintableDomainSimilarity.AFTER; + } + else { + return PrintableDomainSimilarity.EQUAL; + } + } + + public int compareTo( final DomainSimilarity domain_similarity ) { + if ( this == domain_similarity ) { + return PrintableDomainSimilarity.EQUAL; + } + else if ( domain_similarity == null ) { + throw new IllegalArgumentException( "attempt to compare " + this.getClass() + " to null" ); + } + else if ( domain_similarity.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to compare " + this.getClass() + " to " + + domain_similarity.getClass() ); + } + switch ( getSortField() ) { + case MIN: + if ( isSortBySpeciesCountFirst() ) { + final int i = compareBySpeciesCount( domain_similarity ); + if ( i != PrintableDomainSimilarity.EQUAL ) { + return i; + } + } + if ( getMinimalSimilarityScore() < domain_similarity.getMinimalSimilarityScore() ) { + return PrintableDomainSimilarity.BEFORE; + } + else if ( getMinimalSimilarityScore() > domain_similarity.getMinimalSimilarityScore() ) { + return PrintableDomainSimilarity.AFTER; + } + else { + return compareByDomainId( domain_similarity ); + } + case MAX: + if ( isSortBySpeciesCountFirst() ) { + final int i = compareBySpeciesCount( domain_similarity ); + if ( i != PrintableDomainSimilarity.EQUAL ) { + return i; + } + } + if ( getMaximalSimilarityScore() < domain_similarity.getMaximalSimilarityScore() ) { + return PrintableDomainSimilarity.BEFORE; + } + else if ( getMaximalSimilarityScore() > domain_similarity.getMaximalSimilarityScore() ) { + return PrintableDomainSimilarity.AFTER; + } + else { + return compareByDomainId( domain_similarity ); + } + case MEAN: + if ( isSortBySpeciesCountFirst() ) { + final int i = compareBySpeciesCount( domain_similarity ); + if ( i != PrintableDomainSimilarity.EQUAL ) { + return i; + } + } + if ( getMeanSimilarityScore() < domain_similarity.getMeanSimilarityScore() ) { + return PrintableDomainSimilarity.BEFORE; + } + 
else if ( getMeanSimilarityScore() > domain_similarity.getMeanSimilarityScore() ) { + return PrintableDomainSimilarity.AFTER; + } + else { + return compareByDomainId( domain_similarity ); + } + case SD: + if ( isSortBySpeciesCountFirst() ) { + final int i = compareBySpeciesCount( domain_similarity ); + if ( i != PrintableDomainSimilarity.EQUAL ) { + return i; + } + } + if ( getStandardDeviationOfSimilarityScore() < domain_similarity + .getStandardDeviationOfSimilarityScore() ) { + return PrintableDomainSimilarity.BEFORE; + } + else if ( getStandardDeviationOfSimilarityScore() > domain_similarity + .getStandardDeviationOfSimilarityScore() ) { + return PrintableDomainSimilarity.AFTER; + } + else { + return compareByDomainId( domain_similarity ); + } + case MAX_DIFFERENCE: + if ( isSortBySpeciesCountFirst() ) { + final int i = compareBySpeciesCount( domain_similarity ); + if ( i != PrintableDomainSimilarity.EQUAL ) { + return i; + } + } + if ( getMaximalDifference() > domain_similarity.getMaximalDifference() ) { + return PrintableDomainSimilarity.BEFORE; + } + else if ( getMaximalDifference() < domain_similarity.getMaximalDifference() ) { + return PrintableDomainSimilarity.AFTER; + } + else { + return compareByDomainId( domain_similarity ); + } + case ABS_MAX_COUNTS_DIFFERENCE: + if ( isSortBySpeciesCountFirst() ) { + final int i = compareBySpeciesCount( domain_similarity ); + if ( i != PrintableDomainSimilarity.EQUAL ) { + return i; + } + } + if ( Math.abs( getMaximalDifferenceInCounts() ) > Math.abs( domain_similarity + .getMaximalDifferenceInCounts() ) ) { + return PrintableDomainSimilarity.BEFORE; + } + else if ( Math.abs( getMaximalDifferenceInCounts() ) < Math.abs( domain_similarity + .getMaximalDifferenceInCounts() ) ) { + return PrintableDomainSimilarity.AFTER; + } + else { + return compareByDomainId( domain_similarity ); + } + case MAX_COUNTS_DIFFERENCE: + if ( getSpeciesData().size() != 2 ) { + throw new RuntimeException( "attempt to sort by maximal difference with species not equal to two" ); + } + if ( isSortBySpeciesCountFirst() ) { + final int i = compareBySpeciesCount( domain_similarity ); + if ( i != PrintableDomainSimilarity.EQUAL ) { + return i; + } + } + if ( getMaximalDifferenceInCounts() > domain_similarity.getMaximalDifferenceInCounts() ) { + return PrintableDomainSimilarity.BEFORE; + } + else if ( getMaximalDifferenceInCounts() < domain_similarity.getMaximalDifferenceInCounts() ) { + return PrintableDomainSimilarity.AFTER; + } + else { + return compareByDomainId( domain_similarity ); + } + case SPECIES_COUNT: + final int i = compareBySpeciesCount( domain_similarity ); + if ( i != PrintableDomainSimilarity.EQUAL ) { + return i; + } + else { + return compareByDomainId( domain_similarity ); + } + case DOMAIN_ID: + return compareByDomainId( domain_similarity ); + } + throw new AssertionError( "Unknown sort method: " + getSortField() ); + } + + public SortedSet getCombinableDomainIds( final Species species_of_combinable_domain ) { + final SortedSet sorted_ids = new TreeSet(); + if ( getSpeciesData().containsKey( species_of_combinable_domain ) ) { + for( final DomainId id : getSpeciesData().get( species_of_combinable_domain ) + .getCombinableDomainIdToCountsMap().keySet() ) { + sorted_ids.add( id ); + } + } + return sorted_ids; + } + + private CombinableDomains getCombinableDomains() { + return _combinable_domains; + } + + private DomainSimilarityCalculator.Detailedness getDetaildness() { + return _detailedness; + } + + public DomainId getDomainId() { + return 
getCombinableDomains().getKeyDomain(); + } + + private DomainSimilarityCalculator.GoAnnotationOutput getGoAnnotationOutput() { + return _go_annotation_output; + } + + private Map getGoIdToTermMap() { + return _go_id_to_term_map; + } + + public GoNameSpace getGoNamespaceLimit() { + return _go_namespace_limit; + } + + public int getMaximalDifference() { + return _max_difference; + } + + @Override + public int getMaximalDifferenceInCounts() { + return _max_difference_in_counts; + } + + public double getMaximalSimilarityScore() { + return _max; + } + + public double getMeanSimilarityScore() { + return _mean; + } + + public double getMinimalSimilarityScore() { + return _min; + } + + public int getN() { + return _n; + } + + private DomainSimilaritySortField getSortField() { + return _sort_field; + } + + public SortedSet getSpecies() { + final SortedSet species = new TreeSet(); + for( final Species s : getSpeciesData().keySet() ) { + species.add( s ); + } + return species; + } + + public List getSpeciesCustomOrder() { + return _species_order; + } + + public SortedMap getSpeciesData() { + return _species_data; + } + + private StringBuffer getSpeciesDataInAlphabeticalOrder( final boolean html ) { + final StringBuffer sb = new StringBuffer(); + for( final Species species : getSpeciesData().keySet() ) { + addSpeciesSpecificDomainData( sb, species, html ); + } + return sb; + } + + private StringBuffer getSpeciesDataInCustomOrder( final boolean html ) { + final StringBuffer sb = new StringBuffer(); + for( final Species order_species : getSpeciesCustomOrder() ) { + if ( getSpeciesData().keySet().contains( order_species ) ) { + addSpeciesSpecificDomainData( sb, order_species, html ); + } + else { + sb.append( PrintableDomainSimilarity.NO_SPECIES ); + sb.append( PrintableDomainSimilarity.SPECIES_SEPARATOR ); + } + } + return sb; + } + + public double getStandardDeviationOfSimilarityScore() { + return _sd; + } + + private void init() { + _detailedness = DomainSimilarityCalculator.Detailedness.PUNCTILIOUS; + _go_annotation_output = null; + _go_id_to_term_map = null; + } + + private boolean isSortBySpeciesCountFirst() { + return _sort_by_species_count_first; + } + + private boolean isTreatAsBinaryComparison() { + return _treat_as_binary_comparison; + } + + public void setDetailedness( final Detailedness detailedness ) { + _detailedness = detailedness; + } + + public void setGoAnnotationOutput( final GoAnnotationOutput go_annotation_output ) { + _go_annotation_output = go_annotation_output; + } + + public void setGoIdToTermMap( final Map go_id_to_term_map ) { + _go_id_to_term_map = go_id_to_term_map; + } + + public void setGoNamespaceLimit( final GoNameSpace go_namespace_limit ) { + _go_namespace_limit = go_namespace_limit; + } + + public void setSpeciesOrder( final List species_order ) { + if ( !species_order.containsAll( getSpeciesData().keySet() ) ) { + throw new IllegalArgumentException( "list to order species must contain all species of multiple combinable domains similarity" ); + } + _species_order = species_order; + } + + @Override + public String toString() { + return toStringBuffer( null ).toString(); + } + + public StringBuffer toStringBuffer( final PrintableDomainSimilarity.PRINT_OPTION print_option ) { + switch ( print_option ) { + case SIMPLE_TAB_DELIMITED: + return toStringBufferSimpleTabDelimited(); + case HTML: + return toStringBufferDetailedHTML(); + default: + throw new AssertionError( "Unknown print option: " + print_option ); + } + } + + private StringBuffer toStringBufferDetailedHTML() { 
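+ // Builds the detailed HTML representation of this similarity: the domain
+ // id, the mean similarity score (plus standard deviation and [min,max]
+ // range unless this is treated as a binary comparison), the maximal
+ // difference, the maximal difference in counts, the species count (for
+ // non-binary comparisons), optional GO annotation, and the per-species
+ // data in alphabetical or custom species order; the value corresponding
+ // to the current sort field is emphasized via boldStartIfSortedBy().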
+ final StringBuffer sb = new StringBuffer(); + sb.append( "" ); + sb.append( "" ); + boldStartIfSortedBy( DomainSimilaritySortField.DOMAIN_ID, sb ); + sb.append( "" + getDomainId() + + "" ); + boldEndIfSortedBy( DomainSimilaritySortField.DOMAIN_ID, sb ); + sb.append( "" ); + sb.append( "" ); + boldStartIfSortedBy( DomainSimilaritySortField.MEAN, sb ); + sb.append( ForesterUtil.round( getMeanSimilarityScore(), 3 ) ); + boldEndIfSortedBy( DomainSimilaritySortField.MEAN, sb ); + sb.append( "" ); + if ( !isTreatAsBinaryComparison() ) { + sb.append( "" ); + sb.append( "(" ); + boldStartIfSortedBy( DomainSimilaritySortField.SD, sb ); + sb.append( ForesterUtil.round( getStandardDeviationOfSimilarityScore(), 3 ) ); + boldEndIfSortedBy( DomainSimilaritySortField.SD, sb ); + sb.append( ")" ); + sb.append( "" ); + sb.append( "" ); + sb.append( "[" ); + boldStartIfSortedBy( DomainSimilaritySortField.MIN, sb ); + sb.append( ForesterUtil.round( getMinimalSimilarityScore(), 3 ) ); + boldEndIfSortedBy( DomainSimilaritySortField.MIN, sb ); + sb.append( "," ); + boldStartIfSortedBy( DomainSimilaritySortField.MAX, sb ); + sb.append( ForesterUtil.round( getMaximalSimilarityScore(), 3 ) ); + boldEndIfSortedBy( DomainSimilaritySortField.MAX, sb ); + sb.append( "]" ); + sb.append( "" ); + } + sb.append( "" ); + boldStartIfSortedBy( DomainSimilaritySortField.MAX_DIFFERENCE, sb ); + sb.append( getMaximalDifference() ); + boldEndIfSortedBy( DomainSimilaritySortField.MAX_DIFFERENCE, sb ); + sb.append( "" ); + sb.append( "" ); + if ( isTreatAsBinaryComparison() ) { + boldStartIfSortedBy( DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE, sb ); + boldStartIfSortedBy( DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE, sb ); + sb.append( getMaximalDifferenceInCounts() ); + boldEndIfSortedBy( DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE, sb ); + boldEndIfSortedBy( DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE, sb ); + } + else { + boldStartIfSortedBy( DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE, sb ); + boldStartIfSortedBy( DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE, sb ); + sb.append( Math.abs( getMaximalDifferenceInCounts() ) ); + boldEndIfSortedBy( DomainSimilaritySortField.ABS_MAX_COUNTS_DIFFERENCE, sb ); + boldEndIfSortedBy( DomainSimilaritySortField.MAX_COUNTS_DIFFERENCE, sb ); + } + sb.append( "" ); + if ( !isTreatAsBinaryComparison() ) { + sb.append( "" ); + if ( ( getSortField() == DomainSimilaritySortField.SPECIES_COUNT ) || isSortBySpeciesCountFirst() ) { + sb.append( "" ); + } + sb.append( getSpeciesData().size() ); + if ( ( getSortField() == DomainSimilaritySortField.SPECIES_COUNT ) || isSortBySpeciesCountFirst() ) { + sb.append( "" ); + } + sb.append( "" ); + } + if ( getGoAnnotationOutput() != DomainSimilarityCalculator.GoAnnotationOutput.NONE ) { + sb.append( "" ); + addGoInformation( sb, true, true ); + sb.append( "" ); + } + if ( ( getSpeciesCustomOrder() == null ) || getSpeciesCustomOrder().isEmpty() ) { + sb.append( "" ); + sb.append( getSpeciesDataInAlphabeticalOrder( true ) ); + sb.append( "" ); + } + else { + sb.append( "" ); + sb.append( getSpeciesDataInCustomOrder( true ) ); + sb.append( "" ); + } + sb.append( "" ); + return sb; + } + + private StringBuffer toStringBufferSimpleTabDelimited() { + final StringBuffer sb = new StringBuffer(); + sb.append( getDomainId() ); + switch ( getSortField() ) { + case MIN: + sb.append( TAB ); + sb.append( ForesterUtil.round( getMinimalSimilarityScore(), 3 ) ); + break; + case MAX: + sb.append( TAB ); + sb.append( ForesterUtil.round(
getMaximalSimilarityScore(), 3 ) ); + break; + case MEAN: + sb.append( TAB ); + sb.append( ForesterUtil.round( getMeanSimilarityScore(), 3 ) ); + break; + case SD: + sb.append( TAB ); + sb.append( ForesterUtil.round( getStandardDeviationOfSimilarityScore(), 3 ) ); + break; + case MAX_DIFFERENCE: + sb.append( TAB ); + sb.append( getMaximalDifference() ); + case ABS_MAX_COUNTS_DIFFERENCE: + case MAX_COUNTS_DIFFERENCE: + sb.append( TAB ); + if ( isTreatAsBinaryComparison() ) { + sb.append( getMaximalDifferenceInCounts() ); + } + else { + sb.append( Math.abs( getMaximalDifferenceInCounts() ) ); + } + break; + case SPECIES_COUNT: + sb.append( TAB ); + sb.append( getSpeciesData().size() ); + break; + case DOMAIN_ID: + break; + default: + throw new AssertionError( "Unknown sort method: " + getSortField() ); + } + if ( getGoAnnotationOutput() != DomainSimilarityCalculator.GoAnnotationOutput.NONE ) { + sb.append( TAB ); + addGoInformation( sb, true, false ); + } + return sb; + } + + public static enum PRINT_OPTION { + SIMPLE_TAB_DELIMITED, HTML; + } +} diff --git a/forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDomainSimilariyData.java b/forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDomainSimilariyData.java new file mode 100644 index 0000000..9c36890 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDomainSimilariyData.java @@ -0,0 +1,141 @@ +// $Id: +// 22:09:42 cmzmasek Exp $ +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; + +import org.forester.util.DescriptiveStatistics; + +class PrintableSpeciesSpecificDomainSimilariyData implements SpeciesSpecificDomainSimilariyData { + + private final static NumberFormat FORMATTER = new DecimalFormat( "0.0E0" ); + final SortedMap _combinable_domain_id_to_count_map; + final private int _key_domain_proteins_count; + final private int _key_domain_domains_count; + final private int _combinable_domains_count; + final private DescriptiveStatistics _key_domain_confidence_descriptive_statistics; + + public PrintableSpeciesSpecificDomainSimilariyData( final int key_domain_proteins_count, + final int key_domain_domains_count, + final int combinable_domains, + final DescriptiveStatistics key_domain_confidence_descriptive_statistics ) { + _key_domain_proteins_count = key_domain_proteins_count; + _key_domain_domains_count = key_domain_domains_count; + _combinable_domains_count = combinable_domains; + _key_domain_confidence_descriptive_statistics = key_domain_confidence_descriptive_statistics; + _combinable_domain_id_to_count_map = new TreeMap(); + } + + public void addProteinsExhibitingCombinationCount( final DomainId domain_id, final int count ) { + if ( getCombinableDomainIdToCountsMap().containsKey( domain_id ) ) { + throw new IllegalArgumentException( "Domain with id " + domain_id + " already exists" ); + } + getCombinableDomainIdToCountsMap().put( domain_id, count ); + } + + public SortedMap getCombinableDomainIdToCountsMap() { + return _combinable_domain_id_to_count_map; + } + + private int getCombinableDomainsCount() { + return _combinable_domains_count; + } + + private DescriptiveStatistics getKeyDomainConfidenceDescriptiveStatistics() { + return _key_domain_confidence_descriptive_statistics; + } + + private int getKeyDomainDomainsCount() { + return _key_domain_domains_count; + } + + private int getKeyDomainProteinsCount() { + return _key_domain_proteins_count; + } + + public int getNumberOfProteinsExhibitingCombinationWith( final DomainId domain_id ) { + if ( !getCombinableDomainIdToCountsMap().containsKey( domain_id ) ) { + throw new IllegalArgumentException( "Domain with id " + domain_id + " not found" ); + } + return getCombinableDomainIdToCountsMap().get( domain_id ); + } + + @Override + public String toString() { + return toStringBuffer( DomainSimilarityCalculator.Detailedness.LIST_COMBINING_DOMAIN_FOR_EACH_SPECIES, false ) + .toString(); + } + + public StringBuffer toStringBuffer( final DomainSimilarityCalculator.Detailedness detailedness, final boolean html ) { + final StringBuffer sb = new StringBuffer(); + if ( detailedness == DomainSimilarityCalculator.Detailedness.PUNCTILIOUS ) { + sb.append( " " ); + sb.append( getKeyDomainDomainsCount() ); + sb.append( ", " ); + sb.append( getKeyDomainProteinsCount() ); + sb.append( ", " ); + sb.append( getCombinableDomainsCount() ); + sb.append( ", " ); + if ( html ) { + sb.append( "" ); + } + sb.append( FORMATTER.format( getKeyDomainConfidenceDescriptiveStatistics().arithmeticMean() ) ); + if ( html ) { + sb.append( "" ); + } + if ( !getCombinableDomainIdToCountsMap().isEmpty() ) { + sb.append( ":" ); + } + } + final Set ids = getCombinableDomainIdToCountsMap().keySet(); + int i = 0; + for( final DomainId domain_id : ids ) { + ++i; + sb.append( " " ); + if ( html ) { + sb.append( "" + + domain_id.getId() + "" 
); + } + else { + sb.append( domain_id.getId() ); + } + if ( detailedness == DomainSimilarityCalculator.Detailedness.PUNCTILIOUS ) { + sb.append( ":" ); + sb.append( getCombinableDomainIdToCountsMap().get( domain_id ) ); + } + if ( i < ids.size() ) { + sb.append( "," ); + } + } + return sb; + } +} diff --git a/forester/java/src/org/forester/surfacing/Protein.java b/forester/java/src/org/forester/surfacing/Protein.java new file mode 100644 index 0000000..ecfd1e8 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/Protein.java @@ -0,0 +1,68 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.util.List; + +public interface Protein { + + public void addProteinDomain( final Domain protein_domain ); + + /** + * If in_nc_order is set to true, this should return true if and only if + * the order in List 'domains' and this protein (as determined by the start positions + * of the domains of this protein, _not_ by their index) are the same + * (interspersed 'other' domains in this protein are ignored). + * If in_nc_order is set to false, this should return true if and only if + * this contains all domains listed in 'domains' (order and count do not matter). + * + * @param domains a list of domain ids in a certain order. + * @param in_nc_order to consider order + * @return + */ + public boolean contains( final List domains, final boolean in_nc_order ); + + public String getAccession(); + + public String getDescription(); + + public String getName(); + + public int getNumberOfProteinDomains(); + + public Domain getProteinDomain( final int index ); + + public int getProteinDomainCount( final DomainId domain_id ); + + public List getProteinDomains(); + + public List getProteinDomains( final DomainId domain_id ); + + public ProteinId getProteinId(); + + public Species getSpecies(); +} \ No newline at end of file diff --git a/forester/java/src/org/forester/surfacing/ProteinCountsBasedPairwiseDomainSimilarityCalculator.java b/forester/java/src/org/forester/surfacing/ProteinCountsBasedPairwiseDomainSimilarityCalculator.java new file mode 100644 index 0000000..d61252c --- /dev/null +++ b/forester/java/src/org/forester/surfacing/ProteinCountsBasedPairwiseDomainSimilarityCalculator.java @@ -0,0 +1,41 @@ +// $Id: +// 22:05:28 cmzmasek Exp $ +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M.
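The contract of contains( domains, in_nc_order ) above is easiest to see on a small example. The sketch below is not part of this patch; it restates one plausible reading of that contract, with plain strings standing in for forester's Domain/DomainId types: an ordered query must appear as a subsequence of the protein's domains in N- to C-terminal order, while an unordered query only needs simple membership.

import java.util.Arrays;
import java.util.List;

public final class ContainsContractSketch {

    // in_nc_order == false: every queried id must occur somewhere in the protein.
    static boolean containsUnordered( final List<String> protein_domains_n_to_c, final List<String> query ) {
        return protein_domains_n_to_c.containsAll( query );
    }

    // in_nc_order == true: the queried ids must appear in the same relative order
    // as in the protein; interspersed 'other' domains are skipped over.
    static boolean containsInNcOrder( final List<String> protein_domains_n_to_c, final List<String> query ) {
        int q = 0;
        for( final String d : protein_domains_n_to_c ) {
            if ( ( q < query.size() ) && d.equals( query.get( q ) ) ) {
                ++q;
            }
        }
        return q == query.size();
    }

    public static void main( final String[] args ) {
        final List<String> protein = Arrays.asList( "A", "X", "B", "C" ); // N- to C-terminal
        System.out.println( containsInNcOrder( protein, Arrays.asList( "A", "B" ) ) ); // true
        System.out.println( containsInNcOrder( protein, Arrays.asList( "B", "A" ) ) ); // false
        System.out.println( containsUnordered( protein, Arrays.asList( "B", "A" ) ) ); // true
    }
}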
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +public class ProteinCountsBasedPairwiseDomainSimilarityCalculator implements PairwiseDomainSimilarityCalculator { + + public PairwiseDomainSimilarity calculateSimilarity( final CombinableDomains domains_1, + final CombinableDomains domains_2 ) { + if ( !domains_1.getKeyDomain().equals( domains_2.getKeyDomain() ) ) { + throw new IllegalArgumentException( "attempt to calculate similarity between domain collection with different keys" ); + } + final int pc1 = domains_1.getKeyDomainProteinsCount(); + final int pc2 = domains_2.getKeyDomainProteinsCount(); + return new CountsBasedPairwiseDomainSimilarity( pc1 - pc2, pc1 + pc2 ); + } +} diff --git a/forester/java/src/org/forester/surfacing/ProteinId.java b/forester/java/src/org/forester/surfacing/ProteinId.java new file mode 100644 index 0000000..afaf661 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/ProteinId.java @@ -0,0 +1,80 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import org.forester.util.ForesterUtil; + +public class ProteinId implements Comparable { + + final private String _id; + + public ProteinId( final String id ) { + if ( ForesterUtil.isEmpty( id ) ) { + throw new IllegalArgumentException( "attempt to create new protein id from empty or null string" ); + } + _id = id.trim(); + } + + @Override + public int compareTo( final ProteinId protein_id ) { + if ( this == protein_id ) { + return 0; + } + return getId().toLowerCase().compareTo( protein_id.getId().toLowerCase() ); + } + + @Override + public boolean equals( final Object o ) { + if ( this == o ) { + return true; + } + else if ( o == null ) { + throw new IllegalArgumentException( "attempt to check protein id equality to null" ); + } + else if ( o.getClass() != this.getClass() ) { + throw new IllegalArgumentException( "attempt to check protein id equality to " + o + " [" + o.getClass() + + "]" ); + } + else { + return getId().equals( ( ( ProteinId ) o ).getId() ); + } + } + + public String getId() { + return _id; + } + + @Override + public int hashCode() { + return getId().hashCode(); + } + + @Override + public String toString() { + return getId(); + } +} diff --git a/forester/java/src/org/forester/surfacing/SimpleDomain.java b/forester/java/src/org/forester/surfacing/SimpleDomain.java new file mode 100644 index 0000000..76d73bf --- /dev/null +++ b/forester/java/src/org/forester/surfacing/SimpleDomain.java @@ -0,0 +1,122 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import org.forester.go.GoId; +import org.forester.util.ForesterUtil; + +/* + * A limited implementation of Domain. Its intended use is for when only a + * domain identifier is needed. Not intended for general use.
+ */ +public class SimpleDomain implements Domain { + + final private DomainId _id; + + public SimpleDomain( final String id_str ) { + if ( ForesterUtil.isEmpty( id_str ) ) { + throw new IllegalArgumentException( "attempt to create protein domain with null or empty id" ); + } + _id = new DomainId( id_str ); + } + + @Override + public void addGoId( final GoId go_id ) { + throw new RuntimeException( "method not implemented" ); + } + + public int compareTo( final Domain domain ) { + if ( this == domain ) { + return 0; + } + return getDomainId().compareTo( domain.getDomainId() ); + } + + public DomainId getDomainId() { + return _id; + } + + public int getFrom() { + throw new RuntimeException( "method not implemented" ); + } + + @Override + public GoId getGoId( final int i ) { + throw new RuntimeException( "method not implemented" ); + } + + public int getLength() { + throw new RuntimeException( "method not implemented" ); + } + + public short getNumber() { + throw new RuntimeException( "method not implemented" ); + } + + @Override + public int getNumberOfGoIds() { + throw new RuntimeException( "method not implemented" ); + } + + @Override + public double getPerDomainEvalue() { + throw new RuntimeException( "method not implemented" ); + } + + @Override + public double getPerDomainScore() { + throw new RuntimeException( "method not implemented" ); + } + + public double getPerSequenceEvalue() { + throw new RuntimeException( "method not implemented" ); + } + + public double getPerSequenceScore() { + throw new RuntimeException( "method not implemented" ); + } + + public String getSearchParameter() { + throw new RuntimeException( "method not implemented" ); + } + + public int getTo() { + throw new RuntimeException( "method not implemented" ); + } + + public short getTotalCount() { + throw new RuntimeException( "method not implemented" ); + } + + public boolean isCompleteQueryMatch() { + throw new RuntimeException( "method not implemented" ); + } + + public boolean isCompleteTargetMatch() { + throw new RuntimeException( "method not implemented" ); + } +} diff --git a/forester/java/src/org/forester/surfacing/Species.java b/forester/java/src/org/forester/surfacing/Species.java new file mode 100644 index 0000000..fb387f6 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/Species.java @@ -0,0 +1,32 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
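Because SimpleDomain deliberately supports nothing beyond its identifier, typical use is limited to constructing it and comparing or looking it up by id. A minimal sketch, assuming the org.forester.surfacing classes introduced by this patch are on the classpath and using made-up Pfam-style ids:

import org.forester.surfacing.SimpleDomain;

public final class SimpleDomainSketch {

    public static void main( final String[] args ) {
        // identifier-only placeholders; position and score accessors would throw
        final SimpleDomain kinase = new SimpleDomain( "Pkinase" );
        final SimpleDomain sh2 = new SimpleDomain( "SH2" );
        System.out.println( kinase.getDomainId().getId() );  // "Pkinase"
        System.out.println( kinase.compareTo( sh2 ) != 0 );  // true: ordered via their DomainIds
        // kinase.getFrom();  // would throw RuntimeException( "method not implemented" )
    }
}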
com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +public interface Species extends Comparable { + + public abstract String getSpeciesId(); +} \ No newline at end of file diff --git a/forester/java/src/org/forester/surfacing/SpeciesSpecificDomainSimilariyData.java b/forester/java/src/org/forester/surfacing/SpeciesSpecificDomainSimilariyData.java new file mode 100644 index 0000000..8c5908d --- /dev/null +++ b/forester/java/src/org/forester/surfacing/SpeciesSpecificDomainSimilariyData.java @@ -0,0 +1,50 @@ +// $Id: +// cmzmasek Exp $ +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.util.SortedMap; + +/* + * A helper class for PrintableDomainSimilarity. + */ +interface SpeciesSpecificDomainSimilariyData { + + public void addProteinsExhibitingCombinationCount( final DomainId domain_id, final int count ); + + /** + * This should return a sorted map mapping domain ids to their corresponding + * counts + * + * @return a sorted map mapping domain ids to their corresponding counts + */ + public SortedMap getCombinableDomainIdToCountsMap(); + + public int getNumberOfProteinsExhibitingCombinationWith( final DomainId domain_id ); + + public StringBuffer toStringBuffer( final DomainSimilarityCalculator.Detailedness detailedness, boolean html ); +} diff --git a/forester/java/src/org/forester/surfacing/SurfacingConstants.java b/forester/java/src/org/forester/surfacing/SurfacingConstants.java new file mode 100644 index 0000000..6bfd208 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/SurfacingConstants.java @@ -0,0 +1,48 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import org.forester.util.ForesterUtil; + +public class SurfacingConstants { + + public static final String GOOGLE_WEB_SEARCH_LINK = "http://www.google.com/search?q="; + public static final String GOOGLE_SCHOLAR_LINK = "http://scholar.google.com/scholar?q="; + public static final String GOOGLE_SCHOLAR_LIMITS = "&as_subj=bio&as_subj=med&as_subj=chm&num=100"; + public static final String AMIGO_LINK = "http://amigo.geneontology.org/cgi-bin/amigo/go.cgi?view=details&search_constraint=terms&query="; + public static final String PFAM_FAMILY_ID_LINK = "http://pfam.sanger.ac.uk/family?id="; + public static final String NL = ForesterUtil.LINE_SEPARATOR; + public static final String TAXONOMY_LINK = "http://beta.uniprot.org/taxonomy/?query="; + static final boolean SECONDARY_FEATURES_ARE_SCOP = true; + static final String SECONDARY_FEATURES_SCOP_LINK = "http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi?key="; + public static final String NONE = "[none]"; + public static final String UNIPROT_LINK = "http://beta.uniprot.org/taxonomy/?query="; + public static final String GO_LINK = "http://amigo.geneontology.org/cgi-bin/amigo/go.cgi?view=details&search_constraint=terms&query="; + public static final String EOL_LINK = "http://www.eol.org/search?q="; + public static final String TOL_LINK = "http://www.googlesyndicatedsearch.com/u/TreeofLife?q="; + public static final String WIKIPEDIA_LINK = "http://wikipedia.org/wiki/"; +} diff --git a/forester/java/src/org/forester/surfacing/SurfacingUtil.java b/forester/java/src/org/forester/surfacing/SurfacingUtil.java new file mode 100644 index 0000000..119d014 --- /dev/null +++ b/forester/java/src/org/forester/surfacing/SurfacingUtil.java @@ -0,0 +1,2414 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.Writer; +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.forester.application.surfacing_old; +import org.forester.evoinference.distance.NeighborJoining; +import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix; +import org.forester.evoinference.matrix.character.CharacterStateMatrix; +import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates; +import org.forester.evoinference.matrix.character.CharacterStateMatrix.Format; +import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates; +import org.forester.evoinference.matrix.distance.DistanceMatrix; +import org.forester.go.GoId; +import org.forester.go.GoNameSpace; +import org.forester.go.GoTerm; +import org.forester.go.GoUtils; +import org.forester.go.PfamToGoMapping; +import org.forester.io.parsers.nexus.NexusConstants; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.BinaryCharacters; +import org.forester.phylogeny.data.Confidence; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.surfacing.DomainSimilarityCalculator.Detailedness; +import org.forester.surfacing.DomainSimilarityCalculator.GoAnnotationOutput; +import org.forester.surfacing.GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder; +import org.forester.util.AsciiHistogram; +import org.forester.util.BasicDescriptiveStatistics; +import org.forester.util.BasicTable; +import org.forester.util.BasicTableParser; +import org.forester.util.DescriptiveStatistics; +import org.forester.util.ForesterUtil; + +public final class SurfacingUtil { + + private final static NumberFormat FORMATTER = new DecimalFormat( "0.0E0" ); + private final static NumberFormat FORMATTER_3 = new DecimalFormat( "0.000" ); + private static final Comparator ASCENDING_CONFIDENCE_VALUE_ORDER = new Comparator() { + + public int compare( final Domain d1, + final Domain d2 ) { + if ( d1.getPerSequenceEvalue() < d2 + .getPerSequenceEvalue() ) { + return -1; + } + else if ( d1 + .getPerSequenceEvalue() > d2 + .getPerSequenceEvalue() ) { + return 1; + } + else { + return d1.compareTo( d2 ); + } + } + }; + public final static Pattern PATTERN_SP_STYLE_TAXONOMY = Pattern.compile( "^[A-Z0-9]{3,5}$" ); + + private SurfacingUtil() { + // Hidden constructor. 
+ } + + public static void addAllBinaryDomainCombinationToSet( final GenomeWideCombinableDomains genome, + final SortedSet binary_domain_combinations ) { + final SortedMap all_cd = genome.getAllCombinableDomainsIds(); + for( final DomainId domain_id : all_cd.keySet() ) { + binary_domain_combinations.addAll( all_cd.get( domain_id ).toBinaryDomainCombinations() ); + } + } + + public static void addAllDomainIdsToSet( final GenomeWideCombinableDomains genome, + final SortedSet domain_ids ) { + final SortedSet domains = genome.getAllDomainIds(); + for( final DomainId domain : domains ) { + domain_ids.add( domain ); + } + } + + public static void addHtmlHead( final Writer w, final String title ) throws IOException { + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( "" ); + w.write( title ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + } + + public static DescriptiveStatistics calculateDescriptiveStatisticsForMeanValues( final Set similarities ) { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( final DomainSimilarity similarity : similarities ) { + stats.addValue( similarity.getMeanSimilarityScore() ); + } + return stats; + } + + public static int calculateOverlap( final Domain domain, final List covered_positions ) { + int overlap_count = 0; + for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) { + if ( ( i < covered_positions.size() ) && ( covered_positions.get( i ) == true ) ) { + ++overlap_count; + } + } + return overlap_count; + } + + public static void checkForOutputFileWriteability( final File outfile ) { + final String error = ForesterUtil.isWritableFile( outfile ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, error ); + } + } + + private static SortedSet collectAllDomainsChangedOnSubtree( final PhylogenyNode subtree_root, + final boolean get_gains ) { + final SortedSet domains = new TreeSet(); + for( final PhylogenyNode descendant : PhylogenyMethods.getAllDescendants( subtree_root ) ) { + final BinaryCharacters chars = descendant.getNodeData().getBinaryCharacters(); + if ( get_gains ) { + domains.addAll( chars.getGainedCharacters() ); + } + else { + domains.addAll( chars.getLostCharacters() ); + } + } + return domains; + } + + public static void collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( final CharacterStateMatrix matrix, + final BinaryDomainCombination.DomainCombinationType dc_type, + final List all_binary_domains_combination_gained, + final boolean get_gains ) { + final SortedSet sorted_ids = new TreeSet(); + for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { + sorted_ids.add( matrix.getIdentifier( i ) ); + } + for( final String id : sorted_ids ) { + for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { + if ( ( get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) + || ( !get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.LOSS ) ) ) { + if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) { + all_binary_domains_combination_gained.add( AdjactantDirectedBinaryDomainCombination + .createInstance( matrix.getCharacter( c ) ) ); + } + else if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED ) { + all_binary_domains_combination_gained.add( DirectedBinaryDomainCombination + .createInstance( matrix.getCharacter( c ) ) ); + } + else { 
+ all_binary_domains_combination_gained.add( BasicBinaryDomainCombination.createInstance( matrix + .getCharacter( c ) ) ); + } + } + } + } + } + + private static File createBaseDirForPerNodeDomainFiles( final String base_dir, + final boolean domain_combinations, + final CharacterStateMatrix.GainLossStates state, + final String outfile ) { + File per_node_go_mapped_domain_gain_loss_files_base_dir = new File( new File( outfile ).getParent() + + ForesterUtil.FILE_SEPARATOR + base_dir ); + if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { + per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); + } + if ( domain_combinations ) { + per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + "DC" ); + } + else { + per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + "DOMAINS" ); + } + if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { + per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); + } + if ( state == GainLossStates.GAIN ) { + per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + "GAINS" ); + } + else if ( state == GainLossStates.LOSS ) { + per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + "LOSSES" ); + } + else { + per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + "PRESENT" ); + } + if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { + per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); + } + return per_node_go_mapped_domain_gain_loss_files_base_dir; + } + + public static Map> createDomainIdToGoIdMap( final List pfam_to_go_mappings ) { + final Map> domain_id_to_go_ids_map = new HashMap>( pfam_to_go_mappings + .size() ); + for( final PfamToGoMapping pfam_to_go : pfam_to_go_mappings ) { + if ( !domain_id_to_go_ids_map.containsKey( pfam_to_go.getKey() ) ) { + domain_id_to_go_ids_map.put( pfam_to_go.getKey(), new ArrayList() ); + } + domain_id_to_go_ids_map.get( pfam_to_go.getKey() ).add( pfam_to_go.getValue() ); + } + return domain_id_to_go_ids_map; + } + + public static Map> createDomainIdToSecondaryFeaturesMap( final File secondary_features_map_file ) + throws IOException { + final BasicTable primary_table = BasicTableParser.parse( secondary_features_map_file, "\t" ); + final Map> map = new TreeMap>(); + for( int r = 0; r < primary_table.getNumberOfRows(); ++r ) { + final DomainId domain_id = new DomainId( primary_table.getValue( 0, r ) ); + if ( !map.containsKey( domain_id ) ) { + map.put( domain_id, new HashSet() ); + } + map.get( domain_id ).add( primary_table.getValue( 1, r ) ); + } + return map; + } + + public static Phylogeny createNjTreeBasedOnMatrixToFile( final File nj_tree_outfile, final DistanceMatrix distance ) { + checkForOutputFileWriteability( nj_tree_outfile ); + final NeighborJoining nj = NeighborJoining.createInstance(); + final Phylogeny phylogeny = nj.execute( distance ); + phylogeny.setName( nj_tree_outfile.getName() ); + writePhylogenyToFile( phylogeny, nj_tree_outfile.toString() ); + return phylogeny; + } + + private static SortedSet createSetOfAllBinaryDomainCombinationsPerGenome( final GenomeWideCombinableDomains gwcd ) { + 
final SortedMap cds = gwcd.getAllCombinableDomainsIds(); + final SortedSet binary_combinations = new TreeSet(); + for( final DomainId domain_id : cds.keySet() ) { + final CombinableDomains cd = cds.get( domain_id ); + binary_combinations.addAll( cd.toBinaryDomainCombinations() ); + } + return binary_combinations; + } + + public static void decoratePrintableDomainSimilarities( final SortedSet domain_similarities, + final Detailedness detailedness, + final GoAnnotationOutput go_annotation_output, + final Map go_id_to_term_map, + final GoNameSpace go_namespace_limit ) { + if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || go_id_to_term_map.isEmpty() ) ) { + throw new IllegalArgumentException( "attempt to use a GO namespace limit without a GO id to term map" ); + } + for( final DomainSimilarity domain_similarity : domain_similarities ) { + if ( domain_similarity instanceof PrintableDomainSimilarity ) { + final PrintableDomainSimilarity printable_domain_similarity = ( PrintableDomainSimilarity ) domain_similarity; + printable_domain_similarity.setDetailedness( detailedness ); + printable_domain_similarity.setGoAnnotationOutput( go_annotation_output ); + printable_domain_similarity.setGoIdToTermMap( go_id_to_term_map ); + printable_domain_similarity.setGoNamespaceLimit( go_namespace_limit ); + } + } + } + + public static void executeDomainLengthAnalysis( final String[][] input_file_properties, + final int number_of_genomes, + final DomainLengthsTable domain_lengths_table, + final File outfile ) throws IOException { + final DecimalFormat df = new DecimalFormat( "#.00" ); + checkForOutputFileWriteability( outfile ); + final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); + out.write( "MEAN BASED STATISTICS PER SPECIES" ); + out.write( ForesterUtil.LINE_SEPARATOR ); + out.write( domain_lengths_table.createMeanBasedStatisticsPerSpeciesTable().toString() ); + out.write( ForesterUtil.LINE_SEPARATOR ); + out.write( ForesterUtil.LINE_SEPARATOR ); + final List domain_lengths_list = domain_lengths_table.getDomainLengthsList(); + out.write( "OUTLIER SPECIES PER DOMAIN (Z>=1.5)" ); + out.write( ForesterUtil.LINE_SEPARATOR ); + for( final DomainLengths domain_lengths : domain_lengths_list ) { + final List species_list = domain_lengths.getMeanBasedOutlierSpecies( 1.5 ); + if ( species_list.size() > 0 ) { + out.write( domain_lengths.getDomainId() + "\t" ); + for( final Species species : species_list ) { + out.write( species + "\t" ); + } + out.write( ForesterUtil.LINE_SEPARATOR ); + // DescriptiveStatistics stats_for_domain = domain_lengths + // .calculateMeanBasedStatistics(); + //AsciiHistogram histo = new AsciiHistogram( stats_for_domain ); + //System.out.println( histo.toStringBuffer( 40, '=', 60, 4 ).toString() ); + } + } + out.write( ForesterUtil.LINE_SEPARATOR ); + out.write( ForesterUtil.LINE_SEPARATOR ); + out.write( "OUTLIER SPECIES (Z 1.0)" ); + out.write( ForesterUtil.LINE_SEPARATOR ); + final DescriptiveStatistics stats_for_all_species = domain_lengths_table + .calculateMeanBasedStatisticsForAllSpecies(); + out.write( stats_for_all_species.asSummary() ); + out.write( ForesterUtil.LINE_SEPARATOR ); + final AsciiHistogram histo = new AsciiHistogram( stats_for_all_species ); + out.write( histo.toStringBuffer( 40, '=', 60, 4 ).toString() ); + out.write( ForesterUtil.LINE_SEPARATOR ); + final double population_sd = stats_for_all_species.sampleStandardDeviation(); + final double population_mean = stats_for_all_species.arithmeticMean(); + for( final Species 
species : domain_lengths_table.getSpecies() ) { + final double x = domain_lengths_table.calculateMeanBasedStatisticsForSpecies( species ).arithmeticMean(); + final double z = ( x - population_mean ) / population_sd; + out.write( species + "\t" + z ); + out.write( ForesterUtil.LINE_SEPARATOR ); + } + out.write( ForesterUtil.LINE_SEPARATOR ); + for( final Species species : domain_lengths_table.getSpecies() ) { + final DescriptiveStatistics stats_for_species = domain_lengths_table + .calculateMeanBasedStatisticsForSpecies( species ); + final double x = stats_for_species.arithmeticMean(); + final double z = ( x - population_mean ) / population_sd; + if ( ( z <= -1.0 ) || ( z >= 1.0 ) ) { + out.write( species + "\t" + df.format( z ) + "\t" + stats_for_species.asSummary() ); + out.write( ForesterUtil.LINE_SEPARATOR ); + } + } + out.close(); + // final List histogram_datas = new ArrayList(); + // for( int i = 0; i < number_of_genomes; ++i ) { + // final Species species = new BasicSpecies( input_file_properties[ i ][ 0 ] ); + // histogram_datas + // .add( new HistogramData( species.toString(), domain_lengths_table + // .calculateMeanBasedStatisticsForSpecies( species ) + // .getDataAsDoubleArray(), 5, 600, null, 60 ) ); + // } + // final HistogramsFrame hf = new HistogramsFrame( histogram_datas ); + // hf.setVisible( true ); + System.gc(); + } + + /** + * + * @param all_binary_domains_combination_lost_fitch + * @param consider_directedness_and_adjacency_for_bin_combinations + * @param all_binary_domains_combination_gained if null ignored, otherwise this is to list all binary domain combinations + * which were gained under unweighted (Fitch) parsimony. + */ + public static void executeParsimonyAnalysis( final long random_number_seed_for_fitch_parsimony, + final boolean radomize_fitch_parsimony, + final String outfile_name, + final DomainParsimonyCalculator domain_parsimony, + final Phylogeny phylogeny, + final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map, + final GoNameSpace go_namespace_limit, + final String parameters_str, + final Map>[] domain_id_to_secondary_features_maps, + final SortedSet positive_filter, + final boolean output_binary_domain_combinations_for_graphs, + final List all_binary_domains_combination_gained_fitch, + final List all_binary_domains_combination_lost_fitch, + final BinaryDomainCombination.DomainCombinationType dc_type ) { + final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR; + final String date_time = ForesterUtil.getCurrentDateTime(); + final SortedSet all_pfams_encountered = new TreeSet(); + final SortedSet all_pfams_gained_as_domains = new TreeSet(); + final SortedSet all_pfams_lost_as_domains = new TreeSet(); + final SortedSet all_pfams_gained_as_dom_combinations = new TreeSet(); + final SortedSet all_pfams_lost_as_dom_combinations = new TreeSet(); + writeToNexus( outfile_name, domain_parsimony, phylogeny ); + // DOLLO DOMAINS + // ------------- + Phylogeny local_phylogeny_l = phylogeny.copy(); + if ( ( positive_filter != null ) && ( positive_filter.size() > 0 ) ) { + domain_parsimony.executeDolloParsimonyOnDomainPresence( positive_filter ); + } + else { + domain_parsimony.executeDolloParsimonyOnDomainPresence(); + } + SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name + + surfacing_old.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_DOMAINS, Format.FORESTER ); + SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossCountsMatrix(), outfile_name + + 
surfacing_old.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_DOMAINS, Format.FORESTER ); + SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), + CharacterStateMatrix.GainLossStates.GAIN, + outfile_name + surfacing_old.PARSIMONY_OUTPUT_DOLLO_GAINS_D, + sep, + ForesterUtil.LINE_SEPARATOR, + null ); + SurfacingUtil + .writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), + CharacterStateMatrix.GainLossStates.LOSS, + outfile_name + surfacing_old.PARSIMONY_OUTPUT_DOLLO_LOSSES_D, + sep, + ForesterUtil.LINE_SEPARATOR, + null ); + SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name + + surfacing_old.PARSIMONY_OUTPUT_DOLLO_PRESENT_D, sep, ForesterUtil.LINE_SEPARATOR, null ); + //HTML: + writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, + go_id_to_term_map, + go_namespace_limit, + false, + domain_parsimony.getGainLossMatrix(), + CharacterStateMatrix.GainLossStates.GAIN, + outfile_name + surfacing_old.PARSIMONY_OUTPUT_DOLLO_GAINS_HTML_D, + sep, + ForesterUtil.LINE_SEPARATOR, + "Dollo Parsimony | Gains | Domains", + "+", + domain_id_to_secondary_features_maps, + all_pfams_encountered, + all_pfams_gained_as_domains, + "_dollo_gains_d" ); + writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, + go_id_to_term_map, + go_namespace_limit, + false, + domain_parsimony.getGainLossMatrix(), + CharacterStateMatrix.GainLossStates.LOSS, + outfile_name + surfacing_old.PARSIMONY_OUTPUT_DOLLO_LOSSES_HTML_D, + sep, + ForesterUtil.LINE_SEPARATOR, + "Dollo Parsimony | Losses | Domains", + "-", + domain_id_to_secondary_features_maps, + all_pfams_encountered, + all_pfams_lost_as_domains, + "_dollo_losses_d" ); + writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, + go_id_to_term_map, + go_namespace_limit, + false, + domain_parsimony.getGainLossMatrix(), + null, + outfile_name + surfacing_old.PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D, + sep, + ForesterUtil.LINE_SEPARATOR, + "Dollo Parsimony | Present | Domains", + "", + domain_id_to_secondary_features_maps, + all_pfams_encountered, + null, + "_dollo_present_d" ); + preparePhylogeny( local_phylogeny_l, + domain_parsimony, + date_time, + "Dollo parsimony on domain presence/absence", + "dollo_on_domains_" + outfile_name, + parameters_str ); + SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name + + surfacing_old.DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO ); + try { + writeAllDomainsChangedOnAllSubtrees( local_phylogeny_l, true, outfile_name, "_dollo_all_gains_d" ); + writeAllDomainsChangedOnAllSubtrees( local_phylogeny_l, false, outfile_name, "_dollo_all_losses_d" ); + } + catch ( final IOException e ) { + e.printStackTrace(); + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getLocalizedMessage() ); + } + if ( domain_parsimony.calculateNumberOfBinaryDomainCombination() > 0 ) { + // FITCH DOMAIN COMBINATIONS + // ------------------------- + local_phylogeny_l = phylogeny.copy(); + String randomization = "no"; + if ( radomize_fitch_parsimony ) { + domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( random_number_seed_for_fitch_parsimony ); + randomization = "yes, seed = " + random_number_seed_for_fitch_parsimony; + } + else { + domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( false ); + } + SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name + + surfacing_old.PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER ); + SurfacingUtil.writeMatrixToFile( 
domain_parsimony.getGainLossCountsMatrix(), outfile_name + + surfacing_old.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER ); + SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), + CharacterStateMatrix.GainLossStates.GAIN, + outfile_name + + surfacing_old.PARSIMONY_OUTPUT_FITCH_GAINS_BC, + sep, + ForesterUtil.LINE_SEPARATOR, + null ); + SurfacingUtil + .writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), + CharacterStateMatrix.GainLossStates.LOSS, + outfile_name + surfacing_old.PARSIMONY_OUTPUT_FITCH_LOSSES_BC, + sep, + ForesterUtil.LINE_SEPARATOR, + null ); + SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name + + surfacing_old.PARSIMONY_OUTPUT_FITCH_PRESENT_BC, sep, ForesterUtil.LINE_SEPARATOR, null ); + if ( all_binary_domains_combination_gained_fitch != null ) { + collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( domain_parsimony + .getGainLossMatrix(), dc_type, all_binary_domains_combination_gained_fitch, true ); + } + if ( all_binary_domains_combination_lost_fitch != null ) { + collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( domain_parsimony + .getGainLossMatrix(), dc_type, all_binary_domains_combination_lost_fitch, false ); + } + if ( output_binary_domain_combinations_for_graphs ) { + SurfacingUtil + .writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( domain_parsimony + .getGainLossMatrix(), + null, + outfile_name + + surfacing_old.PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS, + sep, + ForesterUtil.LINE_SEPARATOR, + BinaryDomainCombination.OutputFormat.DOT ); + } + // HTML: + writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, + go_id_to_term_map, + go_namespace_limit, + true, + domain_parsimony.getGainLossMatrix(), + CharacterStateMatrix.GainLossStates.GAIN, + outfile_name + surfacing_old.PARSIMONY_OUTPUT_FITCH_GAINS_HTML_BC, + sep, + ForesterUtil.LINE_SEPARATOR, + "Fitch Parsimony | Gains | Domain Combinations", + "+", + null, + all_pfams_encountered, + all_pfams_gained_as_dom_combinations, + "_fitch_gains_dc" ); + writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, + go_id_to_term_map, + go_namespace_limit, + true, + domain_parsimony.getGainLossMatrix(), + CharacterStateMatrix.GainLossStates.LOSS, + outfile_name + surfacing_old.PARSIMONY_OUTPUT_FITCH_LOSSES_HTML_BC, + sep, + ForesterUtil.LINE_SEPARATOR, + "Fitch Parsimony | Losses | Domain Combinations", + "-", + null, + all_pfams_encountered, + all_pfams_lost_as_dom_combinations, + "_fitch_losses_dc" ); + writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, + go_id_to_term_map, + go_namespace_limit, + true, + domain_parsimony.getGainLossMatrix(), + null, + outfile_name + surfacing_old.PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC, + sep, + ForesterUtil.LINE_SEPARATOR, + "Fitch Parsimony | Present | Domain Combinations", + "", + null, + all_pfams_encountered, + null, + "_fitch_present_dc" ); + writeAllEncounteredPfamsToFile( domain_id_to_go_ids_map, + go_id_to_term_map, + outfile_name, + all_pfams_encountered ); + writePfamsToFile( outfile_name + surfacing_old.ALL_PFAMS_GAINED_AS_DOMAINS_SUFFIX, + all_pfams_gained_as_domains ); + writePfamsToFile( outfile_name + surfacing_old.ALL_PFAMS_LOST_AS_DOMAINS_SUFFIX, all_pfams_lost_as_domains ); + writePfamsToFile( outfile_name + surfacing_old.ALL_PFAMS_GAINED_AS_DC_SUFFIX, + all_pfams_gained_as_dom_combinations ); + writePfamsToFile( outfile_name + 
surfacing_old.ALL_PFAMS_LOST_AS_DC_SUFFIX, + all_pfams_lost_as_dom_combinations ); + preparePhylogeny( local_phylogeny_l, + domain_parsimony, + date_time, + "Fitch parsimony on binary domain combination presence/absence randomization: " + + randomization, + "fitch_on_binary_domain_combinations_" + outfile_name, + parameters_str ); + SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name + + surfacing_old.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH ); + } + } + + public static void executeParsimonyAnalysisForSecondaryFeatures( final String outfile_name, + final DomainParsimonyCalculator secondary_features_parsimony, + final Phylogeny phylogeny, + final String parameters_str, + final Map mapping_results_map ) { + final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR; + final String date_time = ForesterUtil.getCurrentDateTime(); + System.out.println(); + writeToNexus( outfile_name + surfacing_old.NEXUS_SECONDARY_FEATURES, secondary_features_parsimony + .createMatrixOfSecondaryFeaturePresenceOrAbsence( null ), phylogeny ); + final Phylogeny local_phylogeny_copy = phylogeny.copy(); + secondary_features_parsimony.executeDolloParsimonyOnSecondaryFeatures( mapping_results_map ); + SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossMatrix(), outfile_name + + surfacing_old.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER ); + SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossCountsMatrix(), outfile_name + + surfacing_old.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER ); + SurfacingUtil + .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(), + CharacterStateMatrix.GainLossStates.GAIN, + outfile_name + + surfacing_old.PARSIMONY_OUTPUT_DOLLO_GAINS_SECONDARY_FEATURES, + sep, + ForesterUtil.LINE_SEPARATOR, + null ); + SurfacingUtil + .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(), + CharacterStateMatrix.GainLossStates.LOSS, + outfile_name + + surfacing_old.PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES, + sep, + ForesterUtil.LINE_SEPARATOR, + null ); + SurfacingUtil + .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(), + null, + outfile_name + + surfacing_old.PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES, + sep, + ForesterUtil.LINE_SEPARATOR, + null ); + preparePhylogeny( local_phylogeny_copy, + secondary_features_parsimony, + date_time, + "Dollo parsimony on secondary feature presence/absence", + "dollo_on_secondary_features_" + outfile_name, + parameters_str ); + SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name + + surfacing_old.SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO ); + } + + public static void extractProteinNames( final List proteins, + final List query_domain_ids_nc_order, + final Writer out, + final String separator ) throws IOException { + for( final Protein protein : proteins ) { + if ( protein.contains( query_domain_ids_nc_order, true ) ) { + out.write( protein.getSpecies().getSpeciesId() ); + out.write( separator ); + out.write( protein.getProteinId().getId() ); + out.write( separator ); + out.write( "[" ); + final Set visited_domain_ids = new HashSet(); + boolean first = true; + for( final Domain domain : protein.getProteinDomains() ) { + if ( !visited_domain_ids.contains( domain.getDomainId() ) ) { + visited_domain_ids.add( domain.getDomainId() ); + if ( first ) { + first 
= false; + } + else { + out.write( " " ); + } + out.write( domain.getDomainId().getId() ); + out.write( " {" ); + out.write( "" + domain.getTotalCount() ); + out.write( "}" ); + } + } + out.write( "]" ); + out.write( separator ); + if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription() + .equals( SurfacingConstants.NONE ) ) ) { + out.write( protein.getDescription() ); + } + out.write( separator ); + if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession() + .equals( SurfacingConstants.NONE ) ) ) { + out.write( protein.getAccession() ); + } + out.write( SurfacingConstants.NL ); + } + } + out.flush(); + } + + public static void extractProteinNames( final SortedMap> protein_lists_per_species, + final DomainId domain_id, + final Writer out, + final String separator ) throws IOException { + for( final Species species : protein_lists_per_species.keySet() ) { + for( final Protein protein : protein_lists_per_species.get( species ) ) { + final List domains = protein.getProteinDomains( domain_id ); + if ( domains.size() > 0 ) { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( final Domain domain : domains ) { + stats.addValue( domain.getPerSequenceEvalue() ); + } + out.write( protein.getSpecies().getSpeciesId() ); + out.write( separator ); + out.write( protein.getProteinId().getId() ); + out.write( separator ); + out.write( "[" + FORMATTER.format( stats.median() ) + "]" ); + out.write( separator ); + if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription() + .equals( SurfacingConstants.NONE ) ) ) { + out.write( protein.getDescription() ); + } + out.write( separator ); + if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession() + .equals( SurfacingConstants.NONE ) ) ) { + out.write( protein.getAccession() ); + } + out.write( SurfacingConstants.NL ); + } + } + } + out.flush(); + } + + public static SortedSet getAllDomainIds( final List gwcd_list ) { + final SortedSet all_domains_ids = new TreeSet(); + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + final Set all_domains = gwcd.getAllDomainIds(); + // for( final Domain domain : all_domains ) { + all_domains_ids.addAll( all_domains ); + // } + } + return all_domains_ids; + } + + public static SortedMap getDomainCounts( final List protein_domain_collections ) { + final SortedMap map = new TreeMap(); + for( final Protein protein_domain_collection : protein_domain_collections ) { + for( final Object name : protein_domain_collection.getProteinDomains() ) { + final BasicDomain protein_domain = ( BasicDomain ) name; + final String id = protein_domain.getDomainId().getId(); + if ( map.containsKey( id ) ) { + map.put( id, map.get( id ) + 1 ); + } + else { + map.put( id, 1 ); + } + } + } + return map; + } + + public static int getNumberOfNodesLackingName( final Phylogeny p, final StringBuilder names ) { + final PhylogenyNodeIterator it = p.iteratorPostorder(); + int c = 0; + while ( it.hasNext() ) { + final PhylogenyNode n = it.next(); + if ( ForesterUtil.isEmpty( n.getName() ) + && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy() + .getScientificName() ) ) ) { + if ( n.getParent() != null ) { + names.append( " " ); + names.append( n.getParent().getName() ); + } + ++c; + } + } + return c; + } + + /** + * Returns true if Domain domain falls in an uninterrupted stretch of + * covered positions.
+ * + * @param domain + * @param covered_positions + * @return + */ + public static boolean isEngulfed( final Domain domain, final List covered_positions ) { + for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) { + if ( ( i >= covered_positions.size() ) || ( covered_positions.get( i ) != true ) ) { + return false; + } + } + return true; + } + + public static void preparePhylogeny( final Phylogeny p, + final DomainParsimonyCalculator domain_parsimony, + final String date_time, + final String method, + final String name, + final String parameters_str ) { + domain_parsimony.decoratePhylogenyWithDomains( p ); + final StringBuilder desc = new StringBuilder(); + desc.append( "[Method: " + method + "] [Date: " + date_time + "] " ); + desc.append( "[Cost: " + domain_parsimony.getCost() + "] " ); + desc.append( "[Gains: " + domain_parsimony.getTotalGains() + "] " ); + desc.append( "[Losses: " + domain_parsimony.getTotalLosses() + "] " ); + desc.append( "[Unchanged: " + domain_parsimony.getTotalUnchanged() + "] " ); + desc.append( "[Parameters: " + parameters_str + "]" ); + p.setName( name ); + p.setDescription( desc.toString() ); + p.setConfidence( new Confidence( domain_parsimony.getCost(), "parsimony" ) ); + p.setRerootable( false ); + p.setRooted( true ); + } + + /** + * + * Example regarding engulfment: ------------0.1 ----------0.2 --0.3 => + * domain with 0.3 is ignored + * + * -----------0.1 ----------0.2 --0.3 => domain with 0.3 is ignored + * + * + * ------------0.1 ----------0.3 --0.2 => domains with 0.3 and 0.2 are _not_ + * ignored + * + * @param max_allowed_overlap + * maximal allowed overlap (inclusive) to be still considered not + * overlapping (zero or negative value to allow any overlap) + * @param remove_engulfed_domains + * to remove domains which are completely engulfed by coverage of + * domains with better support + * @param protein + * @return + */ + public static Protein removeOverlappingDomains( final int max_allowed_overlap, + final boolean remove_engulfed_domains, + final Protein protein ) { + final Protein pruned_protein = new BasicProtein( protein.getProteinId().getId(), protein.getSpecies() + .getSpeciesId() ); + final List sorted = SurfacingUtil.sortDomainsWithAscendingConfidenceValues( protein ); + final List covered_positions = new ArrayList(); + for( final Domain domain : sorted ) { + if ( ( ( max_allowed_overlap < 0 ) || ( SurfacingUtil.calculateOverlap( domain, covered_positions ) <= max_allowed_overlap ) ) + && ( !remove_engulfed_domains || !isEngulfed( domain, covered_positions ) ) ) { + final int covered_positions_size = covered_positions.size(); + for( int i = covered_positions_size; i < domain.getFrom(); ++i ) { + covered_positions.add( false ); + } + final int new_covered_positions_size = covered_positions.size(); + for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) { + if ( i < new_covered_positions_size ) { + covered_positions.set( i, true ); + } + else { + covered_positions.add( true ); + } + } + pruned_protein.addProteinDomain( domain ); + } + } + return pruned_protein; + } + + static List sortDomainsWithAscendingConfidenceValues( final Protein protein ) { + final List domains = new ArrayList(); + for( final Domain d : protein.getProteinDomains() ) { + domains.add( d ); + } + Collections.sort( domains, SurfacingUtil.ASCENDING_CONFIDENCE_VALUE_ORDER ); + return domains; + } + + public static void writeAllDomainsChangedOnAllSubtrees( final Phylogeny p, + final boolean get_gains, + final String outdir, + final String suffix_for_filename ) 
throws IOException { + CharacterStateMatrix.GainLossStates state = CharacterStateMatrix.GainLossStates.GAIN; + if ( !get_gains ) { + state = CharacterStateMatrix.GainLossStates.LOSS; + } + final File base_dir = createBaseDirForPerNodeDomainFiles( surfacing_old.BASE_DIRECTORY_PER_SUBTREE_DOMAIN_GAIN_LOSS_FILES, + false, + state, + outdir ); + for( final PhylogenyNodeIterator it = p.iteratorPostorder(); it.hasNext(); ) { + final PhylogenyNode node = it.next(); + if ( !node.isExternal() ) { + final SortedSet domains = collectAllDomainsChangedOnSubtree( node, get_gains ); + if ( domains.size() > 0 ) { + final Writer writer = ForesterUtil.createBufferedWriter( base_dir + ForesterUtil.FILE_SEPARATOR + + node.getName() + suffix_for_filename ); + for( final String domain : domains ) { + writer.write( domain ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + } + writer.close(); + } + } + } + } + + private static void writeAllEncounteredPfamsToFile( final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map, + final String outfile_name, + final SortedSet all_pfams_encountered ) { + final File all_pfams_encountered_file = new File( outfile_name + surfacing_old.ALL_PFAMS_ENCOUNTERED_SUFFIX ); + final File all_pfams_encountered_with_go_annotation_file = new File( outfile_name + + surfacing_old.ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX ); + final File encountered_pfams_summary_file = new File( outfile_name + + surfacing_old.ENCOUNTERED_PFAMS_SUMMARY_SUFFIX ); + int biological_process_counter = 0; + int cellular_component_counter = 0; + int molecular_function_counter = 0; + int pfams_with_mappings_counter = 0; + int pfams_without_mappings_counter = 0; + int pfams_without_mappings_to_bp_or_mf_counter = 0; + int pfams_with_mappings_to_bp_or_mf_counter = 0; + try { + final Writer all_pfams_encountered_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_file ) ); + final Writer all_pfams_encountered_with_go_annotation_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_with_go_annotation_file ) ); + final Writer summary_writer = new BufferedWriter( new FileWriter( encountered_pfams_summary_file ) ); + summary_writer.write( "# Pfam to GO mapping summary" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Actual summary is at the end of this file." 
); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Encountered Pfams without a GO mapping:" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + for( final String pfam : all_pfams_encountered ) { + all_pfams_encountered_writer.write( pfam ); + all_pfams_encountered_writer.write( ForesterUtil.LINE_SEPARATOR ); + final DomainId domain_id = new DomainId( pfam ); + if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) { + ++pfams_with_mappings_counter; + all_pfams_encountered_with_go_annotation_writer.write( pfam ); + all_pfams_encountered_with_go_annotation_writer.write( ForesterUtil.LINE_SEPARATOR ); + final List go_ids = domain_id_to_go_ids_map.get( domain_id ); + boolean maps_to_bp = false; + boolean maps_to_cc = false; + boolean maps_to_mf = false; + for( final GoId go_id : go_ids ) { + final GoTerm go_term = go_id_to_term_map.get( go_id ); + if ( go_term.getGoNameSpace().isBiologicalProcess() ) { + maps_to_bp = true; + } + else if ( go_term.getGoNameSpace().isCellularComponent() ) { + maps_to_cc = true; + } + else if ( go_term.getGoNameSpace().isMolecularFunction() ) { + maps_to_mf = true; + } + } + if ( maps_to_bp ) { + ++biological_process_counter; + } + if ( maps_to_cc ) { + ++cellular_component_counter; + } + if ( maps_to_mf ) { + ++molecular_function_counter; + } + if ( maps_to_bp || maps_to_mf ) { + ++pfams_with_mappings_to_bp_or_mf_counter; + } + else { + ++pfams_without_mappings_to_bp_or_mf_counter; + } + } + else { + ++pfams_without_mappings_to_bp_or_mf_counter; + ++pfams_without_mappings_counter; + summary_writer.write( pfam ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + } + } + all_pfams_encountered_writer.close(); + all_pfams_encountered_with_go_annotation_writer.close(); + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote all [" + all_pfams_encountered.size() + + "] encountered Pfams to: \"" + all_pfams_encountered_file + "\"" ); + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote all [" + pfams_with_mappings_counter + + "] encountered Pfams with GO mappings to: \"" + all_pfams_encountered_with_go_annotation_file + + "\"" ); + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote summary (including all [" + + pfams_without_mappings_counter + "] encountered Pfams without GO mappings) to: \"" + + encountered_pfams_summary_file + "\"" ); + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Sum of Pfams encountered : " + + all_pfams_encountered.size() ); + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Pfams without a mapping : " + + pfams_without_mappings_counter + " [" + + ( 100 * pfams_without_mappings_counter / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Pfams without mapping to proc. or func. : " + + pfams_without_mappings_to_bp_or_mf_counter + " [" + + ( 100 * pfams_without_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Pfams with a mapping : " + + pfams_with_mappings_counter + " [" + + ( 100 * pfams_with_mappings_counter / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Pfams with a mapping to proc. or func. 
: " + + pfams_with_mappings_to_bp_or_mf_counter + " [" + + ( 100 * pfams_with_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Pfams with mapping to biological process: " + + biological_process_counter + " [" + + ( 100 * biological_process_counter / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Pfams with mapping to molecular function: " + + molecular_function_counter + " [" + + ( 100 * molecular_function_counter / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Pfams with mapping to cellular component: " + + cellular_component_counter + " [" + + ( 100 * cellular_component_counter / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Sum of Pfams encountered : " + all_pfams_encountered.size() ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams without a mapping : " + pfams_without_mappings_counter + + " [" + ( 100 * pfams_without_mappings_counter / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams without mapping to proc. or func. : " + + pfams_without_mappings_to_bp_or_mf_counter + " [" + + ( 100 * pfams_without_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams with a mapping : " + pfams_with_mappings_counter + " [" + + ( 100 * pfams_with_mappings_counter / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams with a mapping to proc. or func. 
: " + + pfams_with_mappings_to_bp_or_mf_counter + " [" + + ( 100 * pfams_with_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams with mapping to biological process: " + biological_process_counter + " [" + + ( 100 * biological_process_counter / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams with mapping to molecular function: " + molecular_function_counter + " [" + + ( 100 * molecular_function_counter / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams with mapping to cellular component: " + cellular_component_counter + " [" + + ( 100 * cellular_component_counter / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.close(); + } + catch ( final IOException e ) { + ForesterUtil.printWarningMessage( surfacing_old.PRG_NAME, "Failure to write: " + e ); + } + } + + public static void writeBinaryDomainCombinationsFileForGraphAnalysis( final String[][] input_file_properties, + final File output_dir, + final GenomeWideCombinableDomains gwcd, + final int i, + final GenomeWideCombinableDomainsSortOrder dc_sort_order ) { + File dc_outfile_dot = new File( input_file_properties[ i ][ 0 ] + + surfacing_old.DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS ); + if ( output_dir != null ) { + dc_outfile_dot = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile_dot ); + } + checkForOutputFileWriteability( dc_outfile_dot ); + final SortedSet binary_combinations = createSetOfAllBinaryDomainCombinationsPerGenome( gwcd ); + try { + final BufferedWriter out_dot = new BufferedWriter( new FileWriter( dc_outfile_dot ) ); + for( final BinaryDomainCombination bdc : binary_combinations ) { + out_dot.write( bdc.toGraphDescribingLanguage( BinaryDomainCombination.OutputFormat.DOT, null, null ) + .toString() ); + out_dot.write( SurfacingConstants.NL ); + } + out_dot.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + } + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote binary domain combination for \"" + + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", " + + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile_dot + "\"" ); + } + + /* + * species | protein id | n-terminal domain | c-terminal domain | n-terminal domain per domain E-value | c-terminal domain per domain E-value + * + * + */ + static public StringBuffer proteinToDomainCombinations( final Protein protein, + final String protein_id, + final String separator ) { + final StringBuffer sb = new StringBuffer(); + if ( protein.getSpecies() == null ) { + throw new IllegalArgumentException( "species must not be null" ); + } + if ( ForesterUtil.isEmpty( protein.getSpecies().getSpeciesId() ) ) { + throw new IllegalArgumentException( "species id must not be empty" ); + } + final List domains = protein.getProteinDomains(); + if ( domains.size() > 1 ) { + final Map counts = new HashMap(); + for( final Domain domain : domains ) { + final String id = domain.getDomainId().getId(); + if ( counts.containsKey( id ) ) { + counts.put( id, counts.get( id ) + 1 ); + } + else { + counts.put( id, 1 ); + } + } + for( int i = 1; i < domains.size(); ++i ) { + for( int j = 0; j < i; ++j ) { + Domain domain_n = domains.get( i ); + 
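+                    // The conditional below orders the pair so that domain_n refers to the
+                    // more N-terminal domain of the two (smaller start position), matching
+                    // the column layout documented in the comment above this method.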
Domain domain_c = domains.get( j ); + if ( domain_n.getFrom() > domain_c.getFrom() ) { + domain_n = domains.get( j ); + domain_c = domains.get( i ); + } + sb.append( protein.getSpecies() ); + sb.append( separator ); + sb.append( protein_id ); + sb.append( separator ); + sb.append( domain_n.getDomainId().getId() ); + sb.append( separator ); + sb.append( domain_c.getDomainId().getId() ); + sb.append( separator ); + sb.append( domain_n.getPerDomainEvalue() ); + sb.append( separator ); + sb.append( domain_c.getPerDomainEvalue() ); + sb.append( separator ); + sb.append( counts.get( domain_n.getDomainId().getId() ) ); + sb.append( separator ); + sb.append( counts.get( domain_c.getDomainId().getId() ) ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + } + } + } + else if ( domains.size() == 1 ) { + sb.append( protein.getSpecies() ); + sb.append( separator ); + sb.append( protein_id ); + sb.append( separator ); + sb.append( domains.get( 0 ).getDomainId().getId() ); + sb.append( separator ); + sb.append( separator ); + sb.append( domains.get( 0 ).getPerDomainEvalue() ); + sb.append( separator ); + sb.append( separator ); + sb.append( 1 ); + sb.append( separator ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + } + else { + sb.append( protein.getSpecies() ); + sb.append( separator ); + sb.append( protein_id ); + sb.append( separator ); + sb.append( separator ); + sb.append( separator ); + sb.append( separator ); + sb.append( separator ); + sb.append( separator ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + } + return sb; + } + + public static void writeBinaryStatesMatrixAsListToFile( final CharacterStateMatrix matrix, + final CharacterStateMatrix.GainLossStates state, + final String filename, + final String indentifier_characters_separator, + final String character_separator, + final Map descriptions ) { + final File outfile = new File( filename ); + checkForOutputFileWriteability( outfile ); + final SortedSet sorted_ids = new TreeSet(); + for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { + sorted_ids.add( matrix.getIdentifier( i ) ); + } + try { + final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); + for( final String id : sorted_ids ) { + out.write( indentifier_characters_separator ); + out.write( "#" + id ); + out.write( indentifier_characters_separator ); + for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { + // Not nice: + // using null to indicate either UNCHANGED_PRESENT or GAIN. 
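+                    // In other words: with a non-null state only exact matches are listed,
+                    // whereas a null state lists every character that is present at this
+                    // node, i.e. whose state is either GAIN or UNCHANGED_PRESENT.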
+ if ( ( matrix.getState( id, c ) == state ) + || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) || ( matrix + .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) { + out.write( matrix.getCharacter( c ) ); + if ( ( descriptions != null ) && !descriptions.isEmpty() + && descriptions.containsKey( matrix.getCharacter( c ) ) ) { + out.write( "\t" ); + out.write( descriptions.get( matrix.getCharacter( c ) ) ); + } + out.write( character_separator ); + } + } + } + out.flush(); + out.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + } + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote characters list: \"" + filename + "\"" ); + } + + public static void writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( final CharacterStateMatrix matrix, + final CharacterStateMatrix.GainLossStates state, + final String filename, + final String indentifier_characters_separator, + final String character_separator, + final BinaryDomainCombination.OutputFormat bc_output_format ) { + final File outfile = new File( filename ); + checkForOutputFileWriteability( outfile ); + final SortedSet sorted_ids = new TreeSet(); + for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { + sorted_ids.add( matrix.getIdentifier( i ) ); + } + try { + final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); + for( final String id : sorted_ids ) { + out.write( indentifier_characters_separator ); + out.write( "#" + id ); + out.write( indentifier_characters_separator ); + for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { + // Not nice: + // using null to indicate either UNCHANGED_PRESENT or GAIN. 
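+                    // Same state filtering as above; in addition, each matching character is
+                    // parsed back into a BinaryDomainCombination and written in the requested
+                    // graph-describing output format (e.g. DOT) rather than as a plain name.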
+ if ( ( matrix.getState( id, c ) == state ) + || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) || ( matrix + .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) { + BinaryDomainCombination bdc = null; + try { + bdc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( c ) ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getLocalizedMessage() ); + } + out.write( bdc.toGraphDescribingLanguage( bc_output_format, null, null ).toString() ); + out.write( character_separator ); + } + } + } + out.flush(); + out.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + } + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote characters list: \"" + filename + "\"" ); + } + + public static void writeBinaryStatesMatrixToList( final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map, + final GoNameSpace go_namespace_limit, + final boolean domain_combinations, + final CharacterStateMatrix matrix, + final CharacterStateMatrix.GainLossStates state, + final String filename, + final String indentifier_characters_separator, + final String character_separator, + final String title_for_html, + final String prefix_for_html, + final Map>[] domain_id_to_secondary_features_maps, + final SortedSet all_pfams_encountered, + final SortedSet pfams_gained_or_lost, + final String suffix_for_per_node_events_file ) { + if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) { + throw new IllegalArgumentException( "attempt to use GO namespace limit without a GO-id to term map" ); + } + else if ( ( ( domain_id_to_go_ids_map == null ) || ( domain_id_to_go_ids_map.size() < 1 ) ) ) { + throw new IllegalArgumentException( "attempt to output detailed HTML without a Pfam to GO map" ); + } + else if ( ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) { + throw new IllegalArgumentException( "attempt to output detailed HTML without a GO-id to term map" ); + } + final File outfile = new File( filename ); + checkForOutputFileWriteability( outfile ); + final SortedSet sorted_ids = new TreeSet(); + for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { + sorted_ids.add( matrix.getIdentifier( i ) ); + } + try { + final Writer out = new BufferedWriter( new FileWriter( outfile ) ); + final File per_node_go_mapped_domain_gain_loss_files_base_dir = createBaseDirForPerNodeDomainFiles( surfacing_old.BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES, + domain_combinations, + state, + filename ); + Writer per_node_go_mapped_domain_gain_loss_outfile_writer = null; + File per_node_go_mapped_domain_gain_loss_outfile = null; + int per_node_counter = 0; + out.write( "" ); + out.write( SurfacingConstants.NL ); + addHtmlHead( out, title_for_html ); + out.write( SurfacingConstants.NL ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + out.write( "

    " ); + out.write( SurfacingConstants.NL ); + out.write( title_for_html ); + out.write( SurfacingConstants.NL ); + out.write( "

    " ); + out.write( SurfacingConstants.NL ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + for( final String id : sorted_ids ) { + final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( id ); + if ( matcher.matches() ) { + continue; + } + out.write( "" ); + out.write( "" ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + } + out.write( "
    " ); + out.write( "" + id + "" ); + out.write( "
    " ); + out.write( SurfacingConstants.NL ); + for( final String id : sorted_ids ) { + final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( id ); + if ( matcher.matches() ) { + continue; + } + out.write( SurfacingConstants.NL ); + out.write( "

    " ); + out.write( "" + id + "" ); + writeTaxonomyLinks( out, id ); + out.write( "

    " ); + out.write( SurfacingConstants.NL ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + out.write( "" ); + out.write( "" ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + per_node_counter = 0; + if ( matrix.getNumberOfCharacters() > 0 ) { + per_node_go_mapped_domain_gain_loss_outfile = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + id + suffix_for_per_node_events_file ); + SurfacingUtil.checkForOutputFileWriteability( per_node_go_mapped_domain_gain_loss_outfile ); + per_node_go_mapped_domain_gain_loss_outfile_writer = ForesterUtil + .createBufferedWriter( per_node_go_mapped_domain_gain_loss_outfile ); + } + else { + per_node_go_mapped_domain_gain_loss_outfile = null; + per_node_go_mapped_domain_gain_loss_outfile_writer = null; + } + for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { + // Not nice: + // using null to indicate either UNCHANGED_PRESENT or GAIN. + if ( ( matrix.getState( id, c ) == state ) + || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) || ( matrix + .getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) ) ) { + final String character = matrix.getCharacter( c ); + String domain_0 = ""; + String domain_1 = ""; + if ( character.indexOf( BinaryDomainCombination.SEPARATOR ) > 0 ) { + final String[] s = character.split( BinaryDomainCombination.SEPARATOR ); + if ( s.length != 2 ) { + throw new AssertionError( "this should not have happened: unexpected format for domain combination: [" + + character + "]" ); + } + domain_0 = s[ 0 ]; + domain_1 = s[ 1 ]; + } + else { + domain_0 = character; + } + writeDomainData( domain_id_to_go_ids_map, + go_id_to_term_map, + go_namespace_limit, + out, + domain_0, + domain_1, + prefix_for_html, + character_separator, + domain_id_to_secondary_features_maps, + null ); + all_pfams_encountered.add( domain_0 ); + if ( pfams_gained_or_lost != null ) { + pfams_gained_or_lost.add( domain_0 ); + } + if ( !ForesterUtil.isEmpty( domain_1 ) ) { + all_pfams_encountered.add( domain_1 ); + if ( pfams_gained_or_lost != null ) { + pfams_gained_or_lost.add( domain_1 ); + } + } + if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) { + writeDomainsToIndividualFilePerTreeNode( per_node_go_mapped_domain_gain_loss_outfile_writer, + domain_0, + domain_1 ); + per_node_counter++; + } + } + } + if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) { + per_node_go_mapped_domain_gain_loss_outfile_writer.close(); + if ( per_node_counter < 1 ) { + per_node_go_mapped_domain_gain_loss_outfile.delete(); + } + per_node_counter = 0; + } + out.write( "
    " ); + out.write( "Pfam domain(s)" ); + out.write( "" ); + out.write( "GO term acc" ); + out.write( "" ); + out.write( "GO term" ); + out.write( "" ); + out.write( "GO namespace" ); + out.write( "
    " ); + out.write( SurfacingConstants.NL ); + out.write( "
    " ); + out.write( SurfacingConstants.NL ); + } // for( final String id : sorted_ids ) { + out.write( "" ); + out.write( SurfacingConstants.NL ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + out.flush(); + out.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + } + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote characters detailed HTML list: \"" + filename + + "\"" ); + } + + public static void writeBinaryStatesMatrixToListORIGIG( final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map, + final GoNameSpace go_namespace_limit, + final boolean domain_combinations, + final CharacterStateMatrix matrix, + final CharacterStateMatrix.GainLossStates state, + final String filename, + final String indentifier_characters_separator, + final String character_separator, + final String title_for_html, + final String prefix_for_html, + final Map>[] domain_id_to_secondary_features_maps, + final SortedSet all_pfams_encountered, + final SortedSet pfams_gained_or_lost, + final String suffix_for_per_node_events_file ) { + if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) { + throw new IllegalArgumentException( "attempt to use GO namespace limit without a GO-id to term map" ); + } + else if ( ( ( domain_id_to_go_ids_map == null ) || ( domain_id_to_go_ids_map.size() < 1 ) ) ) { + throw new IllegalArgumentException( "attempt to output detailed HTML without a Pfam to GO map" ); + } + else if ( ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) { + throw new IllegalArgumentException( "attempt to output detailed HTML without a GO-id to term map" ); + } + final File outfile = new File( filename ); + checkForOutputFileWriteability( outfile ); + final SortedSet sorted_ids = new TreeSet(); + for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { + sorted_ids.add( matrix.getIdentifier( i ) ); + } + try { + final Writer out = new BufferedWriter( new FileWriter( outfile ) ); + final File per_node_go_mapped_domain_gain_loss_files_base_dir = createBaseDirForPerNodeDomainFiles( surfacing_old.BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES, + domain_combinations, + state, + filename ); + Writer per_node_go_mapped_domain_gain_loss_outfile_writer = null; + File per_node_go_mapped_domain_gain_loss_outfile = null; + int per_node_counter = 0; + out.write( "" ); + out.write( SurfacingConstants.NL ); + addHtmlHead( out, title_for_html ); + out.write( SurfacingConstants.NL ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + out.write( "

    " ); + out.write( SurfacingConstants.NL ); + out.write( title_for_html ); + out.write( SurfacingConstants.NL ); + out.write( "

    " ); + out.write( SurfacingConstants.NL ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + for( final String id : sorted_ids ) { + out.write( "" ); + out.write( "" ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + } + out.write( "
    " ); + out.write( "" + id + "" ); + writeTaxonomyLinks( out, id ); + out.write( "
    " ); + out.write( SurfacingConstants.NL ); + for( final String id : sorted_ids ) { + out.write( SurfacingConstants.NL ); + out.write( "

    " ); + out.write( "" + id + "" ); + writeTaxonomyLinks( out, id ); + out.write( "

    " ); + out.write( SurfacingConstants.NL ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + out.write( "" ); + out.write( "" ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + per_node_counter = 0; + if ( matrix.getNumberOfCharacters() > 0 ) { + per_node_go_mapped_domain_gain_loss_outfile = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + id + suffix_for_per_node_events_file ); + SurfacingUtil.checkForOutputFileWriteability( per_node_go_mapped_domain_gain_loss_outfile ); + per_node_go_mapped_domain_gain_loss_outfile_writer = ForesterUtil + .createBufferedWriter( per_node_go_mapped_domain_gain_loss_outfile ); + } + else { + per_node_go_mapped_domain_gain_loss_outfile = null; + per_node_go_mapped_domain_gain_loss_outfile_writer = null; + } + for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { + // Not nice: + // using null to indicate either UNCHANGED_PRESENT or GAIN. + if ( ( matrix.getState( id, c ) == state ) + || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) || ( matrix + .getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) ) ) { + final String character = matrix.getCharacter( c ); + String domain_0 = ""; + String domain_1 = ""; + if ( character.indexOf( BinaryDomainCombination.SEPARATOR ) > 0 ) { + final String[] s = character.split( BinaryDomainCombination.SEPARATOR ); + if ( s.length != 2 ) { + throw new AssertionError( "this should not have happened: unexpected format for domain combination: [" + + character + "]" ); + } + domain_0 = s[ 0 ]; + domain_1 = s[ 1 ]; + } + else { + domain_0 = character; + } + writeDomainData( domain_id_to_go_ids_map, + go_id_to_term_map, + go_namespace_limit, + out, + domain_0, + domain_1, + prefix_for_html, + character_separator, + domain_id_to_secondary_features_maps, + null ); + all_pfams_encountered.add( domain_0 ); + if ( pfams_gained_or_lost != null ) { + pfams_gained_or_lost.add( domain_0 ); + } + if ( !ForesterUtil.isEmpty( domain_1 ) ) { + all_pfams_encountered.add( domain_1 ); + if ( pfams_gained_or_lost != null ) { + pfams_gained_or_lost.add( domain_1 ); + } + } + if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) { + writeDomainsToIndividualFilePerTreeNode( per_node_go_mapped_domain_gain_loss_outfile_writer, + domain_0, + domain_1 ); + per_node_counter++; + } + } + } + if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) { + per_node_go_mapped_domain_gain_loss_outfile_writer.close(); + if ( per_node_counter < 1 ) { + per_node_go_mapped_domain_gain_loss_outfile.delete(); + } + per_node_counter = 0; + } + out.write( "
    " ); + out.write( "Pfam domain(s)" ); + out.write( "" ); + out.write( "GO term acc" ); + out.write( "" ); + out.write( "GO term" ); + out.write( "" ); + out.write( "Penultimate GO term" ); + out.write( "" ); + out.write( "GO namespace" ); + out.write( "
    " ); + out.write( SurfacingConstants.NL ); + out.write( "
    " ); + out.write( SurfacingConstants.NL ); + } // for( final String id : sorted_ids ) { + out.write( "" ); + out.write( SurfacingConstants.NL ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + out.flush(); + out.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + } + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote characters detailed HTML list: \"" + filename + + "\"" ); + } + + public static void writeDomainCombinationsCountsFile( final String[][] input_file_properties, + final File output_dir, + final Writer per_genome_domain_promiscuity_statistics_writer, + final GenomeWideCombinableDomains gwcd, + final int i, + final GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder dc_sort_order ) { + File dc_outfile = new File( input_file_properties[ i ][ 0 ] + + surfacing_old.DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX ); + if ( output_dir != null ) { + dc_outfile = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile ); + } + checkForOutputFileWriteability( dc_outfile ); + try { + final BufferedWriter out = new BufferedWriter( new FileWriter( dc_outfile ) ); + out.write( gwcd.toStringBuilder( dc_sort_order ).toString() ); + out.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + } + final DescriptiveStatistics stats = gwcd.getPerGenomeDomainPromiscuityStatistics(); + try { + per_genome_domain_promiscuity_statistics_writer.write( input_file_properties[ i ][ 0 ] + "\t" ); + per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.arithmeticMean() ) + "\t" ); + if ( stats.getN() < 2 ) { + per_genome_domain_promiscuity_statistics_writer.write( "n/a" + "\t" ); + } + else { + per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats + .sampleStandardDeviation() ) + + "\t" ); + } + per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.median() ) + "\t" ); + per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMin() + "\t" ); + per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMax() + "\t" ); + per_genome_domain_promiscuity_statistics_writer.write( stats.getN() + "\t" ); + final SortedSet mpds = gwcd.getMostPromiscuosDomain(); + for( final DomainId mpd : mpds ) { + per_genome_domain_promiscuity_statistics_writer.write( mpd.getId() + " " ); + } + per_genome_domain_promiscuity_statistics_writer.write( ForesterUtil.LINE_SEPARATOR ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + } + if ( input_file_properties[ i ].length == 3 ) { + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote domain combination counts for \"" + + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", " + + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile + "\"" ); + } + else { + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote domain combination counts for \"" + + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ") to: \"" + + dc_outfile + "\"" ); + } + } + + private static void writeDomainData( final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map, + final GoNameSpace go_namespace_limit, + final Writer out, + final String domain_0, + final String domain_1, + final String prefix_for_html, + final String character_separator_for_non_html_output, + final Map>[] 
domain_id_to_secondary_features_maps, + final Set all_go_ids ) throws IOException { + boolean any_go_annotation_present = false; + boolean first_has_no_go = false; + int domain_count = 2; // To distinguish between domains and binary domain combinations. + if ( ForesterUtil.isEmpty( domain_1 ) ) { + domain_count = 1; + } + // The following has a difficult to understand logic. + for( int d = 0; d < domain_count; ++d ) { + List go_ids = null; + boolean go_annotation_present = false; + if ( d == 0 ) { + final DomainId domain_id = new DomainId( domain_0 ); + if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) { + go_annotation_present = true; + any_go_annotation_present = true; + go_ids = domain_id_to_go_ids_map.get( domain_id ); + } + else { + first_has_no_go = true; + } + } + else { + final DomainId domain_id = new DomainId( domain_1 ); + if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) { + go_annotation_present = true; + any_go_annotation_present = true; + go_ids = domain_id_to_go_ids_map.get( domain_id ); + } + } + if ( go_annotation_present ) { + boolean first = ( ( d == 0 ) || ( ( d == 1 ) && first_has_no_go ) ); + for( final GoId go_id : go_ids ) { + out.write( "" ); + if ( first ) { + first = false; + writeDomainIdsToHtml( out, + domain_0, + domain_1, + prefix_for_html, + domain_id_to_secondary_features_maps ); + } + else { + out.write( "" ); + } + if ( !go_id_to_term_map.containsKey( go_id ) ) { + throw new IllegalArgumentException( "GO-id [" + go_id + "] not found in GO-id to GO-term map" ); + } + final GoTerm go_term = go_id_to_term_map.get( go_id ); + if ( ( go_namespace_limit == null ) || go_namespace_limit.equals( go_term.getGoNameSpace() ) ) { + // final String top = GoUtils.getPenultimateGoTerm( go_term, go_id_to_term_map ).getName(); + final String go_id_str = go_id.getId(); + out.write( "" ); + out.write( "" + go_id_str + "" ); + out.write( "" ); + out.write( go_term.getName() ); + if ( domain_count == 2 ) { + out.write( " (" + d + ")" ); + } + out.write( "" ); + // out.write( top ); + // out.write( "" ); + out.write( "[" ); + out.write( go_term.getGoNameSpace().toShortString() ); + out.write( "]" ); + out.write( "" ); + if ( all_go_ids != null ) { + all_go_ids.add( go_id ); + } + } + else { + out.write( "" ); + out.write( "" ); + out.write( "" ); + out.write( "" ); + out.write( "" ); + } + out.write( "" ); + out.write( SurfacingConstants.NL ); + } + } + } // for( int d = 0; d < domain_count; ++d ) + if ( !any_go_annotation_present ) { + out.write( "" ); + writeDomainIdsToHtml( out, domain_0, domain_1, prefix_for_html, domain_id_to_secondary_features_maps ); + out.write( "" ); + out.write( "" ); + out.write( "" ); + out.write( "" ); + out.write( "" ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + } + } + + private static void writeDomainDataORIG( final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map, + final GoNameSpace go_namespace_limit, + final Writer out, + final String domain_0, + final String domain_1, + final String prefix_for_html, + final String character_separator_for_non_html_output, + final Map>[] domain_id_to_secondary_features_maps, + final Set all_go_ids ) throws IOException { + boolean any_go_annotation_present = false; + boolean first_has_no_go = false; + int domain_count = 2; // To distinguish between domains and binary domain combinations. + if ( ForesterUtil.isEmpty( domain_1 ) ) { + domain_count = 1; + } + // The following has a difficult to understand logic. 
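+        // Roughly: the loop below visits domain_0 and, for a binary domain combination,
+        // also domain_1. For each domain that has GO annotations one output row is
+        // written per GO id; the domain ids themselves appear only in the first such
+        // row (tracked by 'first', which also covers the case where domain_0 has no
+        // GO mapping). If neither domain has any GO annotation, a single row without
+        // GO information is written after the loop.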
+ for( int d = 0; d < domain_count; ++d ) { + List go_ids = null; + boolean go_annotation_present = false; + if ( d == 0 ) { + final DomainId domain_id = new DomainId( domain_0 ); + if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) { + go_annotation_present = true; + any_go_annotation_present = true; + go_ids = domain_id_to_go_ids_map.get( domain_id ); + } + else { + first_has_no_go = true; + } + } + else { + final DomainId domain_id = new DomainId( domain_1 ); + if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) { + go_annotation_present = true; + any_go_annotation_present = true; + go_ids = domain_id_to_go_ids_map.get( domain_id ); + } + } + if ( go_annotation_present ) { + boolean first = ( ( d == 0 ) || ( ( d == 1 ) && first_has_no_go ) ); + for( final GoId go_id : go_ids ) { + out.write( "" ); + if ( first ) { + first = false; + writeDomainIdsToHtml( out, + domain_0, + domain_1, + prefix_for_html, + domain_id_to_secondary_features_maps ); + } + else { + out.write( "" ); + } + if ( !go_id_to_term_map.containsKey( go_id ) ) { + throw new IllegalArgumentException( "GO-id [" + go_id + "] not found in GO-id to GO-term map" ); + } + final GoTerm go_term = go_id_to_term_map.get( go_id ); + if ( ( go_namespace_limit == null ) || go_namespace_limit.equals( go_term.getGoNameSpace() ) ) { + final String top = GoUtils.getPenultimateGoTerm( go_term, go_id_to_term_map ).getName(); + final String go_id_str = go_id.getId(); + out.write( "" ); + out.write( "" + go_id_str + "" ); + out.write( "" ); + out.write( go_term.getName() ); + if ( domain_count == 2 ) { + out.write( " (" + d + ")" ); + } + out.write( "" ); + out.write( top ); + out.write( "" ); + out.write( "[" ); + out.write( go_term.getGoNameSpace().toShortString() ); + out.write( "]" ); + out.write( "" ); + if ( all_go_ids != null ) { + all_go_ids.add( go_id ); + } + } + else { + out.write( "" ); + out.write( "" ); + out.write( "" ); + out.write( "" ); + out.write( "" ); + } + out.write( "" ); + out.write( SurfacingConstants.NL ); + } + } + } // for( int d = 0; d < domain_count; ++d ) + if ( !any_go_annotation_present ) { + out.write( "" ); + writeDomainIdsToHtml( out, domain_0, domain_1, prefix_for_html, domain_id_to_secondary_features_maps ); + out.write( "" ); + out.write( "" ); + out.write( "" ); + out.write( "" ); + out.write( "" ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + } + } + + private static void writeDomainIdsToHtml( final Writer out, + final String domain_0, + final String domain_1, + final String prefix_for_detailed_html, + final Map>[] domain_id_to_secondary_features_maps ) + throws IOException { + out.write( "" ); + if ( !ForesterUtil.isEmpty( prefix_for_detailed_html ) ) { + out.write( prefix_for_detailed_html ); + out.write( " " ); + } + out.write( "" + domain_0 + "" ); + //if ( ForesterUtil.isEmpty( domain_1 ) ) { + // out.write( " [gs]" ); + //} + // if ( !ForesterUtil.isEmpty( domain_1 ) ) { + // out.write( "=" ); + // out.write( "" + domain_1 + "" ); + //} + // else if ( ( domain_id_to_secondary_features_maps != null ) + // && ( domain_id_to_secondary_features_maps.length > 0 ) ) { + // out.write( " [" ); + // boolean first = true; + // for( final Map> domain_id_to_secondary_features_map : domain_id_to_secondary_features_maps ) { + // final Set sec_features = domain_id_to_secondary_features_map.get( new DomainId( domain_0 ) ); + // if ( ( sec_features != null ) && ( sec_features.size() > 0 ) ) { + // for( final String sec_feature : sec_features ) { + // if ( first ) { + // first = false; + // 
} + // else { + // out.write( ", " ); + // } + // if ( SurfacingConstants.SECONDARY_FEATURES_ARE_SCOP + // && ( SurfacingConstants.SECONDARY_FEATURES_SCOP_LINK != null ) ) { + // out.write( "" + sec_feature + "" ); + // } + // else { + // out.write( sec_feature ); + // } + // } + // } + // } + // out.write( "]" ); + // } + out.write( "" ); + } + + private static void writeDomainIdsToHtmlORIG( final Writer out, + final String domain_0, + final String domain_1, + final String prefix_for_detailed_html, + final Map>[] domain_id_to_secondary_features_maps ) + throws IOException { + out.write( "" ); + if ( !ForesterUtil.isEmpty( prefix_for_detailed_html ) ) { + out.write( prefix_for_detailed_html ); + out.write( " " ); + } + out.write( "" + domain_0 + "" ); + if ( ForesterUtil.isEmpty( domain_1 ) ) { + out.write( " [gs]" ); + } + if ( !ForesterUtil.isEmpty( domain_1 ) ) { + out.write( "=" ); + out.write( "" + domain_1 + "" ); + } + else if ( ( domain_id_to_secondary_features_maps != null ) + && ( domain_id_to_secondary_features_maps.length > 0 ) ) { + out.write( " [" ); + boolean first = true; + for( final Map> domain_id_to_secondary_features_map : domain_id_to_secondary_features_maps ) { + final Set sec_features = domain_id_to_secondary_features_map.get( new DomainId( domain_0 ) ); + if ( ( sec_features != null ) && ( sec_features.size() > 0 ) ) { + for( final String sec_feature : sec_features ) { + if ( first ) { + first = false; + } + else { + out.write( ", " ); + } + if ( SurfacingConstants.SECONDARY_FEATURES_ARE_SCOP + && ( SurfacingConstants.SECONDARY_FEATURES_SCOP_LINK != null ) ) { + out.write( "" + sec_feature + "" ); + } + else { + out.write( sec_feature ); + } + } + } + } + out.write( "]" ); + } + out.write( "" ); + } + + public static DescriptiveStatistics writeDomainSimilaritiesToFile( final StringBuilder html_desc, + final StringBuilder html_title, + final Writer w, + final SortedSet similarities, + final boolean treat_as_binary, + final List species_order, + final PrintableDomainSimilarity.PRINT_OPTION print_option, + final DomainSimilarity.DomainSimilaritySortField sort_field, + final DomainSimilarity.DomainSimilarityScoring scoring, + final boolean verbose ) throws IOException { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + String histogram_title = null; + switch ( sort_field ) { + case ABS_MAX_COUNTS_DIFFERENCE: + if ( treat_as_binary ) { + histogram_title = "absolute counts difference:"; + } + else { + histogram_title = "absolute (maximal) counts difference:"; + } + break; + case MAX_COUNTS_DIFFERENCE: + if ( treat_as_binary ) { + histogram_title = "counts difference:"; + } + else { + histogram_title = "(maximal) counts difference:"; + } + break; + case DOMAIN_ID: + histogram_title = "score mean:"; + break; + case MIN: + histogram_title = "score minimum:"; + break; + case MAX: + histogram_title = "score maximum:"; + break; + case MAX_DIFFERENCE: + if ( treat_as_binary ) { + histogram_title = "difference:"; + } + else { + histogram_title = "(maximal) difference:"; + } + break; + case MEAN: + histogram_title = "score mean:"; + break; + case SD: + histogram_title = "score standard deviation:"; + break; + case SPECIES_COUNT: + histogram_title = "species number:"; + break; + default: + throw new AssertionError( "Unknown sort field: " + sort_field ); + } + for( final DomainSimilarity similarity : similarities ) { + switch ( sort_field ) { + case ABS_MAX_COUNTS_DIFFERENCE: + stats.addValue( Math.abs( similarity.getMaximalDifferenceInCounts() ) ); + break; 
+                case MAX_COUNTS_DIFFERENCE:
+                    stats.addValue( similarity.getMaximalDifferenceInCounts() );
+                    break;
+                case DOMAIN_ID:
+                    stats.addValue( similarity.getMeanSimilarityScore() );
+                    break;
+                case MIN:
+                    stats.addValue( similarity.getMinimalSimilarityScore() );
+                    break;
+                case MAX:
+                    stats.addValue( similarity.getMaximalSimilarityScore() );
+                    break;
+                case MAX_DIFFERENCE:
+                    stats.addValue( similarity.getMaximalDifference() );
+                    break;
+                case MEAN:
+                    stats.addValue( similarity.getMeanSimilarityScore() );
+                    break;
+                case SD:
+                    stats.addValue( similarity.getStandardDeviationOfSimilarityScore() );
+                    break;
+                case SPECIES_COUNT:
+                    stats.addValue( similarity.getSpecies().size() );
+                    break;
+                default:
+                    throw new AssertionError( "Unknown sort field: " + sort_field );
+            }
+        }
+        // final HistogramData[] hists = new HistogramData[ 1 ];
+        // List data_items = new ArrayList();
+        // double[] values = stats.getDataAsDoubleArray();
+        // for( int i = 0; i < values.length; i++ ) {
+        //     HistogramDataItem data_item = new BasicHistogramDataItem( "", values[ i ] );
+        //     data_items.add( data_item );
+        // }
+        // HistogramData hd0 = new HistogramData( "name", data_items, null, 20, 40 );
+        // hists[ 0 ] = hd0;
+        // final HistogramsFrame hf = new HistogramsFrame( hists );
+        // hf.setVisible( true );
+        AsciiHistogram histo = null;
+        // Only build a histogram if the scores actually vary.
+        if ( stats.getMin() < stats.getMax() ) {
+            histo = new AsciiHistogram( stats, histogram_title );
+        }
+        if ( verbose ) {
+            if ( histo != null ) {
+                System.out.println( histo.toStringBuffer( 20, '|', 40, 5 ) );
+            }
+            System.out.println();
+            System.out.println( "N : " + stats.getN() );
+            System.out.println( "Min : " + stats.getMin() );
+            System.out.println( "Max : " + stats.getMax() );
+            System.out.println( "Mean : " + stats.arithmeticMean() );
+            if ( stats.getN() > 1 ) {
+                System.out.println( "SD : " + stats.sampleStandardDeviation() );
+            }
+            else {
+                System.out.println( "SD : n/a" );
+            }
+            System.out.println( "Median : " + stats.median() );
+            if ( stats.getN() > 1 ) {
+                System.out.println( "Pearsonian skewness : " + stats.pearsonianSkewness() );
+            }
+            else {
+                System.out.println( "Pearsonian skewness : n/a" );
+            }
+        }
+        switch ( print_option ) {
+            case SIMPLE_TAB_DELIMITED:
+                break;
+            case HTML:
+                w.write( "" );
+                w.write( SurfacingConstants.NL );
+                addHtmlHead( w, "SURFACING :: " + html_title );
+                w.write( SurfacingConstants.NL );
+                w.write( "" );
+                w.write( SurfacingConstants.NL );
+                w.write( html_desc.toString() );
+                w.write( SurfacingConstants.NL );
+                w.write( "
    " ); + w.write( "
    " ); + w.write( SurfacingConstants.NL ); + w.write( "
    " );
    +                w.write( SurfacingConstants.NL );
    +                if ( histo != null ) {
    +                    w.write( histo.toStringBuffer( 20, '|', 40, 5 ).toString() );
    +                    w.write( SurfacingConstants.NL );
    +                }
    +                w.write( "
    " ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + if ( stats.getN() > 1 ) { + w.write( "" ); + } + else { + w.write( "" ); + } + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + if ( stats.getN() > 1 ) { + w.write( "" ); + } + else { + w.write( "" ); + } + w.write( SurfacingConstants.NL ); + w.write( "
    N: " + stats.getN() + "
    Min: " + stats.getMin() + "
    Max: " + stats.getMax() + "
    Mean: " + stats.arithmeticMean() + "
    SD: " + stats.sampleStandardDeviation() + "
    SD: n/a
    Median: " + stats.median() + "
    Pearsonian skewness: " + stats.pearsonianSkewness() + "
    Pearsonian skewness: n/a
    " ); + w.write( SurfacingConstants.NL ); + w.write( "
    " ); + w.write( SurfacingConstants.NL ); + w.write( "
    " ); + w.write( SurfacingConstants.NL ); + w.write( "
    " ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + break; + } + w.write( SurfacingConstants.NL ); + for( final DomainSimilarity similarity : similarities ) { + if ( ( species_order != null ) && !species_order.isEmpty() ) { + ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order ); + } + w.write( similarity.toStringBuffer( print_option ).toString() ); + w.write( SurfacingConstants.NL ); + } + switch ( print_option ) { + case HTML: + w.write( SurfacingConstants.NL ); + w.write( "
    " ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + break; + } + w.flush(); + w.close(); + return stats; + } + + private static void writeDomainsToIndividualFilePerTreeNode( final Writer individual_files_writer, + final String domain_0, + final String domain_1 ) throws IOException { + individual_files_writer.write( domain_0 ); + individual_files_writer.write( ForesterUtil.LINE_SEPARATOR ); + if ( !ForesterUtil.isEmpty( domain_1 ) ) { + individual_files_writer.write( domain_1 ); + individual_files_writer.write( ForesterUtil.LINE_SEPARATOR ); + } + } + + public static void writeMatrixToFile( final CharacterStateMatrix matrix, + final String filename, + final Format format ) { + final File outfile = new File( filename ); + checkForOutputFileWriteability( outfile ); + try { + final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); + matrix.toWriter( out, format ); + out.flush(); + out.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + } + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote matrix: \"" + filename + "\"" ); + } + + public static void writeMatrixToFile( final File matrix_outfile, final List matrices ) { + checkForOutputFileWriteability( matrix_outfile ); + try { + final BufferedWriter out = new BufferedWriter( new FileWriter( matrix_outfile ) ); + for( final DistanceMatrix distance_matrix : matrices ) { + out.write( distance_matrix.toStringBuffer( DistanceMatrix.Format.PHYLIP ).toString() ); + out.write( ForesterUtil.LINE_SEPARATOR ); + out.flush(); + } + out.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + } + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote distance matrices to \"" + matrix_outfile + "\"" ); + } + + private static void writePfamsToFile( final String outfile_name, final SortedSet pfams ) { + try { + final Writer writer = new BufferedWriter( new FileWriter( new File( outfile_name ) ) ); + for( final String pfam : pfams ) { + writer.write( pfam ); + writer.write( ForesterUtil.LINE_SEPARATOR ); + } + writer.close(); + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote " + pfams.size() + " pfams to [" + outfile_name + + "]" ); + } + catch ( final IOException e ) { + ForesterUtil.printWarningMessage( surfacing_old.PRG_NAME, "Failure to write: " + e ); + } + } + + public static void writePhylogenyToFile( final Phylogeny phylogeny, final String filename ) { + final PhylogenyWriter writer = new PhylogenyWriter(); + try { + writer.toPhyloXML( new File( filename ), phylogeny, 1 ); + } + catch ( final IOException e ) { + ForesterUtil.printWarningMessage( surfacing_old.PRG_NAME, "failed to write phylogeny to \"" + filename + + "\": " + e ); + } + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote phylogeny to \"" + filename + "\"" ); + } + + public static void writeTaxonomyLinks( final Writer writer, final String species ) throws IOException { + if ( ( species.length() > 1 ) && ( species.indexOf( '_' ) < 1 ) ) { + final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( species ); + writer.write( " [" ); + if ( matcher.matches() ) { + writer.write( "uniprot" ); + } + else { + writer.write( "eol" ); + writer.write( "|" ); + writer.write( "tol" ); + } + writer.write( "]" ); + } + } + + public static void writeTaxonomyLinksORIG( final 
Writer writer, final String species ) throws IOException { + if ( ( species.length() > 1 ) && ( species.indexOf( '_' ) < 1 ) ) { + final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( species ); + writer.write( " [" ); + if ( matcher.matches() ) { + writer.write( "uniprot" ); + } + else { + writer.write( "eol" ); + writer.write( "|" ); + writer.write( "tol" ); + writer.write( "|" ); + writer.write( "wikipedia" ); + writer.write( "|" ); + writer.write( "gs" ); + } + writer.write( "]" ); + } + } + + private static void writeToNexus( final String outfile_name, final CharacterStateMatrix matrix ) { + if ( !( matrix instanceof BasicCharacterStateMatrix ) ) { + throw new IllegalArgumentException( "can only write matrices of type [" + BasicCharacterStateMatrix.class + + "] to nexus" ); + } + final BasicCharacterStateMatrix my_matrix = ( org.forester.evoinference.matrix.character.BasicCharacterStateMatrix ) matrix; + try { + final BufferedWriter w = new BufferedWriter( new FileWriter( outfile_name ) ); + w.write( NexusConstants.NEXUS ); + w.write( ForesterUtil.LINE_SEPARATOR ); + my_matrix.writeNexusTaxaBlock( w ); + my_matrix.writeNexusBinaryChractersBlock( w ); + w.flush(); + w.close(); + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote Nexus file: \"" + outfile_name + "\"" ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + } + } + + private static void writeToNexus( final String outfile_name, + final CharacterStateMatrix matrix, + final Phylogeny phylogeny ) { + if ( !( matrix instanceof BasicCharacterStateMatrix ) ) { + throw new IllegalArgumentException( "can only write matrices of type [" + BasicCharacterStateMatrix.class + + "] to nexus" ); + } + final BasicCharacterStateMatrix my_matrix = ( org.forester.evoinference.matrix.character.BasicCharacterStateMatrix ) matrix; + final List phylogenies = new ArrayList( 1 ); + phylogenies.add( phylogeny ); + try { + final BufferedWriter w = new BufferedWriter( new FileWriter( outfile_name ) ); + w.write( NexusConstants.NEXUS ); + w.write( ForesterUtil.LINE_SEPARATOR ); + my_matrix.writeNexusTaxaBlock( w ); + my_matrix.writeNexusBinaryChractersBlock( w ); + PhylogenyWriter.writeNexusTreesBlock( w, phylogenies ); + w.flush(); + w.close(); + ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote Nexus file: \"" + outfile_name + "\"" ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + } + } + + private static void writeToNexus( final String outfile_name, final DomainParsimonyCalculator domain_parsimony ) { + writeToNexus( outfile_name + surfacing_old.NEXUS_EXTERNAL_DOMAINS, domain_parsimony + .createMatrixOfDomainPresenceOrAbsence() ); + writeToNexus( outfile_name + surfacing_old.NEXUS_EXTERNAL_DOMAIN_COMBINATIONS, domain_parsimony + .createMatrixOfBinaryDomainCombinationPresenceOrAbsence() ); + } + + private static void writeToNexus( final String outfile_name, + final DomainParsimonyCalculator domain_parsimony, + final Phylogeny phylogeny ) { + writeToNexus( outfile_name + surfacing_old.NEXUS_EXTERNAL_DOMAINS, domain_parsimony + .createMatrixOfDomainPresenceOrAbsence(), phylogeny ); + writeToNexus( outfile_name + surfacing_old.NEXUS_EXTERNAL_DOMAIN_COMBINATIONS, domain_parsimony + .createMatrixOfBinaryDomainCombinationPresenceOrAbsence(), phylogeny ); + } +} diff --git a/forester/java/src/org/forester/surfacing/TestSurfacing.java b/forester/java/src/org/forester/surfacing/TestSurfacing.java new file mode 100644 
index 0000000..678259e --- /dev/null +++ b/forester/java/src/org/forester/surfacing/TestSurfacing.java @@ -0,0 +1,6277 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.surfacing; + +import java.io.File; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix; +import org.forester.evoinference.matrix.character.CharacterStateMatrix; +import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates; +import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates; +import org.forester.io.parsers.HmmPfamOutputParser; +import org.forester.io.parsers.nexus.PaupLogParser; +import org.forester.io.parsers.nhx.NHXParser; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.surfacing.BinaryDomainCombination.DomainCombinationType; +import org.forester.test.Test; +import org.forester.util.ForesterUtil; + +@SuppressWarnings( "unused") +public class TestSurfacing { + + private final static double ZERO_DIFF = 1.0E-9; + + public static boolean isEqual( final double a, final double b ) { + return ( ( Math.abs( a - b ) ) < TestSurfacing.ZERO_DIFF ); + } + + private static StringBuffer mapToStringBuffer( final Map map ) { + final StringBuffer sb = new StringBuffer(); + for( final PhylogenyNode key : map.keySet() ) { + if ( !key.isExternal() ) { + sb.append( key.getName() ); + sb.append( " : " ); + sb.append( map.get( key ).toString() ); + sb.append( ForesterUtil.getLineSeparator() ); + } + } + return sb; + } + + public static boolean test( final File test_dir ) { + System.out.print( " Domain id: " ); + if ( !TestSurfacing.testDomainId() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Protein id: " ); + if ( !TestSurfacing.testProteinId() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Species: " ); + if ( !TestSurfacing.testSpecies() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." 
); + System.out.print( " Basic domain: " ); + if ( !TestSurfacing.testBasicDomain() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Basic protein: " ); + if ( !TestSurfacing.testBasicProtein() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Combinable domains: " ); + if ( !TestSurfacing.testCombinableDomains() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Directed combinable domains: " ); + if ( !TestSurfacing.testDirectedCombinableDomains() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Genome wide specific combinable domains: " ); + if ( !TestSurfacing.testGenomeWideCombinableDomains() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Domain architecture based genome similarity calculator: " ); + if ( !TestSurfacing.testDomainArchitectureBasedGenomeSimilarityCalculator() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Hmmpfam output parser: " ); + if ( !TestSurfacing.testHmmPfamOutputParser( test_dir ) ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Hmmpfam output parser with filter: " ); + if ( !TestSurfacing.testHmmPfamOutputParserWithFilter( test_dir ) ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Combinations based pairwise similarity calculator: " ); + if ( !TestSurfacing.testCombinationsBasedPairwiseSimilarityCalculator() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Copy number based pairwise similarity calculator: " ); + if ( !TestSurfacing.testCopyNumberBasedPairwiseSimilarityCalculator() ) { + return false; + } + System.out.println( "OK." ); + System.out.print( " Domain combination counting: " ); + if ( !TestSurfacing.testDomainCombinationCounting( test_dir ) ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Basic domain similarity calculator: " ); + if ( !TestSurfacing.testBasicDomainSimilarityCalculator() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Basic domain similarity calculator not ignoring species specific domains: " ); + if ( !TestSurfacing.testBasicDomainSimilarityCalculatorNotIgnoringSpeciesSpeficDomains() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Basic domain similarity calculator removal of singles: " ); + if ( !TestSurfacing.testBasicDomainSimilarityCalculatorRemovalOfSingles() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Domain sorting: " ); + if ( !TestSurfacing.testDomainSorting() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Overlap removal: " ); + if ( !TestSurfacing.testOverlapRemoval() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Engulfing overlap removal: " ); + if ( !TestSurfacing.testEngulfingOverlapRemoval() ) { + System.out.println( "failed." 
); + return false; + } + System.out.println( "OK." ); + System.out.print( " Binary domain combination: " ); + if ( !TestSurfacing.testBinaryDomainCombination() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Parsimony: " ); + if ( !TestSurfacing.testParsimony() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Directedness: " ); + if ( !TestSurfacing.testDirectedness() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Directedness and adjacency: " ); + if ( !TestSurfacing.testDirectednessAndAdjacency() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Dollo parsimony on secodary features: " ); + if ( !TestSurfacing.testParsimonyOnSecondaryFeatures() ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Paup log parser: " ); + if ( !TestSurfacing.testPaupLogParser( test_dir ) ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + System.out.print( " Binary state matrix to gain loss matrix: " ); + if ( !TestSurfacing.testBinaryStateMatrixToGainLossMatrix( test_dir ) ) { + System.out.println( "failed." ); + return false; + } + System.out.println( "OK." ); + return true; + } + + private static boolean testBasicDomain() { + try { + final Domain pd = new BasicDomain( "id", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + if ( !pd.getDomainId().getId().equals( "id" ) ) { + return false; + } + if ( pd.getNumber() != 1 ) { + return false; + } + if ( pd.getTotalCount() != 4 ) { + return false; + } + if ( !pd.equals( new BasicDomain( "id", 22, 111, ( short ) 1, ( short ) 4, 0.2, -12 ) ) ) { + return false; + } + final Domain a1 = new BasicDomain( "a", 1, 10, ( short ) 1, ( short ) 4, 0.1, -12 ); + final BasicDomain a1_copy = new BasicDomain( "a", 1, 10, ( short ) 1, ( short ) 4, 0.1, -12 ); + final BasicDomain a1_equal = new BasicDomain( "a", 524, 743994, ( short ) 1, ( short ) 300, 3.0005, 230 ); + final BasicDomain a2 = new BasicDomain( "a", 1, 10, ( short ) 2, ( short ) 4, 0.1, -12 ); + final BasicDomain a3 = new BasicDomain( "A", 1, 10, ( short ) 1, ( short ) 4, 0.1, -12 ); + if ( !a1.equals( a1 ) ) { + return false; + } + if ( !a1.equals( a1_copy ) ) { + return false; + } + if ( !a1.equals( a1_equal ) ) { + return false; + } + if ( !a1.equals( a2 ) ) { + return false; + } + if ( a1.equals( a3 ) ) { + return false; + } + if ( a1.compareTo( a1 ) != 0 ) { + return false; + } + if ( a1.compareTo( a1_copy ) != 0 ) { + return false; + } + if ( a1.compareTo( a1_equal ) != 0 ) { + return false; + } + if ( a1.compareTo( a2 ) != 0 ) { + return false; + } + if ( a1.compareTo( a3 ) != 0 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testBasicDomainSimilarityCalculator() { + // mouse : ABCDE + // rabbit: A.C.EF + // ciona : A....FGX + // nemve : ABCDEFG + // + // domain A: + // m r c n + // m 2/(2+3) 0 4/(4+2) + // r 1/(1+4) 3/(3+3) + // c 2/(2+5) + // n + // + // mean = ( 2/5 + 0 + 2/3 + 1/5 + 1/2 + 2/7 ) / 6 + // min = 0.0 + // max = 2/3 + // n = 6 + // + // + // domain B: + // m n + // m 4/(4+2) + // n + // + // mean = 2/3 + // min = 2/3 + // max = 2/3 + // sd = 0.0 + // n = 1 + // + // + // domain C: + // m r n + // m - 2/(2+3) 4/(4+2) + // r - - 3/(3+3) 
+ // n - - - + // + // mean = (2/5 + 2/3 + 1/2)/3 = + // min = 2/5 + // max = 2/3 + // sd = 0.0 + // n = 3 + try { + final Domain A = new BasicDomain( "A", 1, 2, ( short ) 1, ( short ) 1, 0.15, -12 ); + final Domain B = new BasicDomain( "B", 1, 2, ( short ) 1, ( short ) 1, 0.2, -12 ); + final Domain C = new BasicDomain( "C", 1, 2, ( short ) 1, ( short ) 1, 0.3, -12 ); + final Domain D = new BasicDomain( "D", 1, 2, ( short ) 1, ( short ) 1, 0.5, -12 ); + final Domain E = new BasicDomain( "E", 1, 2, ( short ) 1, ( short ) 1, 0.5, -12 ); + final Domain F = new BasicDomain( "F", 1, 2, ( short ) 1, ( short ) 1, 0.01, -12 ); + final Domain G = new BasicDomain( "G", 1, 2, ( short ) 1, ( short ) 1, 0.001, -12 ); + final Domain X = new BasicDomain( "X", 1, 2, ( short ) 1, ( short ) 1, 0.0001, -12 ); + if ( !TestSurfacing.isEqual( X.getPerSequenceScore(), -12 ) ) { + return false; + } + final Protein mouse_1 = new BasicProtein( "1", "mouse" ); + final Protein rabbit_1 = new BasicProtein( "1", "rabbit" ); + final Protein ciona_1 = new BasicProtein( "1", "ciona" ); + final Protein nemve_1 = new BasicProtein( "1", "nemve" ); + mouse_1.addProteinDomain( A ); + mouse_1.addProteinDomain( B ); + mouse_1.addProteinDomain( C ); + mouse_1.addProteinDomain( D ); + mouse_1.addProteinDomain( E ); + rabbit_1.addProteinDomain( A ); + rabbit_1.addProteinDomain( C ); + rabbit_1.addProteinDomain( E ); + rabbit_1.addProteinDomain( F ); + rabbit_1.addProteinDomain( F ); + rabbit_1.addProteinDomain( F ); + rabbit_1.addProteinDomain( F ); + rabbit_1.addProteinDomain( F ); + rabbit_1.addProteinDomain( F ); + ciona_1.addProteinDomain( A ); + ciona_1.addProteinDomain( A ); + ciona_1.addProteinDomain( A ); + ciona_1.addProteinDomain( A ); + ciona_1.addProteinDomain( A ); + ciona_1.addProteinDomain( F ); + ciona_1.addProteinDomain( G ); + ciona_1.addProteinDomain( X ); + nemve_1.addProteinDomain( A ); + nemve_1.addProteinDomain( B ); + nemve_1.addProteinDomain( C ); + nemve_1.addProteinDomain( D ); + nemve_1.addProteinDomain( E ); + nemve_1.addProteinDomain( F ); + nemve_1.addProteinDomain( G ); + final List protein_list_mouse = new ArrayList(); + final List protein_list_rabbit = new ArrayList(); + final List protein_list_ciona = new ArrayList(); + final List protein_list_nemve = new ArrayList(); + protein_list_mouse.add( mouse_1 ); + protein_list_rabbit.add( rabbit_1 ); + protein_list_ciona.add( ciona_1 ); + protein_list_nemve.add( nemve_1 ); + final List cdc_list = new ArrayList(); + cdc_list.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_mouse, + true, + new BasicSpecies( "mouse" ) ) ); + cdc_list.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_rabbit, + true, + new BasicSpecies( "rabbit" ) ) ); + cdc_list.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_ciona, + true, + new BasicSpecies( "ciona" ) ) ); + cdc_list.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_nemve, + true, + new BasicSpecies( "nemve" ) ) ); + final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, + false, + false ); + final SortedSet sims = calc + .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), + cdc_list, + true, + true ); + final Iterator sims_it = sims.iterator(); + final DomainSimilarity sa = sims_it.next(); + if ( !sa.getDomainId().getId().equals( "A" ) ) { + return false; + } + if ( sa.getSpeciesData().size() != 4 ) { + return false; + } + if ( 
!sa.getSpecies().contains( new BasicSpecies( "ciona" ) ) ) { + return false; + } + if ( !sa.getSpecies().contains( new BasicSpecies( "mouse" ) ) ) { + return false; + } + if ( !sa.getSpecies().contains( new BasicSpecies( "nemve" ) ) ) { + return false; + } + if ( !sa.getSpecies().contains( new BasicSpecies( "rabbit" ) ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa.getMeanSimilarityScore(), + ( 2.0 / 5 + 0 + 2.0 / 3 + 1.0 / 5 + 1.0 / 2 + 2.0 / 7 ) / 6 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa.getStandardDeviationOfSimilarityScore(), ( 0.23410788192183737 ) ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa.getMaximalSimilarityScore(), ( 2.0 / 3 ) ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa.getMinimalSimilarityScore(), ( 0.0 ) ) ) { + return false; + } + if ( sa.getN() != 6 ) { + return false; + } + if ( sa.getMaximalDifference() != 7 ) { + return false; + } + if ( sa.getMaximalDifferenceInCounts() != 3 ) { + return false; + } + final DomainSimilarity sb = sims_it.next(); + if ( !sb.getDomainId().getId().equals( "B" ) ) { + return false; + } + if ( sb.getSpeciesData().size() != 2 ) { + return false; + } + if ( !sb.getSpecies().contains( new BasicSpecies( "mouse" ) ) ) { + return false; + } + if ( !sb.getSpecies().contains( new BasicSpecies( "nemve" ) ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sb.getMeanSimilarityScore(), 2.0 / 3 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sb.getStandardDeviationOfSimilarityScore(), 0.0 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sb.getMaximalSimilarityScore(), ( 2.0 / 3 ) ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sb.getMinimalSimilarityScore(), ( 2.0 / 3 ) ) ) { + return false; + } + if ( sb.getN() != 1 ) { + return false; + } + if ( sb.getMaximalDifference() != 2 ) { + return false; + } + if ( sb.getMaximalDifferenceInCounts() != 2 ) { + return false; + } + final DomainSimilarity sc = sims_it.next(); + if ( !sc.getDomainId().getId().equals( "C" ) ) { + return false; + } + if ( sc.getSpeciesData().size() != 3 ) { + return false; + } + if ( !sc.getSpecies().contains( new BasicSpecies( "mouse" ) ) ) { + return false; + } + if ( !sc.getSpecies().contains( new BasicSpecies( "rabbit" ) ) ) { + return false; + } + if ( !sc.getSpecies().contains( new BasicSpecies( "nemve" ) ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sc.getMeanSimilarityScore(), ( 2.0 / 5 + 2.0 / 3 + 1.0 / 2 ) / 3 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sc.getStandardDeviationOfSimilarityScore(), 0.13471506281091264 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sc.getMaximalSimilarityScore(), ( 2.0 / 3 ) ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sc.getMinimalSimilarityScore(), ( 2.0 / 5 ) ) ) { + return false; + } + if ( sc.getN() != 3 ) { + return false; + } + if ( sc.getMaximalDifference() != 3 ) { + return false; + } + if ( sc.getMaximalDifferenceInCounts() != 3 ) { + return false; + } + // mouse : ....ABCDE..... + // rabbit: ....A.C.EFFF.. + // ciona : AAAAA......FGX + // nemve : ....ABCDEFG... 
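+ // (as the ratios in these tables indicate, each pairwise entry for a domain is shared/(shared+different): the domain partners it combines with in both species, over those plus the partners seen in only one of the two; e.g. for domain A, mouse {B,C,D,E} vs rabbit {C,E,F} gives 2/(2+3))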
+ // + // domain A: + // m r c n + // m 2/(2+3) 0 4/(4+2) + // r - 1/(1+5) 3/(3+3) + // c - 2/(2+6) + // n + // + // mean = ( 2/5 + 0 + 2/3 + 1/6 + 1/2 + 2/8 ) / 6 + // min = 0.0 + // max = 2/3 + // n = 6 + final List cdc_list2 = new ArrayList(); + cdc_list2.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_mouse, + false, + new BasicSpecies( "mouse" ) ) ); + cdc_list2.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_rabbit, + false, + new BasicSpecies( "rabbit" ) ) ); + cdc_list2.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_ciona, + false, + new BasicSpecies( "ciona" ) ) ); + cdc_list2.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_nemve, + false, + new BasicSpecies( "nemve" ) ) ); + final DomainSimilarityCalculator calc2 = new BasicDomainSimilarityCalculator( DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, + false, + false ); + final SortedSet sims2 = calc2 + .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), + cdc_list2, + false, + true ); + final Iterator sims_it2 = sims2.iterator(); + final DomainSimilarity sa2 = sims_it2.next(); + if ( !sa2.getDomainId().getId().equals( "A" ) ) { + return false; + } + if ( sa2.getSpeciesData().size() != 4 ) { + return false; + } + if ( !sa2.getSpecies().contains( new BasicSpecies( "ciona" ) ) ) { + return false; + } + if ( !sa2.getSpecies().contains( new BasicSpecies( "mouse" ) ) ) { + return false; + } + if ( !sa2.getSpecies().contains( new BasicSpecies( "nemve" ) ) ) { + return false; + } + if ( !sa2.getSpeciesData().keySet().contains( new BasicSpecies( "rabbit" ) ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa2.getMeanSimilarityScore(), + ( 2.0 / 5 + 0 + 2.0 / 3 + 1.0 / 6 + 1.0 / 2 + 2.0 / 8 ) / 6 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa2.getStandardDeviationOfSimilarityScore(), ( 0.2404663678647683 ) ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa2.getMaximalSimilarityScore(), ( 2.0 / 3 ) ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa2.getMinimalSimilarityScore(), ( 0.0 ) ) ) { + return false; + } + if ( sa2.getN() != 6 ) { + return false; + } + if ( sa2.getMaximalDifference() != 8 ) { + return false; + } + if ( sa2.getMaximalDifferenceInCounts() != 3 ) { + return false; + } + final Protein ciona_2 = new BasicProtein( "2", "ciona" ); + ciona_2.addProteinDomain( A ); + ciona_2.addProteinDomain( A ); + ciona_2.addProteinDomain( A ); + ciona_2.addProteinDomain( B ); + ciona_2.addProteinDomain( B ); + ciona_2.addProteinDomain( B ); + ciona_2.addProteinDomain( F ); + ciona_2.addProteinDomain( F ); + ciona_2.addProteinDomain( F ); + ciona_2.addProteinDomain( F ); + ciona_2.addProteinDomain( G ); + ciona_2.addProteinDomain( X ); + final Protein ciona_3 = new BasicProtein( "3", "ciona" ); + ciona_3.addProteinDomain( A ); + ciona_3.addProteinDomain( A ); + ciona_3.addProteinDomain( A ); + ciona_3.addProteinDomain( A ); + ciona_3.addProteinDomain( B ); + ciona_3.addProteinDomain( B ); + ciona_3.addProteinDomain( X ); + ciona_3.addProteinDomain( X ); + protein_list_ciona.add( ciona_2 ); + protein_list_ciona.add( ciona_3 ); + final List cdc_list3 = new ArrayList(); + cdc_list3.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_mouse, + true, + new BasicSpecies( "mouse" ) ) ); + cdc_list3.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_rabbit, + true, + new BasicSpecies( "rabbit" ) ) ); + cdc_list3.add( BasicGenomeWideCombinableDomains.createInstance( 
protein_list_ciona, + true, + new BasicSpecies( "ciona" ) ) ); + cdc_list3.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_nemve, + true, + new BasicSpecies( "nemve" ) ) ); + final DomainSimilarityCalculator calc3 = new BasicDomainSimilarityCalculator( DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, + false, + false ); + final SortedSet sims3 = calc3 + .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), + cdc_list3, + false, + true ); + final Iterator sims_it3 = sims3.iterator(); + final DomainSimilarity sa3 = sims_it3.next(); + if ( !sa3.getDomainId().getId().equals( "A" ) ) { + return false; + } + final SpeciesSpecificDomainSimilariyData ssdsd = sa3.getSpeciesData().get( new BasicSpecies( "ciona" ) ); + if ( ssdsd.getCombinableDomainIdToCountsMap().size() != 4 ) { + return false; + } + if ( ssdsd.getNumberOfProteinsExhibitingCombinationWith( new DomainId( "B" ) ) != 2 ) { + return false; + } + if ( ssdsd.getNumberOfProteinsExhibitingCombinationWith( new DomainId( "F" ) ) != 2 ) { + return false; + } + if ( ssdsd.getNumberOfProteinsExhibitingCombinationWith( new DomainId( "G" ) ) != 2 ) { + return false; + } + if ( ssdsd.getNumberOfProteinsExhibitingCombinationWith( new DomainId( "X" ) ) != 3 ) { + return false; + } + final List cdc_list4 = new ArrayList(); + cdc_list4.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_mouse, + false, + new BasicSpecies( "mouse" ) ) ); + cdc_list4.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_rabbit, + false, + new BasicSpecies( "rabbit" ) ) ); + cdc_list4.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_ciona, + false, + new BasicSpecies( "ciona" ) ) ); + ; + cdc_list4.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_nemve, + false, + new BasicSpecies( "nemve" ) ) ); + final DomainSimilarityCalculator calc4 = new BasicDomainSimilarityCalculator( DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, + true, + false ); + final SortedSet sims4 = calc4 + .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), + cdc_list4, + false, + true ); + final Iterator sims_it4 = sims4.iterator(); + final DomainSimilarity sa4 = sims_it4.next(); + if ( !sa4.getDomainId().getId().equals( "A" ) ) { + return false; + } + final SpeciesSpecificDomainSimilariyData ssdsd4 = sa4.getSpeciesData().get( new BasicSpecies( "ciona" ) ); + if ( ssdsd4.getCombinableDomainIdToCountsMap().size() != 5 ) { + return false; + } + if ( ssdsd4.getNumberOfProteinsExhibitingCombinationWith( new DomainId( "A" ) ) != 3 ) { + return false; + } + if ( ssdsd4.getNumberOfProteinsExhibitingCombinationWith( new DomainId( "B" ) ) != 2 ) { + return false; + } + if ( ssdsd4.getNumberOfProteinsExhibitingCombinationWith( new DomainId( "F" ) ) != 2 ) { + return false; + } + if ( ssdsd4.getNumberOfProteinsExhibitingCombinationWith( new DomainId( "G" ) ) != 2 ) { + return false; + } + if ( ssdsd4.getNumberOfProteinsExhibitingCombinationWith( new DomainId( "X" ) ) != 3 ) { + return false; + } + final SortedSet sims4_d = calc4 + .calculateSimilarities( new DomainCountsBasedPairwiseSimilarityCalculator(), cdc_list4, false, true ); + final Iterator sims_it4_d = sims4_d.iterator(); + final DomainSimilarity sa4_d = sims_it4_d.next(); + if ( !sa4_d.getDomainId().getId().equals( "A" ) ) { + return false; + } + if ( sa4_d.getCombinableDomainIds( new BasicSpecies( "ciona" ) ).size() != 5 ) { + return false; + } + if ( !TestSurfacing.isEqual( 
sa4_d.getMeanSimilarityScore(), ( 1 + 1 - 11.0 / 13 + 1 - 11.0 / 13 + 1 + 1 + + 1 - 11.0 / 13 ) / 6.0 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa4_d.getMaximalSimilarityScore(), 1.0 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa4_d.getMinimalSimilarityScore(), ( 1 - 11.0 / 13 ) ) ) { + return false; + } + if ( sa4_d.getN() != 6 ) { + return false; + } + final SortedSet sims4_p = calc4 + .calculateSimilarities( new ProteinCountsBasedPairwiseDomainSimilarityCalculator(), + cdc_list4, + false, + true ); + final Iterator sims_it4_p = sims4_p.iterator(); + final DomainSimilarity sa4_p = sims_it4_p.next(); + if ( !sa4_p.getDomainId().getId().equals( "A" ) ) { + return false; + } + if ( sa4_p.getCombinableDomainIds( new BasicSpecies( "ciona" ) ).size() != 5 ) { + return false; + } + if ( !sa4_p.getCombinableDomainIds( new BasicSpecies( "ciona" ) ).contains( new DomainId( "A" ) ) ) { + return false; + } + if ( !sa4_p.getCombinableDomainIds( new BasicSpecies( "ciona" ) ).contains( new DomainId( "B" ) ) ) { + return false; + } + if ( !sa4_p.getCombinableDomainIds( new BasicSpecies( "ciona" ) ).contains( new DomainId( "F" ) ) ) { + return false; + } + if ( !sa4_p.getCombinableDomainIds( new BasicSpecies( "ciona" ) ).contains( new DomainId( "G" ) ) ) { + return false; + } + if ( !sa4_p.getCombinableDomainIds( new BasicSpecies( "ciona" ) ).contains( new DomainId( "X" ) ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa4_p.getMeanSimilarityScore(), + ( 1 + 1 - 2.0 / 4 + 1 - 2.0 / 4 + 1 + 1 + 1 - 2.0 / 4 ) / 6.0 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa4_p.getMaximalSimilarityScore(), 1 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa4_p.getMinimalSimilarityScore(), ( 1 - 2.0 / 4 ) ) ) { + return false; + } + if ( sa4_p.getN() != 6 ) { + return false; + } + final List cdc_list5 = new ArrayList(); + cdc_list5.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_mouse, + true, + new BasicSpecies( "mouse" ) ) ); + cdc_list5.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_rabbit, + true, + new BasicSpecies( "rabbit" ) ) ); + cdc_list5.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_ciona, + true, + new BasicSpecies( "ciona" ) ) ); + cdc_list5.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_nemve, + true, + new BasicSpecies( "nemve" ) ) ); + final SortedSet sims5_d = calc4 + .calculateSimilarities( new DomainCountsBasedPairwiseSimilarityCalculator(), cdc_list5, false, true ); + final Iterator sims_it5_d = sims5_d.iterator(); + final DomainSimilarity sa5_d = sims_it5_d.next(); + if ( sa5_d.getSpecies().size() != 4 ) { + return false; + } + if ( !sa5_d.getSpecies().last().equals( new BasicSpecies( "rabbit" ) ) ) { + return false; + } + final SpeciesSpecificDomainSimilariyData ssdsd5 = sa5_d.getSpeciesData().get( new BasicSpecies( "ciona" ) ); + if ( ssdsd5.getCombinableDomainIdToCountsMap().size() != 4 ) { + return false; + } + if ( ssdsd5.getNumberOfProteinsExhibitingCombinationWith( new DomainId( "B" ) ) != 2 ) { + return false; + } + if ( ssdsd5.getNumberOfProteinsExhibitingCombinationWith( new DomainId( "F" ) ) != 2 ) { + return false; + } + if ( ssdsd5.getNumberOfProteinsExhibitingCombinationWith( new DomainId( "G" ) ) != 2 ) { + return false; + } + if ( ssdsd5.getNumberOfProteinsExhibitingCombinationWith( new DomainId( "X" ) ) != 3 ) { + return false; + } + if ( !sa5_d.getDomainId().getId().equals( "A" ) ) { + return false; + } + final Species ciona = new BasicSpecies( 
"ciona" ); + if ( sa5_d.getCombinableDomainIds( ciona ).size() != 4 ) { + return false; + } + if ( sa5_d.getCombinableDomainIds( ciona ).contains( new DomainId( "A" ) ) ) { + return false; + } + if ( !sa5_d.getCombinableDomainIds( ciona ).contains( new DomainId( "B" ) ) ) { + return false; + } + if ( !sa5_d.getCombinableDomainIds( ciona ).contains( new DomainId( "F" ) ) ) { + return false; + } + if ( !sa5_d.getCombinableDomainIds( ciona ).contains( new DomainId( "G" ) ) ) { + return false; + } + if ( !sa5_d.getCombinableDomainIds( ciona ).contains( new DomainId( "X" ) ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa5_d.getMeanSimilarityScore(), ( 1 + 1 - 11.0 / 13 + 1 - 11.0 / 13 + 1 + 1 + + 1 - 11.0 / 13 ) / 6.0 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa5_d.getMaximalSimilarityScore(), 1.0 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa5_d.getMinimalSimilarityScore(), ( 1 - 11.0 / 13 ) ) ) { + return false; + } + if ( sa5_d.getN() != 6 ) { + return false; + } + if ( sa5_d.getMaximalDifference() != sa5_d.getMaximalDifferenceInCounts() ) { + return false; + } + if ( sa5_d.getMaximalDifference() != 11 ) { + return false; + } + if ( sa5_d.getMaximalDifferenceInCounts() != 11 ) { + return false; + } + final SortedSet sims5_p = calc4 + .calculateSimilarities( new ProteinCountsBasedPairwiseDomainSimilarityCalculator(), + cdc_list5, + false, + true ); + final Iterator sims_it5_p = sims5_p.iterator(); + final DomainSimilarity sa5_p = sims_it5_p.next(); + if ( !sa5_p.getDomainId().getId().equals( "A" ) ) { + return false; + } + if ( sa5_p.getCombinableDomainIds( ciona ).size() != 4 ) { + return false; + } + if ( sa5_p.getCombinableDomainIds( ciona ).contains( new DomainId( "A" ) ) ) { + return false; + } + if ( !sa5_p.getCombinableDomainIds( ciona ).contains( new DomainId( "B" ) ) ) { + return false; + } + if ( !sa5_p.getCombinableDomainIds( ciona ).contains( new DomainId( "F" ) ) ) { + return false; + } + if ( !sa5_p.getCombinableDomainIds( ciona ).contains( new DomainId( "G" ) ) ) { + return false; + } + if ( !sa5_p.getCombinableDomainIds( ciona ).contains( new DomainId( "X" ) ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa5_p.getMeanSimilarityScore(), + ( 1 + 1 - 2.0 / 4 + 1 - 2.0 / 4 + 1 + 1 + 1 - 2.0 / 4 ) / 6.0 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa5_p.getMaximalSimilarityScore(), 1 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa5_p.getMinimalSimilarityScore(), ( 1 - 2.0 / 4 ) ) ) { + return false; + } + if ( sa5_p.getN() != 6 ) { + return false; + } + if ( sa5_p.getMaximalDifference() != sa5_p.getMaximalDifferenceInCounts() ) { + return false; + } + if ( sa5_p.getMaximalDifference() != 2 ) { + return false; + } + if ( sa5_p.getMaximalDifferenceInCounts() != 2 ) { + return false; + } + final List cdc_list6 = new ArrayList(); + cdc_list6.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_mouse, + false, + new BasicSpecies( "mouse" ) ) ); + cdc_list6.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_rabbit, + false, + new BasicSpecies( "rabbit" ) ) ); + cdc_list6.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_ciona, + false, + new BasicSpecies( "ciona" ) ) ); + cdc_list6.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_nemve, + false, + new BasicSpecies( "nemve" ) ) ); + final SortedSet sims6_d = calc4 + .calculateSimilarities( new DomainCountsBasedPairwiseSimilarityCalculator(), cdc_list6, false, true ); + final Iterator sims_it6_d = 
sims6_d.iterator(); + final DomainSimilarity sa6_d = sims_it6_d.next(); + if ( sa6_d.getSpecies().size() != 4 ) { + return false; + } + if ( !sa6_d.getSpecies().last().equals( new BasicSpecies( "rabbit" ) ) ) { + return false; + } + final SpeciesSpecificDomainSimilariyData ssdsd6 = sa6_d.getSpeciesData().get( new BasicSpecies( "ciona" ) ); + if ( ssdsd6.getCombinableDomainIdToCountsMap().size() != 5 ) { + return false; + } + if ( ssdsd6.getNumberOfProteinsExhibitingCombinationWith( new DomainId( "B" ) ) != 2 ) { + return false; + } + if ( ssdsd6.getNumberOfProteinsExhibitingCombinationWith( new DomainId( "F" ) ) != 2 ) { + return false; + } + if ( ssdsd6.getNumberOfProteinsExhibitingCombinationWith( new DomainId( "G" ) ) != 2 ) { + return false; + } + if ( ssdsd6.getNumberOfProteinsExhibitingCombinationWith( new DomainId( "X" ) ) != 3 ) { + return false; + } + if ( !sa5_d.getDomainId().getId().equals( "A" ) ) { + return false; + } + final Species ciona6 = new BasicSpecies( "ciona" ); + if ( sa6_d.getCombinableDomainIds( ciona6 ).size() != 5 ) { + return false; + } + if ( !sa6_d.getCombinableDomainIds( ciona6 ).contains( new DomainId( "A" ) ) ) { + return false; + } + if ( !sa6_d.getCombinableDomainIds( ciona6 ).contains( new DomainId( "B" ) ) ) { + return false; + } + if ( !sa6_d.getCombinableDomainIds( ciona6 ).contains( new DomainId( "F" ) ) ) { + return false; + } + if ( !sa6_d.getCombinableDomainIds( ciona6 ).contains( new DomainId( "G" ) ) ) { + return false; + } + if ( !sa6_d.getCombinableDomainIds( ciona6 ).contains( new DomainId( "X" ) ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa6_d.getMeanSimilarityScore(), ( 1 + 1 - 11.0 / 13 + 1 - 11.0 / 13 + 1 + 1 + + 1 - 11.0 / 13 ) / 6.0 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa6_d.getMaximalSimilarityScore(), 1.0 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa6_d.getMinimalSimilarityScore(), ( 1 - 11.0 / 13 ) ) ) { + return false; + } + if ( sa6_d.getN() != 6 ) { + return false; + } + if ( sa6_d.getMaximalDifference() != sa6_d.getMaximalDifferenceInCounts() ) { + return false; + } + if ( sa6_d.getMaximalDifference() != 11 ) { + return false; + } + if ( sa6_d.getMaximalDifferenceInCounts() != 11 ) { + return false; + } + final SortedSet sims6_p = calc4 + .calculateSimilarities( new ProteinCountsBasedPairwiseDomainSimilarityCalculator(), + cdc_list6, + false, + true ); + final Iterator sims_it6_p = sims6_p.iterator(); + final DomainSimilarity sa6_p = sims_it6_p.next(); + if ( !sa6_p.getDomainId().getId().equals( "A" ) ) { + return false; + } + if ( sa6_p.getCombinableDomainIds( ciona ).size() != 5 ) { + return false; + } + if ( !sa6_p.getCombinableDomainIds( ciona ).contains( new DomainId( "A" ) ) ) { + return false; + } + if ( !sa6_p.getCombinableDomainIds( ciona ).contains( new DomainId( "B" ) ) ) { + return false; + } + if ( !sa6_p.getCombinableDomainIds( ciona ).contains( new DomainId( "F" ) ) ) { + return false; + } + if ( !sa6_p.getCombinableDomainIds( ciona ).contains( new DomainId( "G" ) ) ) { + return false; + } + if ( !sa6_p.getCombinableDomainIds( ciona ).contains( new DomainId( "X" ) ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa6_p.getMeanSimilarityScore(), + ( 1 + 1 - 2.0 / 4 + 1 - 2.0 / 4 + 1 + 1 + 1 - 2.0 / 4 ) / 6.0 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa6_p.getMaximalSimilarityScore(), 1 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa6_p.getMinimalSimilarityScore(), ( 1 - 2.0 / 4 ) ) ) { + return false; + } + if ( sa6_p.getN() != 6 ) { + 
return false; + } + if ( sa6_p.getMaximalDifference() != sa6_p.getMaximalDifferenceInCounts() ) { + return false; + } + if ( sa6_p.getMaximalDifference() != 2 ) { + return false; + } + if ( sa6_p.getMaximalDifferenceInCounts() != 2 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testBasicDomainSimilarityCalculatorNotIgnoringSpeciesSpeficDomains() { + try { + final Domain A = new BasicDomain( "A", 1, 2, ( short ) 1, ( short ) 1, 0.15, -12 ); + final Domain B = new BasicDomain( "B", 1, 2, ( short ) 1, ( short ) 1, 0.2, -12 ); + final Domain D = new BasicDomain( "D", 1, 2, ( short ) 1, ( short ) 1, 0.5, -12 ); + final Domain E = new BasicDomain( "E", 1, 2, ( short ) 1, ( short ) 1, 0.5, -12 ); + final Domain F = new BasicDomain( "F", 1, 2, ( short ) 1, ( short ) 1, 0.01, -12 ); + final Domain G = new BasicDomain( "G", 1, 2, ( short ) 1, ( short ) 1, 0.001, -12 ); + final Domain X = new BasicDomain( "X", 1, 2, ( short ) 1, ( short ) 1, 0.0001, -12 ); + if ( !TestSurfacing.isEqual( X.getPerSequenceScore(), -12 ) ) { + return false; + } + final Protein mouse_1 = new BasicProtein( "1", "mouse" ); + final Protein rabbit_1 = new BasicProtein( "1", "rabbit" ); + final Protein ciona_1 = new BasicProtein( "1", "ciona" ); + final Protein nemve_1 = new BasicProtein( "1", "nemve" ); + mouse_1.addProteinDomain( A ); + mouse_1.addProteinDomain( D ); + mouse_1.addProteinDomain( E ); + rabbit_1.addProteinDomain( B ); + rabbit_1.addProteinDomain( E ); + rabbit_1.addProteinDomain( F ); + rabbit_1.addProteinDomain( F ); + rabbit_1.addProteinDomain( F ); + rabbit_1.addProteinDomain( F ); + rabbit_1.addProteinDomain( F ); + rabbit_1.addProteinDomain( F ); + ciona_1.addProteinDomain( F ); + ciona_1.addProteinDomain( G ); + ciona_1.addProteinDomain( X ); + nemve_1.addProteinDomain( D ); + nemve_1.addProteinDomain( E ); + nemve_1.addProteinDomain( F ); + nemve_1.addProteinDomain( G ); + final List protein_list_mouse = new ArrayList(); + final List protein_list_rabbit = new ArrayList(); + final List protein_list_ciona = new ArrayList(); + final List protein_list_nemve = new ArrayList(); + protein_list_mouse.add( mouse_1 ); + protein_list_rabbit.add( rabbit_1 ); + protein_list_ciona.add( ciona_1 ); + protein_list_nemve.add( nemve_1 ); + final List cdc_list = new ArrayList(); + cdc_list.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_mouse, + true, + new BasicSpecies( "mouse" ) ) ); + cdc_list.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_rabbit, + true, + new BasicSpecies( "rabbit" ) ) ); + cdc_list.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_ciona, + true, + new BasicSpecies( "ciona" ) ) ); + cdc_list.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_nemve, + true, + new BasicSpecies( "nemve" ) ) ); + final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, + false, + false ); + final SortedSet sims = calc + .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), + cdc_list, + true, + false ); + final Iterator sims_it = sims.iterator(); + final DomainSimilarity sa = sims_it.next(); + if ( !sa.getDomainId().getId().equals( "A" ) ) { + return false; + } + if ( sa.getSpeciesData().size() != 1 ) { + return false; + } + if ( !sa.getSpecies().contains( new BasicSpecies( "mouse" ) ) ) { + return false; + } + if ( 
!TestSurfacing.isEqual( sa.getMeanSimilarityScore(), 1.0 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa.getStandardDeviationOfSimilarityScore(), 0.0 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa.getMaximalSimilarityScore(), 1.0 ) ) { + return false; + } + if ( !TestSurfacing.isEqual( sa.getMinimalSimilarityScore(), 1.0 ) ) { + return false; + } + if ( sa.getN() != 0 ) { + return false; + } + if ( sa.getMaximalDifference() != 0 ) { + return false; + } + if ( sa.getMaximalDifferenceInCounts() != 0 ) { + return false; + } + final DomainSimilarity sb = sims_it.next(); + if ( !sb.getDomainId().getId().equals( "B" ) ) { + return false; + } + if ( sb.getSpeciesData().size() != 1 ) { + return false; + } + if ( !sb.getSpecies().contains( new BasicSpecies( "rabbit" ) ) ) { + return false; + } + final SortedSet sims2 = calc + .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), + cdc_list, + true, + true ); + final Iterator sims_it2 = sims2.iterator(); + final DomainSimilarity sa2 = sims_it2.next(); + if ( !sa2.getDomainId().getId().equals( "D" ) ) { + return false; + } + if ( sa2.getSpeciesData().size() != 2 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testBasicDomainSimilarityCalculatorRemovalOfSingles() { + try { + final Domain A = new BasicDomain( "A", 1, 2, ( short ) 1, ( short ) 1, 0.15, -12 ); + final Domain B = new BasicDomain( "B", 1, 2, ( short ) 1, ( short ) 1, 0.2, -12 ); + final Protein mouse_1 = new BasicProtein( "1", "mouse" ); + final Protein rabbit_1 = new BasicProtein( "1", "rabbit" ); + final Protein ciona_1 = new BasicProtein( "1", "ciona" ); + final Protein nemve_1 = new BasicProtein( "1", "nemve" ); + mouse_1.addProteinDomain( A ); + rabbit_1.addProteinDomain( A ); + ciona_1.addProteinDomain( A ); + ciona_1.addProteinDomain( A ); + ciona_1.addProteinDomain( A ); + ciona_1.addProteinDomain( A ); + ciona_1.addProteinDomain( A ); + nemve_1.addProteinDomain( A ); + final List protein_list_mouse = new ArrayList(); + final List protein_list_rabbit = new ArrayList(); + final List protein_list_ciona = new ArrayList(); + final List protein_list_nemve = new ArrayList(); + protein_list_mouse.add( mouse_1 ); + protein_list_rabbit.add( rabbit_1 ); + protein_list_ciona.add( ciona_1 ); + protein_list_nemve.add( nemve_1 ); + final List cdc_list = new ArrayList(); + cdc_list.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_mouse, + true, + new BasicSpecies( "mouse" ) ) ); + cdc_list.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_rabbit, + true, + new BasicSpecies( "rabbit" ) ) ); + cdc_list.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_ciona, + true, + new BasicSpecies( "ciona" ) ) ); + cdc_list.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_nemve, + true, + new BasicSpecies( "nemve" ) ) ); + final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( DomainSimilarity.DomainSimilaritySortField.DOMAIN_ID, + false, + false ); + final SortedSet sims = calc + .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), + cdc_list, + false, + true ); + if ( sims.size() != 1 ) { + return false; + } + final Iterator sims_it = sims.iterator(); + final DomainSimilarity sa = sims_it.next(); + if ( !sa.getDomainId().getId().equals( "A" ) ) { + return false; + } + if ( sa.getSpeciesData().size() != 4 ) { + return false; + 
} + if ( !sa.getSpecies().contains( new BasicSpecies( "ciona" ) ) ) { + return false; + } + if ( !sa.getSpecies().contains( new BasicSpecies( "mouse" ) ) ) { + return false; + } + if ( !sa.getSpecies().contains( new BasicSpecies( "nemve" ) ) ) { + return false; + } + if ( !sa.getSpecies().contains( new BasicSpecies( "rabbit" ) ) ) { + return false; + } + final SortedSet sims_ns = calc + .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), + cdc_list, + true, + true ); + if ( sims_ns.size() != 0 ) { + return false; + } + final Protein mouse_2 = new BasicProtein( "1", "mouse" ); + final Protein rabbit_2 = new BasicProtein( "1", "rabbit" ); + final Protein ciona_2 = new BasicProtein( "1", "ciona" ); + final Protein nemve_2 = new BasicProtein( "1", "nemve" ); + mouse_2.addProteinDomain( A ); + rabbit_2.addProteinDomain( A ); + ciona_2.addProteinDomain( A ); + ciona_2.addProteinDomain( A ); + ciona_2.addProteinDomain( B ); + ciona_2.addProteinDomain( A ); + ciona_2.addProteinDomain( A ); + ciona_2.addProteinDomain( A ); + nemve_2.addProteinDomain( A ); + final List protein_list_mouse2 = new ArrayList(); + final List protein_list_rabbit2 = new ArrayList(); + final List protein_list_ciona2 = new ArrayList(); + final List protein_list_nemve2 = new ArrayList(); + protein_list_mouse2.add( mouse_2 ); + protein_list_rabbit2.add( rabbit_2 ); + protein_list_ciona2.add( ciona_2 ); + protein_list_nemve2.add( nemve_2 ); + final List cdc_list2 = new ArrayList(); + cdc_list2.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_mouse2, + true, + new BasicSpecies( "mouse" ) ) ); + cdc_list2.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_rabbit2, + true, + new BasicSpecies( "rabbit" ) ) ); + cdc_list2.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_ciona2, + true, + new BasicSpecies( "ciona" ) ) ); + cdc_list2.add( BasicGenomeWideCombinableDomains.createInstance( protein_list_nemve2, + true, + new BasicSpecies( "nemve" ) ) ); + final SortedSet sims2 = calc + .calculateSimilarities( new CombinationsBasedPairwiseDomainSimilarityCalculator(), + cdc_list2, + true, + true ); + if ( sims2.size() != 1 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testBasicProtein() { + try { + // A0 A10 B15 A20 B25 A30 B35 B40 C50 A60 C70 D80 + final Domain A0 = new BasicDomain( "A", 0, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain A10 = new BasicDomain( "A", 10, 11, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain B15 = new BasicDomain( "B", 11, 16, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain A20 = new BasicDomain( "A", 20, 100, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain B25 = new BasicDomain( "B", 25, 26, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain A30 = new BasicDomain( "A", 30, 31, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain B35 = new BasicDomain( "B", 31, 40, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain B40 = new BasicDomain( "B", 40, 600, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain C50 = new BasicDomain( "C", 50, 59, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain A60 = new BasicDomain( "A", 60, 395, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain C70 = new BasicDomain( "C", 70, 71, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain D80 = new BasicDomain( "D", 80, 81, ( short ) 1, ( short ) 4, 0.1, -12 ); + final BasicProtein p = new BasicProtein( "p", 
"owl" ); + p.addProteinDomain( B15 ); + p.addProteinDomain( C50 ); + p.addProteinDomain( A60 ); + p.addProteinDomain( A30 ); + p.addProteinDomain( C70 ); + p.addProteinDomain( B35 ); + p.addProteinDomain( B40 ); + p.addProteinDomain( A0 ); + p.addProteinDomain( A10 ); + p.addProteinDomain( A20 ); + p.addProteinDomain( B25 ); + p.addProteinDomain( D80 ); + List domains_ids = new ArrayList(); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "C" ) ); + if ( !p.contains( domains_ids, false ) ) { + return false; + } + if ( !p.contains( domains_ids, true ) ) { + return false; + } + domains_ids.add( new DomainId( "X" ) ); + if ( p.contains( domains_ids, false ) ) { + return false; + } + if ( p.contains( domains_ids, true ) ) { + return false; + } + domains_ids = new ArrayList(); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "C" ) ); + domains_ids.add( new DomainId( "D" ) ); + if ( !p.contains( domains_ids, false ) ) { + return false; + } + if ( !p.contains( domains_ids, true ) ) { + return false; + } + domains_ids = new ArrayList(); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "D" ) ); + domains_ids.add( new DomainId( "C" ) ); + if ( !p.contains( domains_ids, false ) ) { + return false; + } + if ( p.contains( domains_ids, true ) ) { + return false; + } + domains_ids = new ArrayList(); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + if ( !p.contains( domains_ids, false ) ) { + return false; + } + if ( !p.contains( domains_ids, true ) ) { + return false; + } + domains_ids = new ArrayList(); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "B" ) ); + if ( !p.contains( domains_ids, false ) ) { + return false; + } + if ( !p.contains( domains_ids, true ) ) { + return false; + } + domains_ids = new ArrayList(); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "B" ) ); + if ( !p.contains( domains_ids, false ) ) { + return false; + } + if ( !p.contains( domains_ids, true ) ) { + return false; + } + domains_ids = new ArrayList(); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "B" ) ); + if ( !p.contains( domains_ids, false ) ) { + return false; + } + if ( p.contains( domains_ids, true ) ) { + return false; + } + domains_ids = new ArrayList(); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "C" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "C" ) ); + domains_ids.add( new DomainId( "D" ) ); + if ( !p.contains( domains_ids, false ) ) { + return false; + } + if ( !p.contains( 
domains_ids, true ) ) { + return false; + } + domains_ids = new ArrayList(); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "C" ) ); + domains_ids.add( new DomainId( "D" ) ); + if ( !p.contains( domains_ids, false ) ) { + return false; + } + if ( !p.contains( domains_ids, true ) ) { + return false; + } + domains_ids = new ArrayList(); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "C" ) ); + domains_ids.add( new DomainId( "C" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "C" ) ); + domains_ids.add( new DomainId( "D" ) ); + if ( !p.contains( domains_ids, false ) ) { + return false; + } + if ( p.contains( domains_ids, true ) ) { + return false; + } + domains_ids = new ArrayList(); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "C" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "C" ) ); + domains_ids.add( new DomainId( "D" ) ); + if ( !p.contains( domains_ids, false ) ) { + return false; + } + if ( p.contains( domains_ids, true ) ) { + return false; + } + domains_ids = new ArrayList(); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "D" ) ); + if ( !p.contains( domains_ids, false ) ) { + return false; + } + if ( !p.contains( domains_ids, true ) ) { + return false; + } + domains_ids = new ArrayList(); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "C" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "C" ) ); + domains_ids.add( new DomainId( "D" ) ); + domains_ids.add( new DomainId( "X" ) ); + if ( p.contains( domains_ids, false ) ) { + return false; + } + if ( p.contains( domains_ids, true ) ) { + return false; + } + domains_ids = new ArrayList(); + domains_ids.add( new DomainId( "X" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new 
DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "C" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "C" ) ); + domains_ids.add( new DomainId( "D" ) ); + if ( p.contains( domains_ids, false ) ) { + return false; + } + if ( p.contains( domains_ids, true ) ) { + return false; + } + domains_ids = new ArrayList(); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "B" ) ); + domains_ids.add( new DomainId( "C" ) ); + domains_ids.add( new DomainId( "A" ) ); + domains_ids.add( new DomainId( "C" ) ); + domains_ids.add( new DomainId( "D" ) ); + if ( !p.contains( domains_ids, false ) ) { + return false; + } + if ( p.contains( domains_ids, true ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testBinaryDomainCombination() { + try { + final BasicBinaryDomainCombination s0 = new BasicBinaryDomainCombination( "a", "a" ); + final BasicBinaryDomainCombination s1 = new BasicBinaryDomainCombination( "b", "a" ); + final BasicBinaryDomainCombination s2 = new BasicBinaryDomainCombination( "a", "b" ); + final BasicBinaryDomainCombination s3 = new BasicBinaryDomainCombination( "B", "A" ); + final BasicBinaryDomainCombination s4 = new BasicBinaryDomainCombination( "A", "B" ); + final BasicBinaryDomainCombination s5 = new BasicBinaryDomainCombination( "c", "a" ); + final BasicBinaryDomainCombination s6 = new BasicBinaryDomainCombination( "b", "c" ); + final BasicBinaryDomainCombination s7 = new BasicBinaryDomainCombination( "d", "a" ); + final BasicBinaryDomainCombination s8 = new BasicBinaryDomainCombination( "b", "d" ); + final BinaryDomainCombination s9 = BasicBinaryDomainCombination.createInstance( " z-z=a-aa " ); + if ( !s9.toString().equals( "a-aa=z-z" ) ) { + return false; + } + if ( !s0.equals( s0 ) ) { + return false; + } + if ( s0.equals( s1 ) ) { + return false; + } + if ( s1.equals( s0 ) ) { + return false; + } + if ( !s1.equals( s2 ) ) { + return false; + } + if ( !s2.equals( s1 ) ) { + return false; + } + if ( s2.equals( s3 ) ) { + return false; + } + if ( s2.equals( s3 ) ) { + return false; + } + if ( s2.equals( s4 ) ) { + return false; + } + final SortedSet sorted = new TreeSet(); + sorted.add( s0 ); + sorted.add( s1 ); + sorted.add( s2 ); + sorted.add( s3 ); + sorted.add( s3 ); + sorted.add( s3 ); + sorted.add( s4 ); + sorted.add( s5 ); + sorted.add( s6 ); + sorted.add( s7 ); + sorted.add( s7 ); + sorted.add( s8 ); + if ( sorted.size() != 6 ) { + return false; + } + final DirectedBinaryDomainCombination aa = new DirectedBinaryDomainCombination( "a", "a" ); + final DirectedBinaryDomainCombination ba = new DirectedBinaryDomainCombination( "b", "a" ); + final DirectedBinaryDomainCombination ab = new DirectedBinaryDomainCombination( "a", "b" ); + final DirectedBinaryDomainCombination bb = new DirectedBinaryDomainCombination( "b", "b" ); + if ( !aa.equals( aa ) ) { + return false; + } + if ( aa.equals( bb ) ) { + return false; + } + if ( ab.equals( ba ) ) { + return false; + } + if ( ba.equals( ab ) ) { + return false; + 
} + if ( !ab.equals( ab ) ) { + return false; + } + if ( ab.equals( aa ) ) { + return false; + } + if ( ab.equals( bb ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testBinaryStateMatrixToGainLossMatrix( final File test_dir ) { + final BinaryStates I = BinaryStates.PRESENT; + final BinaryStates O = BinaryStates.ABSENT; + try { + final CharacterStateMatrix binary_states_matrix_0 = new BasicCharacterStateMatrix( 7, + 6 ); + binary_states_matrix_0.setIdentifier( 0, "A" ); + binary_states_matrix_0.setIdentifier( 1, "B" ); + binary_states_matrix_0.setIdentifier( 2, "C" ); + binary_states_matrix_0.setIdentifier( 3, "D" ); + binary_states_matrix_0.setIdentifier( 4, "1" ); + binary_states_matrix_0.setIdentifier( 5, "2" ); + binary_states_matrix_0.setIdentifier( 6, "3" ); + binary_states_matrix_0.setState( 0, 0, O ); + binary_states_matrix_0.setState( 1, 0, O ); + binary_states_matrix_0.setState( 2, 0, O ); + binary_states_matrix_0.setState( 3, 0, O ); + binary_states_matrix_0.setState( 4, 0, O ); + binary_states_matrix_0.setState( 5, 0, O ); + binary_states_matrix_0.setState( 6, 0, O ); + binary_states_matrix_0.setState( 0, 1, I ); + binary_states_matrix_0.setState( 1, 1, O ); + binary_states_matrix_0.setState( 2, 1, O ); + binary_states_matrix_0.setState( 3, 1, O ); + binary_states_matrix_0.setState( 4, 1, O ); + binary_states_matrix_0.setState( 5, 1, O ); + binary_states_matrix_0.setState( 6, 1, O ); + binary_states_matrix_0.setState( 0, 2, O ); + binary_states_matrix_0.setState( 1, 2, O ); + binary_states_matrix_0.setState( 2, 2, O ); + binary_states_matrix_0.setState( 3, 2, O ); + binary_states_matrix_0.setState( 4, 2, I ); + binary_states_matrix_0.setState( 5, 2, O ); + binary_states_matrix_0.setState( 6, 2, O ); + binary_states_matrix_0.setState( 0, 3, I ); + binary_states_matrix_0.setState( 1, 3, O ); + binary_states_matrix_0.setState( 2, 3, O ); + binary_states_matrix_0.setState( 3, 3, O ); + binary_states_matrix_0.setState( 4, 3, I ); + binary_states_matrix_0.setState( 5, 3, O ); + binary_states_matrix_0.setState( 6, 3, I ); + binary_states_matrix_0.setState( 0, 4, I ); + binary_states_matrix_0.setState( 1, 4, O ); + binary_states_matrix_0.setState( 2, 4, I ); + binary_states_matrix_0.setState( 3, 4, O ); + binary_states_matrix_0.setState( 4, 4, I ); + binary_states_matrix_0.setState( 5, 4, O ); + binary_states_matrix_0.setState( 6, 4, I ); + binary_states_matrix_0.setState( 0, 5, I ); + binary_states_matrix_0.setState( 1, 5, I ); + binary_states_matrix_0.setState( 2, 5, I ); + binary_states_matrix_0.setState( 3, 5, I ); + binary_states_matrix_0.setState( 4, 5, I ); + binary_states_matrix_0.setState( 5, 5, I ); + binary_states_matrix_0.setState( 6, 5, I ); + final String[] character_labels_0 = new String[ 6 ]; + character_labels_0[ 0 ] = "first"; + character_labels_0[ 1 ] = "second"; + character_labels_0[ 2 ] = "third"; + character_labels_0[ 3 ] = "forth"; + character_labels_0[ 4 ] = "fifth"; + character_labels_0[ 5 ] = "sixth"; + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny phylogeny_0 = factory.create( "(((A,B)1,C)2,D)3", new NHXParser() )[ 0 ]; + final DomainParsimonyCalculator dom_pars = DomainParsimonyCalculator.createInstance( phylogeny_0 ); + dom_pars.executeOnGivenBinaryStatesMatrix( binary_states_matrix_0, character_labels_0 ); + final CharacterStateMatrix gl_matrix_0 = dom_pars.getGainLossMatrix(); + // final 
StringWriter sw = new StringWriter(); + // gl_matrix_0.toWriter( sw ); + // System.out.println( sw.toString() ); + if ( dom_pars.getCost() != 13 ) { + return false; + } + if ( dom_pars.getTotalGains() != 5 ) { + return false; + } + if ( dom_pars.getTotalLosses() != 8 ) { + return false; + } + if ( dom_pars.getTotalUnchanged() != 29 ) { + return false; + } + if ( gl_matrix_0.getState( "A", 1 ) != GainLossStates.GAIN ) { + return false; + } + if ( gl_matrix_0.getState( "A", 4 ) != GainLossStates.UNCHANGED_PRESENT ) { + return false; + } + if ( gl_matrix_0.getState( "B", 4 ) != GainLossStates.LOSS ) { + return false; + } + if ( gl_matrix_0.getState( "C", 4 ) != GainLossStates.GAIN ) { + return false; + } + if ( gl_matrix_0.getState( "D", 4 ) != GainLossStates.LOSS ) { + return false; + } + if ( gl_matrix_0.getState( "1", 4 ) != GainLossStates.GAIN ) { + return false; + } + if ( gl_matrix_0.getState( "2", 4 ) != GainLossStates.LOSS ) { + return false; + } + if ( gl_matrix_0.getState( "3", 4 ) != GainLossStates.UNCHANGED_PRESENT ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testCombinableDomains() { + try { + final Domain key0 = new BasicDomain( "key0", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain a = new BasicDomain( "a", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain b = new BasicDomain( "b", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain c = new BasicDomain( "c", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final CombinableDomains cd0 = new BasicCombinableDomains( key0.getDomainId(), new BasicSpecies( "eel" ) ); + cd0.addCombinableDomain( a.getDomainId() ); + cd0.addCombinableDomain( b.getDomainId() ); + cd0.addCombinableDomain( b.getDomainId() ); + cd0.addCombinableDomain( c.getDomainId() ); + cd0.addCombinableDomain( c.getDomainId() ); + cd0.addCombinableDomain( c.getDomainId() ); + if ( cd0.getNumberOfCombinableDomains() != 3 ) { + return false; + } + if ( cd0.getNumberOfProteinsExhibitingCombination( a.getDomainId() ) != 1 ) { + return false; + } + if ( cd0.getNumberOfProteinsExhibitingCombination( b.getDomainId() ) != 2 ) { + return false; + } + if ( cd0.getNumberOfProteinsExhibitingCombination( c.getDomainId() ) != 3 ) { + return false; + } + if ( cd0.getNumberOfProteinsExhibitingCombination( key0.getDomainId() ) != 0 ) { + return false; + } + if ( cd0.getAllDomains().size() != 4 ) { + return false; + } + if ( !cd0.getAllDomains().contains( a.getDomainId() ) ) { + return false; + } + if ( !cd0.getAllDomains().contains( b.getDomainId() ) ) { + return false; + } + if ( !cd0.getAllDomains().contains( c.getDomainId() ) ) { + return false; + } + if ( !cd0.getAllDomains().contains( key0.getDomainId() ) ) { + return false; + } + if ( cd0.toBinaryDomainCombinations().size() != 3 ) { + return false; + } + final BasicBinaryDomainCombination s0 = new BasicBinaryDomainCombination( "key0", "a" ); + final BasicBinaryDomainCombination s1 = new BasicBinaryDomainCombination( "b", "key0" ); + final BasicBinaryDomainCombination s2 = new BasicBinaryDomainCombination( "key0", "c" ); + final BasicBinaryDomainCombination s3 = new BasicBinaryDomainCombination( "key0", "cc" ); + final BasicBinaryDomainCombination s4 = new BasicBinaryDomainCombination( "c", "key0" ); + if ( !cd0.toBinaryDomainCombinations().contains( s0 ) ) { + return false; + } + if ( !cd0.toBinaryDomainCombinations().contains( s1 ) ) { + return false; + } + if ( 
!cd0.toBinaryDomainCombinations().contains( s2 ) ) { + return false; + } + if ( cd0.toBinaryDomainCombinations().contains( s3 ) ) { + return false; + } + if ( !cd0.toBinaryDomainCombinations().contains( s4 ) ) { + return false; + } + final Domain key1 = new BasicDomain( "key1", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain a1 = new BasicDomain( "a1", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain b1 = new BasicDomain( "b1", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain c1 = new BasicDomain( "c1", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final CombinableDomains cd1 = new BasicCombinableDomains( key1.getDomainId(), new BasicSpecies( "eel" ) ); + cd1.addCombinableDomain( a1.getDomainId() ); + cd1.addCombinableDomain( b1.getDomainId() ); + cd1.addCombinableDomain( c1.getDomainId() ); + cd1.addCombinableDomain( key1.getDomainId() ); + if ( cd1.getNumberOfCombinableDomains() != 4 ) { + return false; + } + if ( cd1.getNumberOfProteinsExhibitingCombination( a1.getDomainId() ) != 1 ) { + return false; + } + if ( cd1.getNumberOfProteinsExhibitingCombination( b1.getDomainId() ) != 1 ) { + return false; + } + if ( cd1.getNumberOfProteinsExhibitingCombination( c1.getDomainId() ) != 1 ) { + return false; + } + if ( cd1.getNumberOfProteinsExhibitingCombination( key1.getDomainId() ) != 1 ) { + return false; + } + if ( cd1.getAllDomains().size() != 4 ) { + return false; + } + if ( cd1.toBinaryDomainCombinations().size() != 4 ) { + return false; + } + final BasicBinaryDomainCombination kk = new BasicBinaryDomainCombination( "key1", "key1" ); + if ( !cd1.toBinaryDomainCombinations().contains( kk ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testCombinationsBasedPairwiseSimilarityCalculator() { + try { + final Domain a = new BasicDomain( "A", 1, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain b = new BasicDomain( "B", 2, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain c = new BasicDomain( "C", 3, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain one_key = new BasicDomain( "bcl2", 4, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain two_key = new BasicDomain( "bcl2", 5, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final CombinableDomains one = new BasicCombinableDomains( one_key.getDomainId(), new BasicSpecies( "mouse" ) ); + final CombinableDomains two = new BasicCombinableDomains( two_key.getDomainId(), + new BasicSpecies( "rabbit" ) ); + one.addCombinableDomain( a.getDomainId() ); + one.addCombinableDomain( a.getDomainId() ); + two.addCombinableDomain( new BasicDomain( "A", 1, 5, ( short ) 1, ( short ) 4, 0.1, -12 ).getDomainId() ); + two.addCombinableDomain( b.getDomainId() ); + two.addCombinableDomain( c.getDomainId() ); + final PairwiseDomainSimilarityCalculator calc = new CombinationsBasedPairwiseDomainSimilarityCalculator(); + final PairwiseDomainSimilarity s1 = calc.calculateSimilarity( one, two ); + if ( !TestSurfacing.isEqual( s1.getSimilarityScore(), 1.0 / ( 1 + 2 ) ) ) { + return false; + } + if ( s1.getDifferenceInCounts() != ( 1 - 3 ) ) { + return false; + } + if ( ( ( CombinationsBasedPairwiseDomainSimilarity ) s1 ).getNumberOfDifferentDomains() != 2 ) { + return false; + } + one.addCombinableDomain( b.getDomainId() ); + one.addCombinableDomain( c.getDomainId() ); + final PairwiseDomainSimilarity s2 = calc.calculateSimilarity( one, two ); + if ( !TestSurfacing.isEqual( s2.getSimilarityScore(), 3.0 / 
( 0 + 3 ) ) ) { + return false; + } + if ( s2.getDifferenceInCounts() != 0 ) { + return false; + } + if ( ( ( CombinationsBasedPairwiseDomainSimilarity ) s2 ).getNumberOfDifferentDomains() != 0 ) { + return false; + } + final Domain d = new BasicDomain( "D", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain e = new BasicDomain( "E", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain f = new BasicDomain( "F", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + one.addCombinableDomain( d.getDomainId() ); + one.addCombinableDomain( d.getDomainId() ); + one.addCombinableDomain( e.getDomainId() ); + one.addCombinableDomain( f.getDomainId() ); + final PairwiseDomainSimilarity s3 = calc.calculateSimilarity( one, two ); + if ( !TestSurfacing.isEqual( s3.getSimilarityScore(), 3.0 / ( 3 + 3 ) ) ) { + return false; + } + if ( s3.getDifferenceInCounts() != ( 6 - 3 ) ) { + return false; + } + if ( ( ( CombinationsBasedPairwiseDomainSimilarity ) s3 ).getNumberOfDifferentDomains() != 3 ) { + return false; + } + final Domain aaa = new BasicDomain( "aaa", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain bbb = new BasicDomain( "bbb", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain three_key = new BasicDomain( "bcl2", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain four_key = new BasicDomain( "bcl2", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final CombinableDomains three = new BasicCombinableDomains( three_key.getDomainId(), + new BasicSpecies( "mouse" ) ); + final CombinableDomains four = new BasicCombinableDomains( four_key.getDomainId(), + new BasicSpecies( "rabbit" ) ); + three.addCombinableDomain( aaa.getDomainId() ); + four.addCombinableDomain( bbb.getDomainId() ); + final PairwiseDomainSimilarityCalculator calc2 = new CombinationsBasedPairwiseDomainSimilarityCalculator(); + final PairwiseDomainSimilarity s4 = calc2.calculateSimilarity( three, four ); + if ( !TestSurfacing.isEqual( s4.getSimilarityScore(), 0.0 / ( 0 + 2 ) ) ) { + return false; + } + final Domain aaa2 = new BasicDomain( "aaa", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + four.addCombinableDomain( aaa2.getDomainId() ); + final PairwiseDomainSimilarity s5 = calc.calculateSimilarity( three, four ); + if ( !TestSurfacing.isEqual( s5.getSimilarityScore(), 1.0 / ( 1 + 1 ) ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testCopyNumberBasedPairwiseSimilarityCalculator() { + try { + final Domain one_key = new BasicDomain( "bcl2", 4, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain two_key = new BasicDomain( "bcl2", 5, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final CombinableDomains one = new BasicCombinableDomains( one_key.getDomainId(), new BasicSpecies( "mouse" ) ); + final CombinableDomains two = new BasicCombinableDomains( two_key.getDomainId(), + new BasicSpecies( "rabbit" ) ); + one.setKeyDomainCount( 2 ); + two.setKeyDomainCount( 3 ); + final PairwiseDomainSimilarityCalculator calc = new DomainCountsBasedPairwiseSimilarityCalculator(); + PairwiseDomainSimilarity s1 = calc.calculateSimilarity( one, two ); + if ( !TestSurfacing.isEqual( s1.getSimilarityScore(), 1.0 - ( 3 - 2.0 ) / ( 2 + 3 ) ) ) { + return false; + } + if ( s1.getDifferenceInCounts() != ( 2 - 3 ) ) { + return false; + } + one.setKeyDomainCount( 1 ); + two.setKeyDomainCount( 1 ); + s1 = calc.calculateSimilarity( one, two ); + if ( !TestSurfacing.isEqual( s1.getSimilarityScore(), 1.0 ) ) { 
+ return false; + } + if ( s1.getDifferenceInCounts() != ( 1 - 1 ) ) { + return false; + } + one.setKeyDomainCount( 1 ); + two.setKeyDomainCount( 1000 ); + s1 = calc.calculateSimilarity( one, two ); + if ( !TestSurfacing.isEqual( s1.getSimilarityScore(), 1.0 - 999.0 / 1001 ) ) { + return false; + } + if ( s1.getDifferenceInCounts() != ( 1 - 1000 ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testDirectedCombinableDomains() { + try { + final Domain key0 = new BasicDomain( "key0", 10, 20, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain a = new BasicDomain( "a", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain b = new BasicDomain( "b", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain c = new BasicDomain( "c", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final CombinableDomains cd0 = new DirectedCombinableDomains( key0.getDomainId(), new BasicSpecies( "eel" ) ); + cd0.addCombinableDomain( a.getDomainId() ); + cd0.addCombinableDomain( b.getDomainId() ); + cd0.addCombinableDomain( b.getDomainId() ); + cd0.addCombinableDomain( c.getDomainId() ); + cd0.addCombinableDomain( c.getDomainId() ); + cd0.addCombinableDomain( c.getDomainId() ); + if ( cd0.getNumberOfCombinableDomains() != 3 ) { + return false; + } + if ( cd0.getNumberOfProteinsExhibitingCombination( a.getDomainId() ) != 1 ) { + return false; + } + if ( cd0.getNumberOfProteinsExhibitingCombination( b.getDomainId() ) != 2 ) { + return false; + } + if ( cd0.getNumberOfProteinsExhibitingCombination( c.getDomainId() ) != 3 ) { + return false; + } + if ( cd0.getNumberOfProteinsExhibitingCombination( key0.getDomainId() ) != 0 ) { + return false; + } + if ( cd0.getAllDomains().size() != 4 ) { + return false; + } + if ( !cd0.getAllDomains().contains( a.getDomainId() ) ) { + return false; + } + if ( !cd0.getAllDomains().contains( b.getDomainId() ) ) { + return false; + } + if ( !cd0.getAllDomains().contains( c.getDomainId() ) ) { + return false; + } + if ( !cd0.getAllDomains().contains( key0.getDomainId() ) ) { + return false; + } + if ( cd0.toBinaryDomainCombinations().size() != 3 ) { + return false; + } + final BinaryDomainCombination s0 = new DirectedBinaryDomainCombination( "key0", "a" ); + final BinaryDomainCombination s1 = new DirectedBinaryDomainCombination( "b", "key0" ); + final BinaryDomainCombination s2 = new DirectedBinaryDomainCombination( "key0", "c" ); + final BinaryDomainCombination s3 = new DirectedBinaryDomainCombination( "key0", "cc" ); + final BinaryDomainCombination s4 = new DirectedBinaryDomainCombination( "a", "b" ); + final BinaryDomainCombination s5 = new DirectedBinaryDomainCombination( "b", "a" ); + final BinaryDomainCombination s6 = new DirectedBinaryDomainCombination( "key0", "b" ); + final BinaryDomainCombination s7 = new DirectedBinaryDomainCombination( "a", "key0" ); + final BinaryDomainCombination s8 = new DirectedBinaryDomainCombination( "c", "key0" ); + if ( !cd0.toBinaryDomainCombinations().contains( s0 ) ) { + return false; + } + if ( cd0.toBinaryDomainCombinations().contains( s1 ) ) { + return false; + } + if ( !cd0.toBinaryDomainCombinations().contains( s2 ) ) { + return false; + } + if ( cd0.toBinaryDomainCombinations().contains( s3 ) ) { + return false; + } + if ( cd0.toBinaryDomainCombinations().contains( s4 ) ) { + return false; + } + if ( cd0.toBinaryDomainCombinations().contains( s5 ) ) { + return false; + } + if ( !cd0.toBinaryDomainCombinations().contains( 
s6 ) ) { + return false; + } + if ( cd0.toBinaryDomainCombinations().contains( s7 ) ) { + return false; + } + if ( cd0.toBinaryDomainCombinations().contains( s8 ) ) { + return false; + } + final Domain key1 = new BasicDomain( "key1", 1, 2, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain a1 = new BasicDomain( "a1", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain b1 = new BasicDomain( "b1", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain c1 = new BasicDomain( "c1", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final CombinableDomains cd1 = new DirectedCombinableDomains( key1.getDomainId(), new BasicSpecies( "eel" ) ); + cd1.addCombinableDomain( a1.getDomainId() ); + cd1.addCombinableDomain( b1.getDomainId() ); + cd1.addCombinableDomain( c1.getDomainId() ); + cd1.addCombinableDomain( key1.getDomainId() ); + if ( cd1.getNumberOfCombinableDomains() != 4 ) { + return false; + } + if ( cd1.getNumberOfProteinsExhibitingCombination( a1.getDomainId() ) != 1 ) { + return false; + } + if ( cd1.getNumberOfProteinsExhibitingCombination( b1.getDomainId() ) != 1 ) { + return false; + } + if ( cd1.getNumberOfProteinsExhibitingCombination( c1.getDomainId() ) != 1 ) { + return false; + } + if ( cd1.getNumberOfProteinsExhibitingCombination( key1.getDomainId() ) != 1 ) { + return false; + } + if ( cd1.getAllDomains().size() != 4 ) { + return false; + } + if ( cd1.toBinaryDomainCombinations().size() != 4 ) { + return false; + } + final BinaryDomainCombination kk = new DirectedBinaryDomainCombination( "key1", "key1" ); + if ( !cd1.toBinaryDomainCombinations().contains( kk ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testDirectedness() { + try { + final BinaryStates X = BinaryStates.PRESENT; + final BinaryStates O = BinaryStates.ABSENT; + final GainLossStates G = GainLossStates.GAIN; + final GainLossStates L = GainLossStates.LOSS; + final GainLossStates A = GainLossStates.UNCHANGED_ABSENT; + final GainLossStates P = GainLossStates.UNCHANGED_PRESENT; + final Protein one_1 = new BasicProtein( "one", "1" ); + final Protein two_1 = new BasicProtein( "two", "1" ); + final Protein three_1 = new BasicProtein( "three", "1" ); + final Protein four_1 = new BasicProtein( "four", "1" ); + final Protein five_1 = new BasicProtein( "five", "1" ); + one_1.addProteinDomain( new BasicDomain( "B", 12, 14, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + one_1.addProteinDomain( new BasicDomain( "C", 13, 14, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + one_1.addProteinDomain( new BasicDomain( "A", 11, 12, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + one_1.addProteinDomain( new BasicDomain( "X", 100, 110, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + one_1.addProteinDomain( new BasicDomain( "Y", 200, 210, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + two_1.addProteinDomain( new BasicDomain( "A", 10, 20, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + two_1.addProteinDomain( new BasicDomain( "B", 30, 40, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + two_1.addProteinDomain( new BasicDomain( "Y", 1, 2, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + two_1.addProteinDomain( new BasicDomain( "X", 10, 11, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + three_1.addProteinDomain( new BasicDomain( "P", 10, 11, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + three_1.addProteinDomain( new BasicDomain( "M", 1, 2, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + three_1.addProteinDomain( new BasicDomain( "M", 5, 6, ( short ) 1, ( short ) 4, 0.1, -12 ) ); 
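+ // Note: together with the two "N" copies added next, three_1 lays out its domains as M(1-2),
+ // N(3-4), M(5-6), N(7-8), P(10-11). Assuming DomainCombinationType.DIRECTED pairs each domain
+ // with every domain that starts downstream of it, three_1 should contribute both M->N and N->M
+ // as well as M->P and N->P, but never P->M or P->N, which is what the gwcd_1 checks below expect.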
+ three_1.addProteinDomain( new BasicDomain( "N", 7, 8, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + three_1.addProteinDomain( new BasicDomain( "N", 3, 4, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + four_1.addProteinDomain( new BasicDomain( "XX", 10, 20, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + five_1.addProteinDomain( new BasicDomain( "YY", 30, 40, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + final List list_1 = new ArrayList(); + list_1.add( one_1 ); + list_1.add( two_1 ); + list_1.add( three_1 ); + list_1.add( four_1 ); + list_1.add( five_1 ); + final GenomeWideCombinableDomains gwcd_1 = BasicGenomeWideCombinableDomains + .createInstance( list_1, false, new BasicSpecies( "1" ), DomainCombinationType.DIRECTED ); + if ( !gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "A", "B" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "B", "A" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "A", "A" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "A", "C" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "C", "A" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "B", "C" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "C", "X" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "C", "Y" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "A", "X" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "A", "Y" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "Y", "A" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "X", "A" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "C", "B" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "X", "Y" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "Y", "X" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "A", "Y" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "A", "X" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "Y", "C" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "M", "N" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "N", "M" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "N", "P" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "M", "P" ) ) ) { + return false; + } + if ( 
gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "P", "N" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "P", "M" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "XX", "YY" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "YY", "XX" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations().contains( new DirectedBinaryDomainCombination( "B", "B" ) ) ) { + return false; + } + // final List gwcd_list = new ArrayList(); + // gwcd_list.add( gwcd_1 ); + // gwcd_list.add( gwcd_2 ); + // final CharacterStateMatrix matrix_d = DomainParsimonyCalculator + // .createMatrixOfDomainPresenceOrAbsence( gwcd_list ); + // final CharacterStateMatrix matrix_bc = DomainParsimonyCalculator + // .createMatrixOfBinaryDomainCombinationPresenceOrAbsence( gwcd_list ); + // if ( matrix_d.getState( 0, 0 ) != X ) { + // return false; + // } + // if ( matrix_bc.getState( 0, 0 ) != X ) { + // return false; + // } + // + // + // final BasicCharacterStateMatrix dm = new BasicCharacterStateMatrix( new BinaryStates[][] { + // { X, X, X, X, X, X }, { X, X, X, X, X, X } } ); + // if ( !matrix_d.equals( dm ) ) { + // return false; + // } + // final BasicCharacterStateMatrix bcm = new BasicCharacterStateMatrix( new BinaryStates[][] { + // { X, O, X, X, X, X, O, X, X, O, X, X }, { X, X, X, O, O, O, O, X, O, O, X, X } } ); + // if ( !matrix_d.equals( dm ) ) { + // return false; + // } + //`````````````````````````` + // final List gwcd_list = new ArrayList(); + // gwcd_list.add( one ); + // gwcd_list.add( two ); + // gwcd_list.add( three ); + // gwcd_list.add( four ); + // final CharacterStateMatrix matrix_d = DomainParsimony + // .createMatrixOfDomainPresenceOrAbsence( gwcd_list ); + // final CharacterStateMatrix matrix_bc = DomainParsimony + // .createMatrixOfBinaryDomainCombinationPresenceOrAbsence( gwcd_list ); + // // System.out.println( "d:" ); + // // System.out.println(matrix_d.toStringBuffer().toString() ); + // // System.out.println( "bc:" ); + // // System.out.println(matrix_bc.toStringBuffer().toString() ); + // // 1 a b c e f g h l m + // // 2 a b c e f g i n o + // // 3 a b d e f g j p q + // // 4 a b d p r + // if ( matrix_d.getState( 0, 0 ) != X ) { + // return false; + // } + // if ( matrix_d.getState( 0, 1 ) != X ) { + // return false; + // } + // if ( matrix_d.getState( 0, 2 ) != X ) { + // return false; + // } + // if ( matrix_d.getState( 0, 3 ) != O ) { + // return false; + // } + // if ( matrix_d.getState( 0, 4 ) != X ) { + // return false; + // } + // if ( matrix_d.getState( 0, 5 ) != X ) { + // return false; + // } + // if ( matrix_d.getState( 0, 6 ) != X ) { + // return false; + // } + // if ( matrix_d.getState( 0, 7 ) != X ) { + // return false; + // } + // if ( matrix_d.getState( 0, 8 ) != O ) { + // return false; + // } + // // 1 a-a a-b a-c e-f e-g e-h f-g f-h g-h l-m + // // 2 a-b a-c e-f e-g e-i f-g f-i g-i n-o + // // 3 a-b a-d e-f e-g e-j f-g f-j g-j p-q + // // 4 a-b a-d p-r + // if ( matrix_bc.getState( 0, 0 ) != X ) { + // return false; + // } + // if ( matrix_bc.getState( 0, 1 ) != X ) { + // return false; + // } + // if ( matrix_bc.getState( 0, 2 ) != X ) { + // return false; + // } + // if ( matrix_bc.getState( 0, 3 ) != O ) { + // return false; + // } + // if ( matrix_bc.getState( 0, 4 ) != X ) { + // return false; + // } + // if ( 
matrix_bc.getState( 1, 0 ) != O ) { + // return false; + // } + // if ( matrix_bc.getState( 1, 1 ) != X ) { + // return false; + // } + // if ( matrix_bc.getState( 1, 2 ) != X ) { + // return false; + // } + // if ( matrix_bc.getState( 1, 3 ) != O ) { + // return false; + // } + // if ( matrix_bc.getState( 1, 4 ) != X ) { + // return false; + // } + // if ( matrix_bc.getState( 2, 0 ) != O ) { + // return false; + // } + // if ( matrix_bc.getState( 2, 1 ) != X ) { + // return false; + // } + // if ( matrix_bc.getState( 2, 2 ) != O ) { + // return false; + // } + // if ( matrix_bc.getState( 2, 3 ) != X ) { + // return false; + // } + // if ( matrix_bc.getState( 2, 4 ) != X ) { + // return false; + // } + // final PhylogenyFactory factory0 = ParserBasedPhylogenyFactory.getInstance(); + // final String p0_str = "((one,two)1-2,(three,four)3-4)root"; + // final Phylogeny p0 = factory0.create( p0_str, new NHXParser() )[ 0 ]; + // final DomainParsimony dp0 = DomainParsimony.createInstance( p0, gwcd_list ); + // dp0.executeDolloParsimonyOnDomainPresence(); + // final CharacterStateMatrix gl_matrix_d = dp0.getGainLossMatrix(); + // final CharacterStateMatrix is_matrix_d = dp0.getInternalStatesMatrix(); + // dp0.executeDolloParsimonyOnBinaryDomainCombintionPresence(); + // final CharacterStateMatrix gl_matrix_bc = dp0.getGainLossMatrix(); + // final CharacterStateMatrix is_matrix_bc = dp0.getInternalStatesMatrix(); + // if ( is_matrix_d.getState( "root", "A" ) != X ) { + // return false; + // } + // if ( is_matrix_d.getState( "root", "B" ) != X ) { + // return false; + // } + // if ( is_matrix_d.getState( "root", "C" ) != O ) { + // return false; + // } + // if ( is_matrix_d.getState( "root", "D" ) != O ) { + // return false; + // } + // if ( is_matrix_d.getState( "root", "E" ) != X ) { + // return false; + // } + // if ( is_matrix_bc.getState( "root", "A=A" ) != O ) { + // return false; + // } + // if ( is_matrix_bc.getState( "root", "A=B" ) != X ) { + // return false; + // } + // if ( is_matrix_bc.getState( "root", "A=C" ) != O ) { + // return false; + // } + // if ( is_matrix_bc.getState( "root", "A=D" ) != O ) { + // return false; + // } + // if ( is_matrix_bc.getState( "root", "G=H" ) != O ) { + // return false; + // } + // if ( is_matrix_bc.getState( "1-2", "G=H" ) != O ) { + // return false; + // } + // if ( is_matrix_bc.getState( "root", "E=F" ) != X ) { + // return false; + // } + // if ( gl_matrix_bc.getState( "root", "E=F" ) != P ) { + // return false; + // } + // if ( gl_matrix_bc.getState( "root", "A=A" ) != A ) { + // return false; + // } + // if ( gl_matrix_bc.getState( "one", "A=A" ) != G ) { + // return false; + // } + // if ( gl_matrix_bc.getState( "root", "A=B" ) != P ) { + // return false; + // } + // if ( gl_matrix_bc.getState( "3-4", "A=D" ) != G ) { + // return false; + // } + // if ( gl_matrix_bc.getState( "four", "E=F" ) != L ) { + // return false; + // } + // if ( gl_matrix_d.getState( "3-4", "P" ) != G ) { + // return false; + // } + // final Protein ab_1 = new BasicProtein( "ab", "one" ); + // ab_1.addProteinDomain( a ); + // ab_1.addProteinDomain( b ); + // final Protein ac_1 = new BasicProtein( "ac", "one" ); + // ac_1.addProteinDomain( a ); + // ac_1.addProteinDomain( c ); + // final Protein de_1 = new BasicProtein( "de", "one" ); + // de_1.addProteinDomain( d ); + // de_1.addProteinDomain( e ); + // final Protein ac_2 = new BasicProtein( "ac", "two" ); + // ac_2.addProteinDomain( a ); + // ac_2.addProteinDomain( c ); + // final Protein ab_3 = new BasicProtein( "ab", 
"three" ); + // ab_3.addProteinDomain( a ); + // ab_3.addProteinDomain( b ); + // final Protein de_4 = new BasicProtein( "de", "four" ); + // de_4.addProteinDomain( d ); + // de_4.addProteinDomain( e ); + // final Protein ab_6 = new BasicProtein( "ab", "six" ); + // ab_6.addProteinDomain( a ); + // ab_6.addProteinDomain( b ); + // final List spec_one = new ArrayList(); + // final List spec_two = new ArrayList(); + // final List spec_three = new ArrayList(); + // final List spec_four = new ArrayList(); + // final List spec_five = new ArrayList(); + // final List spec_six = new ArrayList(); + // final List spec_seven = new ArrayList(); + // spec_one.add( ab_1 ); + // spec_one.add( ac_1 ); + // spec_one.add( de_1 ); + // spec_two.add( ac_2 ); + // spec_three.add( ab_3 ); + // spec_four.add( de_4 ); + // spec_six.add( ab_6 ); + // final GenomeWideCombinableDomains one_gwcd = BasicGenomeWideCombinableDomains + // .createInstance( spec_one, false, new BasicSpecies( "one" ), false ); + // final GenomeWideCombinableDomains two_gwcd = BasicGenomeWideCombinableDomains + // .createInstance( spec_two, false, new BasicSpecies( "two" ), false ); + // final GenomeWideCombinableDomains three_gwcd = BasicGenomeWideCombinableDomains + // .createInstance( spec_three, false, new BasicSpecies( "three" ), false ); + // final GenomeWideCombinableDomains four_gwcd = BasicGenomeWideCombinableDomains + // .createInstance( spec_four, false, new BasicSpecies( "four" ), false ); + // final GenomeWideCombinableDomains five_gwcd = BasicGenomeWideCombinableDomains + // .createInstance( spec_five, false, new BasicSpecies( "five" ), false ); + // final GenomeWideCombinableDomains six_gwcd = BasicGenomeWideCombinableDomains + // .createInstance( spec_six, false, new BasicSpecies( "six" ), false ); + // final GenomeWideCombinableDomains seven_gwcd = BasicGenomeWideCombinableDomains + // .createInstance( spec_seven, false, new BasicSpecies( "seven" ), false + // ); + // final List gwcd_list1 = new ArrayList(); + // gwcd_list1.add( one_gwcd ); + // gwcd_list1.add( two_gwcd ); + // gwcd_list1.add( three_gwcd ); + // gwcd_list1.add( four_gwcd ); + // gwcd_list1.add( five_gwcd ); + // gwcd_list1.add( six_gwcd ); + // gwcd_list1.add( seven_gwcd ); + // final PhylogenyFactory factory1 = ParserBasedPhylogenyFactory.getInstance(); + // final String p1_str = "(((((one,two)12,three)123,(four,five)45)12345,six)123456,seven)root"; + // final Phylogeny p1 = factory1.create( p1_str, new NHXParser() )[ 0 ]; + // final DomainParsimony dp1 = DomainParsimony.createInstance( p1, gwcd_list1 ); + // dp1.executeDolloParsimonyOnDomainPresence(); + // final CharacterStateMatrix gl_dollo_d = dp1.getGainLossMatrix(); + // final CharacterStateMatrix i_dollo_d = dp1.getInternalStatesMatrix(); + // if ( dp1.getCost() != 14 ) { + // return false; + // } + // if ( dp1.getTotalGains() != 5 ) { + // return false; + // } + // if ( dp1.getTotalLosses() != 9 ) { + // return false; + // } + // if ( dp1.getTotalUnchanged() != 51 ) { + // return false; + // } + // if ( dp1.getNetGainsOnNode( "45" ) != -2 ) { + // return false; + // } + // if ( dp1.getSumOfGainsOnNode( "45" ) != 0 ) { + // return false; + // } + // if ( dp1.getSumOfLossesOnNode( "45" ) != 2 ) { + // return false; + // } + // if ( dp1.getSumOfUnchangedOnNode( "45" ) != 3 ) { + // return false; + // } + // if ( dp1.getSumOfUnchangedPresentOnNode( "45" ) != 2 ) { + // return false; + // } + // if ( dp1.getSumOfUnchangedAbsentOnNode( "45" ) != 1 ) { + // return false; + // } + // if ( 
dp1.getUnitsGainedOnNode( "45" ).contains( "A" ) ) { + // return false; + // } + // if ( !dp1.getUnitsLostOnNode( "45" ).contains( "A" ) ) { + // return false; + // } + // if ( !dp1.getUnitsLostOnNode( "45" ).contains( "B" ) ) { + // return false; + // } + // if ( !dp1.getUnitsGainedOnNode( "12345" ).contains( "D" ) ) { + // return false; + // } + // if ( !dp1.getUnitsOnNode( "12" ).contains( "A" ) ) { + // return false; + // } + // if ( !dp1.getUnitsOnNode( "12" ).contains( "B" ) ) { + // return false; + // } + // if ( !dp1.getUnitsOnNode( "12" ).contains( "C" ) ) { + // return false; + // } + // if ( !dp1.getUnitsOnNode( "12" ).contains( "D" ) ) { + // return false; + // } + // if ( !dp1.getUnitsOnNode( "12" ).contains( "E" ) ) { + // return false; + // } + // if ( dp1.getNetGainsOnNode( "123456" ) != 2 ) { + // return false; + // } + // if ( dp1.getSumOfGainsOnNode( "123456" ) != 2 ) { + // return false; + // } + // dp1.executeDolloParsimonyOnBinaryDomainCombintionPresence(); + // final CharacterStateMatrix gl_dollo_bc = dp1.getGainLossMatrix(); + // final CharacterStateMatrix i_dollo_bc = dp1.getInternalStatesMatrix(); + // if ( dp1.getCost() != 8 ) { + // return false; + // } + // if ( dp1.getTotalGains() != 3 ) { + // return false; + // } + // if ( dp1.getTotalLosses() != 5 ) { + // return false; + // } + // if ( dp1.getTotalUnchanged() != 31 ) { + // return false; + // } + // if ( !dp1.getUnitsLostOnNode( "45" ).contains( "A=B" ) ) { + // return false; + // } + // if ( !dp1.getUnitsGainedOnNode( "12345" ).contains( "D=E" ) ) { + // return false; + // } + // dp1.executeFitchParsimonyOnDomainPresence(); + // final CharacterStateMatrix gl_fitch_d = dp1.getGainLossMatrix(); + // final CharacterStateMatrix i_fitch_d = dp1.getInternalStatesMatrix(); + // if ( dp1.getCost() != 10 ) { + // return false; + // } + // if ( dp1.getTotalGains() != 7 ) { + // return false; + // } + // if ( dp1.getTotalLosses() != 3 ) { + // return false; + // } + // if ( dp1.getTotalUnchanged() != 55 ) { + // return false; + // } + // if ( !dp1.getUnitsGainedOnNode( "four" ).contains( "E" ) ) { + // return false; + // } + // dp1.executeFitchParsimonyOnBinaryDomainCombintion(); + // final CharacterStateMatrix gl_fitch_bc = dp1.getGainLossMatrix(); + // final CharacterStateMatrix i_fitch_bc = dp1.getInternalStatesMatrix(); + // if ( dp1.getCost() != 6 ) { + // return false; + // } + // if ( dp1.getTotalGains() != 4 ) { + // return false; + // } + // if ( dp1.getTotalLosses() != 2 ) { + // return false; + // } + // if ( dp1.getTotalUnchanged() != 33 ) { + // return false; + // } + // if ( !dp1.getUnitsLostOnNode( "45" ).contains( "A=B" ) ) { + // return false; + // } + // if ( !dp1.getUnitsGainedOnNode( "four" ).contains( "D=E" ) ) { + // return false; + // } + // if ( dp1.getNetGainsOnNode( "two" ) != -1 ) { + // return false; + // } + // if ( dp1.getNetGainsOnNode( "123" ) != 0 ) { + // return false; + // } + // if ( dp1.getSumOfUnchangedPresentOnNode( "123" ) != 1 ) { + // return false; + // } + // if ( dp1.getSumOfUnchangedAbsentOnNode( "123" ) != 2 ) { + // return false; + // } + // if ( dp1.getSumOfUnchangedOnNode( "123" ) != 3 ) { + // return false; + // } + // if ( dp1.getSumOfUnchangedOnNode( "two" ) != 2 ) { + // return false; + // } + // if ( !dp1.getUnitsUnchangedAbsentOnNode( "two" ).contains( "D=E" ) ) { + // return false; + // } + // if ( !dp1.getUnitsUnchangedPresentOnNode( "two" ).contains( "A=C" ) ) { + // return false; + // } + // if ( !dp1.getUnitsUnchangedAbsentOnNode( "123" ).contains( "A=C" ) 
) { + // return false; + // } + // if ( !dp1.getUnitsUnchangedPresentOnNode( "123" ).contains( "A=B" ) ) { + // return false; + // } + // if ( !dp1.getUnitsUnchangedAbsentOnNode( "123" ).contains( "D=E" ) ) { + // return false; + // } + // CharacterStateMatrix bsm = null; + // CharacterStateMatrix glm = null; + // bsm = new BasicCharacterStateMatrix( new BinaryStates[][] { { X, X, X, X, X }, + // { X, X, O, X, X }, { O, O, O, X, X }, { X, X, O, X, X }, { X, X, O, O, O }, { O, O, O, O, O } } ); + // if ( !bsm.equals( i_dollo_d ) ) { + // return false; + // } + // bsm = new BasicCharacterStateMatrix( new BinaryStates[][] { { X, X, X, O, O }, + // { X, X, O, O, O }, { O, O, O, O, O }, { X, X, O, O, O }, { X, X, O, O, O }, { O, O, O, O, O } } ); + // if ( !bsm.equals( i_fitch_d ) ) { + // return false; + // } + // glm = new BasicCharacterStateMatrix( new GainLossStates[][] { { P, P, P, P, P }, + // { P, L, P, L, L }, { P, P, G, P, P }, { P, P, A, L, L }, { P, P, A, P, P }, { A, A, A, P, P }, + // { A, A, A, L, L }, { L, L, A, P, P }, { P, P, A, G, G }, { P, P, A, A, A }, { G, G, A, A, A }, + // { A, A, A, A, A }, { A, A, A, A, A } } ); + // if ( !glm.equals( gl_dollo_d ) ) { + // return false; + // } + // glm = new BasicCharacterStateMatrix( new GainLossStates[][] { { P, P, P, G, G }, + // { P, L, P, A, A }, { P, P, G, A, A }, { P, P, A, A, A }, { P, P, A, A, A }, { A, A, A, G, G }, + // { A, A, A, A, A }, { L, L, A, A, A }, { P, P, A, A, A }, { P, P, A, A, A }, { G, G, A, A, A }, + // { A, A, A, A, A }, { A, A, A, A, A } } ); + // if ( !glm.equals( gl_fitch_d ) ) { + // return false; + // } + // bsm = new BasicCharacterStateMatrix( new BinaryStates[][] { { X, X, X }, { X, O, X }, + // { O, O, X }, { X, O, X }, { X, O, O }, { O, O, O } } ); + // if ( !bsm.equals( i_dollo_bc ) ) { + // return false; + // } + // bsm = new BasicCharacterStateMatrix( new BinaryStates[][] { { X, X, O }, { X, O, O }, + // { O, O, O }, { X, O, O }, { X, O, O }, { O, O, O } } ); + // if ( !bsm.equals( i_fitch_bc ) ) { + // return false; + // } + // glm = new BasicCharacterStateMatrix( new GainLossStates[][] { { P, P, P }, { L, P, L }, + // { P, G, P }, { P, A, L }, { P, A, P }, { A, A, P }, { A, A, L }, { L, A, P }, { P, A, G }, + // { P, A, A }, { G, A, A }, { A, A, A }, { A, A, A } } ); + // if ( !glm.equals( gl_dollo_bc ) ) { + // return false; + // } + // glm = new BasicCharacterStateMatrix( new GainLossStates[][] { { P, P, G }, { L, P, A }, + // { P, G, A }, { P, A, A }, { P, A, A }, { A, A, G }, { A, A, A }, { L, A, A }, { P, A, A }, + // { P, A, A }, { G, A, A }, { A, A, A }, { A, A, A } } ); + // if ( !glm.equals( gl_fitch_bc ) ) { + // return false; + // } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testDirectednessAndAdjacency() { + try { + final Protein one_1 = new BasicProtein( "one", "1" ); + final Protein two_1 = new BasicProtein( "two", "1" ); + final Protein three_1 = new BasicProtein( "three", "1" ); + final Protein four_1 = new BasicProtein( "four", "1" ); + final Protein five_1 = new BasicProtein( "five", "1" ); + one_1.addProteinDomain( new BasicDomain( "B", 12, 14, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + one_1.addProteinDomain( new BasicDomain( "C", 13, 14, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + one_1.addProteinDomain( new BasicDomain( "A", 11, 12, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + one_1.addProteinDomain( new BasicDomain( "X", 100, 110, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + 
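// Note: together with the "Y" domain added next, one_1 orders its domains as A(11-12), B(12-14),
+ // C(13-14), X(100-110), Y(200-210). Assuming the DomainCombinationType.DIRECTED_ADJACTANT mode used
+ // below only pairs immediate neighbours in N- to C-terminal order, one_1 should contribute A->B,
+ // B->C, C->X and X->Y, but not non-adjacent pairs such as A->C or C->Y that the plain DIRECTED
+ // test above accepts. +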
one_1.addProteinDomain( new BasicDomain( "Y", 200, 210, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + two_1.addProteinDomain( new BasicDomain( "A", 10, 20, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + two_1.addProteinDomain( new BasicDomain( "B", 30, 40, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + two_1.addProteinDomain( new BasicDomain( "Y", 1, 2, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + two_1.addProteinDomain( new BasicDomain( "X", 10, 11, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + three_1.addProteinDomain( new BasicDomain( "P", 10, 11, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + three_1.addProteinDomain( new BasicDomain( "M", 1, 2, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + three_1.addProteinDomain( new BasicDomain( "M", 5, 6, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + three_1.addProteinDomain( new BasicDomain( "N", 7, 8, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + three_1.addProteinDomain( new BasicDomain( "N", 3, 4, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + four_1.addProteinDomain( new BasicDomain( "XX", 10, 20, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + five_1.addProteinDomain( new BasicDomain( "YY", 30, 40, ( short ) 1, ( short ) 4, 0.1, -12 ) ); + final List list_1 = new ArrayList(); + list_1.add( one_1 ); + list_1.add( two_1 ); + list_1.add( three_1 ); + list_1.add( four_1 ); + list_1.add( five_1 ); + final GenomeWideCombinableDomains gwcd_1 = BasicGenomeWideCombinableDomains + .createInstance( list_1, false, new BasicSpecies( "1" ), DomainCombinationType.DIRECTED_ADJACTANT ); + if ( !gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "A", "B" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "B", "A" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "A", "A" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "A", "C" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "C", "A" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "B", "C" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "C", "X" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "C", "Y" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "X", "Y" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "A", "X" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "A", "Y" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "Y", "A" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "X", "A" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "C", "B" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "X", 
"Y" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "Y", "X" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "A", "Y" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "A", "X" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "Y", "C" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "M", "N" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "N", "M" ) ) ) { + return false; + } + if ( !gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "N", "P" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "M", "P" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "P", "N" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "P", "M" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "XX", "YY" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "YY", "XX" ) ) ) { + return false; + } + if ( gwcd_1.toBinaryDomainCombinations() + .contains( new AdjactantDirectedBinaryDomainCombination( "B", "B" ) ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testDomainArchitectureBasedGenomeSimilarityCalculator() { + try { + final Domain a = new BasicDomain( "a", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain b = new BasicDomain( "b", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain c = new BasicDomain( "c", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain d = new BasicDomain( "d", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain e = new BasicDomain( "e", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain f = new BasicDomain( "f", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain g = new BasicDomain( "g", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain h = new BasicDomain( "h", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain i = new BasicDomain( "i", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain j = new BasicDomain( "j", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain k = new BasicDomain( "k", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain l = new BasicDomain( "l", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain m = new BasicDomain( "m", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain n = new BasicDomain( "n", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Protein eel_0 = new BasicProtein( "0", "eel" ); + final Protein eel_1 = new BasicProtein( "1", "eel" ); + final Protein eel_2 = new BasicProtein( "2", "eel" ); + final Protein eel_3 = new BasicProtein( "3", "eel" ); + final Protein eel_4 = new BasicProtein( "4", "eel" ); + final 
Protein eel_5 = new BasicProtein( "5", "eel" ); + final Protein eel_6 = new BasicProtein( "6", "eel" ); + final Protein rat_0 = new BasicProtein( "0", "rat" ); + final Protein rat_1 = new BasicProtein( "1", "rat" ); + final Protein rat_2 = new BasicProtein( "2", "rat" ); + final Protein rat_3 = new BasicProtein( "3", "rat" ); + final Protein rat_4 = new BasicProtein( "4", "rat" ); + final Protein rat_5 = new BasicProtein( "5", "rat" ); + final Protein rat_6 = new BasicProtein( "6", "rat" ); + final Protein rat_7 = new BasicProtein( "7", "rat" ); + eel_1.addProteinDomain( a ); + eel_2.addProteinDomain( a ); + eel_2.addProteinDomain( b ); + eel_3.addProteinDomain( a ); + eel_3.addProteinDomain( a ); + eel_3.addProteinDomain( b ); + eel_4.addProteinDomain( a ); + eel_4.addProteinDomain( b ); + eel_4.addProteinDomain( c ); + eel_4.addProteinDomain( d ); + eel_4.addProteinDomain( e ); + eel_5.addProteinDomain( e ); + eel_5.addProteinDomain( e ); + eel_5.addProteinDomain( f ); + eel_5.addProteinDomain( f ); + eel_5.addProteinDomain( f ); + eel_5.addProteinDomain( f ); + eel_6.addProteinDomain( g ); + eel_6.addProteinDomain( h ); + rat_1.addProteinDomain( a ); + rat_2.addProteinDomain( a ); + rat_2.addProteinDomain( b ); + rat_3.addProteinDomain( a ); + rat_3.addProteinDomain( a ); + rat_3.addProteinDomain( b ); + rat_4.addProteinDomain( a ); + rat_4.addProteinDomain( b ); + rat_4.addProteinDomain( c ); + rat_4.addProteinDomain( i ); + rat_4.addProteinDomain( l ); + rat_5.addProteinDomain( i ); + rat_5.addProteinDomain( f ); + rat_5.addProteinDomain( f ); + rat_6.addProteinDomain( j ); + rat_6.addProteinDomain( k ); + rat_7.addProteinDomain( m ); + rat_7.addProteinDomain( n ); + final List protein_list_eel = new ArrayList(); + protein_list_eel.add( eel_0 ); + protein_list_eel.add( eel_1 ); + protein_list_eel.add( eel_2 ); + protein_list_eel.add( eel_3 ); + protein_list_eel.add( eel_4 ); + protein_list_eel.add( eel_5 ); + protein_list_eel.add( eel_6 ); + final List protein_list_rat = new ArrayList(); + protein_list_rat.add( rat_0 ); + protein_list_rat.add( rat_1 ); + protein_list_rat.add( rat_2 ); + protein_list_rat.add( rat_3 ); + protein_list_rat.add( rat_4 ); + protein_list_rat.add( rat_5 ); + protein_list_rat.add( rat_6 ); + protein_list_rat.add( rat_7 ); + final GenomeWideCombinableDomains eel_not_ignore = BasicGenomeWideCombinableDomains + .createInstance( protein_list_eel, false, new BasicSpecies( "eel" ) ); + final GenomeWideCombinableDomains eel_ignore = BasicGenomeWideCombinableDomains + .createInstance( protein_list_eel, true, new BasicSpecies( "eel" ) ); + final GenomeWideCombinableDomains rat_not_ignore = BasicGenomeWideCombinableDomains + .createInstance( protein_list_rat, false, new BasicSpecies( "rat" ) ); + final GenomeWideCombinableDomains rat_ignore = BasicGenomeWideCombinableDomains + .createInstance( protein_list_rat, true, new BasicSpecies( "rat" ) ); + final DomainArchitectureBasedGenomeSimilarityCalculator calc_ni = new DomainArchitectureBasedGenomeSimilarityCalculator( eel_not_ignore, + rat_not_ignore ); + final DomainArchitectureBasedGenomeSimilarityCalculator calc_i = new DomainArchitectureBasedGenomeSimilarityCalculator( eel_ignore, + rat_ignore ); + if ( calc_ni.getAllDomains().size() != 14 ) { + return false; + } + if ( calc_i.getAllDomains().size() != 14 ) { + return false; + } + if ( calc_ni.getDomainsSpecificToGenome0().size() != 4 ) { + return false; + } + if ( calc_i.getDomainsSpecificToGenome0().size() != 4 ) { + return false; + } + if ( 
calc_ni.getDomainsSpecificToGenome1().size() != 6 ) { + return false; + } + if ( calc_i.getDomainsSpecificToGenome1().size() != 6 ) { + return false; + } + if ( calc_i.getSharedDomains().size() != 4 ) { + return false; + } + if ( calc_ni.getSharedDomains().size() != 4 ) { + return false; + } + if ( !calc_ni.getDomainsSpecificToGenome0().contains( d.getDomainId() ) ) { + return false; + } + if ( !calc_ni.getDomainsSpecificToGenome0().contains( e.getDomainId() ) ) { + return false; + } + if ( !calc_ni.getDomainsSpecificToGenome0().contains( g.getDomainId() ) ) { + return false; + } + if ( !calc_ni.getDomainsSpecificToGenome0().contains( h.getDomainId() ) ) { + return false; + } + if ( calc_ni.getDomainsSpecificToGenome0().contains( a.getDomainId() ) ) { + return false; + } + if ( calc_ni.getDomainsSpecificToGenome0().contains( i.getDomainId() ) ) { + return false; + } + if ( !calc_i.getDomainsSpecificToGenome0().contains( d.getDomainId() ) ) { + return false; + } + if ( !calc_i.getDomainsSpecificToGenome0().contains( e.getDomainId() ) ) { + return false; + } + if ( !calc_i.getDomainsSpecificToGenome0().contains( g.getDomainId() ) ) { + return false; + } + if ( !calc_i.getDomainsSpecificToGenome0().contains( h.getDomainId() ) ) { + return false; + } + if ( calc_i.getDomainsSpecificToGenome0().contains( a.getDomainId() ) ) { + return false; + } + if ( calc_i.getDomainsSpecificToGenome0().contains( i.getDomainId() ) ) { + return false; + } + if ( !calc_ni.getDomainsSpecificToGenome1().contains( i.getDomainId() ) ) { + return false; + } + if ( !calc_ni.getDomainsSpecificToGenome1().contains( l.getDomainId() ) ) { + return false; + } + if ( !calc_ni.getDomainsSpecificToGenome1().contains( j.getDomainId() ) ) { + return false; + } + if ( !calc_ni.getDomainsSpecificToGenome1().contains( k.getDomainId() ) ) { + return false; + } + if ( !calc_ni.getDomainsSpecificToGenome1().contains( m.getDomainId() ) ) { + return false; + } + if ( !calc_ni.getDomainsSpecificToGenome1().contains( n.getDomainId() ) ) { + return false; + } + if ( calc_ni.getDomainsSpecificToGenome1().contains( a.getDomainId() ) ) { + return false; + } + if ( calc_ni.getDomainsSpecificToGenome1().contains( b.getDomainId() ) ) { + return false; + } + if ( calc_ni.getDomainsSpecificToGenome1().contains( d.getDomainId() ) ) { + return false; + } + if ( !calc_i.getDomainsSpecificToGenome1().contains( i.getDomainId() ) ) { + return false; + } + if ( !calc_i.getDomainsSpecificToGenome1().contains( l.getDomainId() ) ) { + return false; + } + if ( !calc_i.getDomainsSpecificToGenome1().contains( j.getDomainId() ) ) { + return false; + } + if ( !calc_i.getDomainsSpecificToGenome1().contains( k.getDomainId() ) ) { + return false; + } + if ( !calc_i.getDomainsSpecificToGenome1().contains( m.getDomainId() ) ) { + return false; + } + if ( !calc_i.getDomainsSpecificToGenome1().contains( n.getDomainId() ) ) { + return false; + } + if ( calc_i.getDomainsSpecificToGenome1().contains( a.getDomainId() ) ) { + return false; + } + if ( calc_i.getDomainsSpecificToGenome1().contains( b.getDomainId() ) ) { + return false; + } + if ( calc_i.getDomainsSpecificToGenome1().contains( d.getDomainId() ) ) { + return false; + } + if ( !calc_i.getSharedDomains().contains( a.getDomainId() ) ) { + return false; + } + if ( !calc_i.getSharedDomains().contains( b.getDomainId() ) ) { + return false; + } + if ( !calc_i.getSharedDomains().contains( c.getDomainId() ) ) { + return false; + } + if ( !calc_i.getSharedDomains().contains( f.getDomainId() ) ) { + return false; + } + 
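// Note: at this point eel and rat share 4 of the 14 distinct domains (a, b, c and f), so the
+ // shared-domain similarity checked further below should be 1.0 - (14 - 4)/14 = 4/14. The
+ // combination-based score follows the same pattern: counting self-combinations (a-a, e-e, f-f)
+ // the two genomes have 25 distinct combinations of which 5 are shared, giving 1.0 - 20/25 = 0.2;
+ // ignoring self-combinations they have 22 of which 3 are shared, giving 3/22. +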
final Set all = calc_ni.getAllDomains(); + if ( !all.contains( a.getDomainId() ) ) { + return false; + } + if ( !all.contains( b.getDomainId() ) ) { + return false; + } + if ( !all.contains( c.getDomainId() ) ) { + return false; + } + if ( !all.contains( d.getDomainId() ) ) { + return false; + } + if ( !all.contains( e.getDomainId() ) ) { + return false; + } + if ( !all.contains( f.getDomainId() ) ) { + return false; + } + if ( !all.contains( g.getDomainId() ) ) { + return false; + } + if ( !all.contains( h.getDomainId() ) ) { + return false; + } + if ( !all.contains( i.getDomainId() ) ) { + return false; + } + if ( !all.contains( l.getDomainId() ) ) { + return false; + } + if ( !all.contains( j.getDomainId() ) ) { + return false; + } + if ( !all.contains( k.getDomainId() ) ) { + return false; + } + if ( !all.contains( m.getDomainId() ) ) { + return false; + } + if ( !all.contains( n.getDomainId() ) ) { + return false; + } + final Set s_0_ni = calc_ni.getBinaryDomainCombinationsSpecificToGenome0(); + final Set s_0_i = calc_i.getBinaryDomainCombinationsSpecificToGenome0(); + final Set s_1_ni = calc_ni.getBinaryDomainCombinationsSpecificToGenome1(); + final Set s_1_i = calc_i.getBinaryDomainCombinationsSpecificToGenome1(); + final Set a_ni = calc_ni.getAllBinaryDomainCombinations(); + final Set a_i = calc_i.getAllBinaryDomainCombinations(); + final Set shared_ni = calc_ni.getSharedBinaryDomainCombinations(); + final Set shared_i = calc_i.getSharedBinaryDomainCombinations(); + if ( a_ni.size() != 25 ) { + return false; + } + if ( a_i.size() != 22 ) { + return false; + } + if ( s_0_ni.size() != 10 ) { + return false; + } + if ( s_0_i.size() != 9 ) { + return false; + } + if ( s_1_ni.size() != 10 ) { + return false; + } + if ( s_1_i.size() != 10 ) { + return false; + } + if ( shared_ni.size() != 5 ) { + return false; + } + if ( shared_i.size() != 3 ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "a", "a" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "b", "a" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "a", "c" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "a", "d" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "a", "e" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "b", "c" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "b", "d" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "b", "e" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "c", "d" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "c", "e" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "d", "e" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "e", "f" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "g", "h" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "f", "f" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "e", "e" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "a", "i" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "a", "l" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( 
"b", "i" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "b", "l" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "c", "i" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "c", "l" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "i", "l" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "i", "f" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "m", "n" ) ) ) { + return false; + } + if ( !a_ni.contains( new BasicBinaryDomainCombination( "j", "k" ) ) ) { + return false; + } + if ( a_ni.contains( new BasicBinaryDomainCombination( "a", "g" ) ) ) { + return false; + } + if ( a_ni.contains( new BasicBinaryDomainCombination( "a", "m" ) ) ) { + return false; + } + if ( a_i.contains( new BasicBinaryDomainCombination( "a", "a" ) ) ) { + return false; + } + if ( a_i.contains( new BasicBinaryDomainCombination( "f", "f" ) ) ) { + return false; + } + if ( a_i.contains( new BasicBinaryDomainCombination( "e", "e" ) ) ) { + return false; + } + if ( !shared_ni.contains( new BasicBinaryDomainCombination( "a", "a" ) ) ) { + return false; + } + if ( !shared_ni.contains( new BasicBinaryDomainCombination( "a", "b" ) ) ) { + return false; + } + if ( !shared_ni.contains( new BasicBinaryDomainCombination( "a", "c" ) ) ) { + return false; + } + if ( !shared_ni.contains( new BasicBinaryDomainCombination( "b", "c" ) ) ) { + return false; + } + if ( !shared_ni.contains( new BasicBinaryDomainCombination( "f", "f" ) ) ) { + return false; + } + if ( shared_ni.contains( new BasicBinaryDomainCombination( "m", "n" ) ) ) { + return false; + } + if ( shared_i.contains( new BasicBinaryDomainCombination( "a", "a" ) ) ) { + return false; + } + if ( !shared_i.contains( new BasicBinaryDomainCombination( "a", "b" ) ) ) { + return false; + } + if ( !shared_i.contains( new BasicBinaryDomainCombination( "a", "c" ) ) ) { + return false; + } + if ( !shared_i.contains( new BasicBinaryDomainCombination( "b", "c" ) ) ) { + return false; + } + if ( shared_i.contains( new BasicBinaryDomainCombination( "f", "f" ) ) ) { + return false; + } + if ( shared_i.contains( new BasicBinaryDomainCombination( "m", "n" ) ) ) { + return false; + } + if ( !s_0_ni.contains( new BasicBinaryDomainCombination( "a", "d" ) ) ) { + return false; + } + if ( !s_0_ni.contains( new BasicBinaryDomainCombination( "a", "e" ) ) ) { + return false; + } + if ( !s_0_ni.contains( new BasicBinaryDomainCombination( "b", "d" ) ) ) { + return false; + } + if ( !s_0_ni.contains( new BasicBinaryDomainCombination( "b", "e" ) ) ) { + return false; + } + if ( !s_0_ni.contains( new BasicBinaryDomainCombination( "c", "d" ) ) ) { + return false; + } + if ( !s_0_ni.contains( new BasicBinaryDomainCombination( "c", "e" ) ) ) { + return false; + } + if ( !s_0_ni.contains( new BasicBinaryDomainCombination( "d", "e" ) ) ) { + return false; + } + if ( !s_0_ni.contains( new BasicBinaryDomainCombination( "e", "f" ) ) ) { + return false; + } + if ( !s_0_ni.contains( new BasicBinaryDomainCombination( "g", "h" ) ) ) { + return false; + } + if ( !s_0_ni.contains( new BasicBinaryDomainCombination( "e", "e" ) ) ) { + return false; + } + if ( !s_0_i.contains( new BasicBinaryDomainCombination( "a", "d" ) ) ) { + return false; + } + if ( !s_0_i.contains( new BasicBinaryDomainCombination( "a", "e" ) ) ) { + return false; + } + if ( !s_0_i.contains( new BasicBinaryDomainCombination( "b", "d" ) 
) ) { + return false; + } + if ( !s_0_i.contains( new BasicBinaryDomainCombination( "b", "e" ) ) ) { + return false; + } + if ( !s_0_i.contains( new BasicBinaryDomainCombination( "c", "d" ) ) ) { + return false; + } + if ( !s_0_i.contains( new BasicBinaryDomainCombination( "c", "e" ) ) ) { + return false; + } + if ( !s_0_i.contains( new BasicBinaryDomainCombination( "d", "e" ) ) ) { + return false; + } + if ( !s_0_i.contains( new BasicBinaryDomainCombination( "e", "f" ) ) ) { + return false; + } + if ( !s_0_i.contains( new BasicBinaryDomainCombination( "g", "h" ) ) ) { + return false; + } + if ( s_0_i.contains( new BasicBinaryDomainCombination( "e", "e" ) ) ) { + return false; + } + if ( !s_1_ni.contains( new BasicBinaryDomainCombination( "a", "i" ) ) ) { + return false; + } + if ( !s_1_ni.contains( new BasicBinaryDomainCombination( "a", "l" ) ) ) { + return false; + } + if ( !s_1_ni.contains( new BasicBinaryDomainCombination( "b", "i" ) ) ) { + return false; + } + if ( !s_1_ni.contains( new BasicBinaryDomainCombination( "b", "l" ) ) ) { + return false; + } + if ( !s_1_ni.contains( new BasicBinaryDomainCombination( "c", "i" ) ) ) { + return false; + } + if ( !s_1_ni.contains( new BasicBinaryDomainCombination( "c", "l" ) ) ) { + return false; + } + if ( !s_1_ni.contains( new BasicBinaryDomainCombination( "l", "i" ) ) ) { + return false; + } + if ( !s_1_ni.contains( new BasicBinaryDomainCombination( "i", "f" ) ) ) { + return false; + } + if ( !s_1_ni.contains( new BasicBinaryDomainCombination( "m", "n" ) ) ) { + return false; + } + if ( !s_1_ni.contains( new BasicBinaryDomainCombination( "j", "k" ) ) ) { + return false; + } + if ( s_1_ni.contains( new BasicBinaryDomainCombination( "a", "b" ) ) ) { + return false; + } + if ( !s_1_i.contains( new BasicBinaryDomainCombination( "a", "i" ) ) ) { + return false; + } + if ( !s_1_i.contains( new BasicBinaryDomainCombination( "a", "l" ) ) ) { + return false; + } + if ( !s_1_i.contains( new BasicBinaryDomainCombination( "b", "i" ) ) ) { + return false; + } + if ( !s_1_i.contains( new BasicBinaryDomainCombination( "b", "l" ) ) ) { + return false; + } + if ( !s_1_i.contains( new BasicBinaryDomainCombination( "c", "i" ) ) ) { + return false; + } + if ( !s_1_i.contains( new BasicBinaryDomainCombination( "c", "l" ) ) ) { + return false; + } + if ( !s_1_i.contains( new BasicBinaryDomainCombination( "l", "i" ) ) ) { + return false; + } + if ( !s_1_i.contains( new BasicBinaryDomainCombination( "i", "f" ) ) ) { + return false; + } + if ( !s_1_i.contains( new BasicBinaryDomainCombination( "m", "n" ) ) ) { + return false; + } + if ( !s_1_i.contains( new BasicBinaryDomainCombination( "j", "k" ) ) ) { + return false; + } + if ( s_1_i.contains( new BasicBinaryDomainCombination( "a", "b" ) ) ) { + return false; + } + if ( !isEqual( calc_ni.calculateSharedBinaryDomainCombinationBasedGenomeSimilarityScore(), + 1.0 - ( 25.0 - 5.0 ) / 25.0 ) ) { + return false; + } + if ( !isEqual( calc_i.calculateSharedBinaryDomainCombinationBasedGenomeSimilarityScore(), + 1.0 - ( 22.0 - 3.0 ) / 22.0 ) ) { + return false; + } + if ( !isEqual( calc_ni.calculateSharedDomainsBasedGenomeSimilarityScore(), 1.0 - ( 14.0 - 4.0 ) / 14.0 ) ) { + return false; + } + if ( !isEqual( calc_i.calculateSharedDomainsBasedGenomeSimilarityScore(), 1.0 - ( 14.0 - 4.0 ) / 14.0 ) ) { + return false; + } + final Domain u = new BasicDomain( "u", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain v = new BasicDomain( "v", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain w = new 
BasicDomain( "w", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain x = new BasicDomain( "x", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain y = new BasicDomain( "y", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain z = new BasicDomain( "z", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Protein a_0 = new BasicProtein( "0", "a" ); + final Protein a_1 = new BasicProtein( "1", "a" ); + final Protein a_2 = new BasicProtein( "2", "a" ); + final Protein b_0 = new BasicProtein( "0", "b" ); + final Protein b_1 = new BasicProtein( "1", "b" ); + a_0.addProteinDomain( u ); + a_0.addProteinDomain( v ); + a_0.addProteinDomain( w ); + a_1.addProteinDomain( w ); + a_1.addProteinDomain( x ); + a_2.addProteinDomain( y ); + a_2.addProteinDomain( z ); + b_0.addProteinDomain( u ); + b_0.addProteinDomain( w ); + b_1.addProteinDomain( y ); + b_1.addProteinDomain( z ); + final List protein_list_a = new ArrayList(); + protein_list_a.add( a_0 ); + protein_list_a.add( a_1 ); + protein_list_a.add( a_2 ); + final List protein_list_b = new ArrayList(); + protein_list_b.add( b_0 ); + protein_list_b.add( b_1 ); + final GenomeWideCombinableDomains ca = BasicGenomeWideCombinableDomains + .createInstance( protein_list_a, false, new BasicSpecies( "a" ) ); + final GenomeWideCombinableDomains cb = BasicGenomeWideCombinableDomains + .createInstance( protein_list_b, true, new BasicSpecies( "b" ) ); + final DomainArchitectureBasedGenomeSimilarityCalculator calc_u = new DomainArchitectureBasedGenomeSimilarityCalculator( ca, + cb ); + calc_u.setAllowDomainsToBeIgnored( true ); + if ( calc_u.getAllDomains().size() != 6 ) { + return false; + } + if ( calc_u.getDomainsSpecificToGenome0().size() != 2 ) { + return false; + } + if ( calc_u.getDomainsSpecificToGenome1().size() != 0 ) { + return false; + } + if ( !calc_u.getDomainsSpecificToGenome0().contains( v.getDomainId() ) ) { + return false; + } + if ( !calc_u.getDomainsSpecificToGenome0().contains( x.getDomainId() ) ) { + return false; + } + if ( calc_u.getSharedDomains().size() != 4 ) { + return false; + } + if ( !calc_u.getSharedDomains().contains( u.getDomainId() ) ) { + return false; + } + if ( !calc_u.getSharedDomains().contains( w.getDomainId() ) ) { + return false; + } + if ( !calc_u.getSharedDomains().contains( y.getDomainId() ) ) { + return false; + } + if ( !calc_u.getSharedDomains().contains( z.getDomainId() ) ) { + return false; + } + if ( calc_u.getAllDomains().size() != 6 ) { + return false; + } + if ( !calc_u.getAllDomains().contains( u.getDomainId() ) ) { + return false; + } + if ( !calc_u.getAllDomains().contains( w.getDomainId() ) ) { + return false; + } + if ( !calc_u.getAllDomains().contains( y.getDomainId() ) ) { + return false; + } + if ( !calc_u.getAllDomains().contains( z.getDomainId() ) ) { + return false; + } + if ( !calc_u.getAllDomains().contains( v.getDomainId() ) ) { + return false; + } + if ( !calc_u.getAllDomains().contains( x.getDomainId() ) ) { + return false; + } + if ( calc_u.getBinaryDomainCombinationsSpecificToGenome0().size() != 3 ) { + return false; + } + if ( calc_u.getBinaryDomainCombinationsSpecificToGenome1().size() != 0 ) { + return false; + } + if ( calc_u.getSharedBinaryDomainCombinations().size() != 2 ) { + return false; + } + if ( calc_u.getAllBinaryDomainCombinations().size() != 5 ) { + return false; + } + if ( !calc_u.getBinaryDomainCombinationsSpecificToGenome0() + .contains( new BasicBinaryDomainCombination( "v", "u" ) ) ) { + return false; + } + if ( 
!calc_u.getBinaryDomainCombinationsSpecificToGenome0() + .contains( new BasicBinaryDomainCombination( "w", "v" ) ) ) { + return false; + } + if ( !calc_u.getBinaryDomainCombinationsSpecificToGenome0() + .contains( new BasicBinaryDomainCombination( "w", "x" ) ) ) { + return false; + } + if ( !calc_u.getSharedBinaryDomainCombinations().contains( new BasicBinaryDomainCombination( "w", "u" ) ) ) { + return false; + } + if ( !calc_u.getSharedBinaryDomainCombinations().contains( new BasicBinaryDomainCombination( "z", "y" ) ) ) { + return false; + } + if ( !calc_u.getAllBinaryDomainCombinations().contains( new BasicBinaryDomainCombination( "v", "u" ) ) ) { + return false; + } + if ( !calc_u.getAllBinaryDomainCombinations().contains( new BasicBinaryDomainCombination( "w", "v" ) ) ) { + return false; + } + if ( !calc_u.getAllBinaryDomainCombinations().contains( new BasicBinaryDomainCombination( "w", "x" ) ) ) { + return false; + } + if ( !calc_u.getAllBinaryDomainCombinations().contains( new BasicBinaryDomainCombination( "w", "u" ) ) ) { + return false; + } + if ( !calc_u.getAllBinaryDomainCombinations().contains( new BasicBinaryDomainCombination( "z", "y" ) ) ) { + return false; + } + calc_u.setAllowDomainsToBeIgnored( true ); + calc_u.addDomainIdToIgnore( u.getDomainId() ); + calc_u.addDomainIdToIgnore( new DomainId( "other" ) ); + calc_u.addDomainIdToIgnore( new DomainId( "other_too" ) ); + if ( calc_u.getAllDomains().size() != 5 ) { + return false; + } + if ( calc_u.getDomainsSpecificToGenome0().size() != 2 ) { + return false; + } + if ( calc_u.getDomainsSpecificToGenome1().size() != 0 ) { + return false; + } + if ( !calc_u.getDomainsSpecificToGenome0().contains( v.getDomainId() ) ) { + return false; + } + if ( !calc_u.getDomainsSpecificToGenome0().contains( x.getDomainId() ) ) { + return false; + } + if ( calc_u.getSharedDomains().size() != 3 ) { + return false; + } + if ( calc_u.getSharedDomains().contains( u.getDomainId() ) ) { + return false; + } + if ( !calc_u.getSharedDomains().contains( w.getDomainId() ) ) { + return false; + } + if ( !calc_u.getSharedDomains().contains( y.getDomainId() ) ) { + return false; + } + if ( !calc_u.getSharedDomains().contains( z.getDomainId() ) ) { + return false; + } + if ( calc_u.getAllDomains().size() != 5 ) { + return false; + } + if ( calc_u.getAllDomains().contains( u.getDomainId() ) ) { + return false; + } + if ( !calc_u.getAllDomains().contains( w.getDomainId() ) ) { + return false; + } + if ( !calc_u.getAllDomains().contains( y.getDomainId() ) ) { + return false; + } + if ( !calc_u.getAllDomains().contains( z.getDomainId() ) ) { + return false; + } + if ( !calc_u.getAllDomains().contains( v.getDomainId() ) ) { + return false; + } + if ( !calc_u.getAllDomains().contains( x.getDomainId() ) ) { + return false; + } + if ( calc_u.getBinaryDomainCombinationsSpecificToGenome0().size() != 2 ) { + return false; + } + if ( calc_u.getBinaryDomainCombinationsSpecificToGenome1().size() != 0 ) { + return false; + } + if ( calc_u.getSharedBinaryDomainCombinations().size() != 1 ) { + return false; + } + if ( calc_u.getAllBinaryDomainCombinations().size() != 3 ) { + return false; + } + if ( calc_u.getBinaryDomainCombinationsSpecificToGenome0() + .contains( new BasicBinaryDomainCombination( "v", "u" ) ) ) { + return false; + } + if ( !calc_u.getBinaryDomainCombinationsSpecificToGenome0() + .contains( new BasicBinaryDomainCombination( "w", "v" ) ) ) { + return false; + } + if ( !calc_u.getBinaryDomainCombinationsSpecificToGenome0() + .contains( new 
BasicBinaryDomainCombination( "w", "x" ) ) ) { + return false; + } + if ( calc_u.getSharedBinaryDomainCombinations().contains( new BasicBinaryDomainCombination( "w", "u" ) ) ) { + return false; + } + if ( !calc_u.getSharedBinaryDomainCombinations().contains( new BasicBinaryDomainCombination( "z", "y" ) ) ) { + return false; + } + if ( calc_u.getAllBinaryDomainCombinations().contains( new BasicBinaryDomainCombination( "v", "u" ) ) ) { + return false; + } + if ( !calc_u.getAllBinaryDomainCombinations().contains( new BasicBinaryDomainCombination( "w", "v" ) ) ) { + return false; + } + if ( !calc_u.getAllBinaryDomainCombinations().contains( new BasicBinaryDomainCombination( "w", "x" ) ) ) { + return false; + } + if ( calc_u.getAllBinaryDomainCombinations().contains( new BasicBinaryDomainCombination( "w", "u" ) ) ) { + return false; + } + if ( !calc_u.getAllBinaryDomainCombinations().contains( new BasicBinaryDomainCombination( "z", "y" ) ) ) { + return false; + } + calc_u.setAllowDomainsToBeIgnored( false ); + if ( calc_u.getAllDomains().size() != 6 ) { + return false; + } + //------------ + calc_u.setAllowDomainsToBeIgnored( true ); + calc_u.deleteAllDomainIdsToIgnore(); + calc_u.addDomainIdToIgnore( new DomainId( "v" ) ); + calc_u.addDomainIdToIgnore( new DomainId( "w" ) ); + calc_u.addDomainIdToIgnore( new DomainId( "other" ) ); + calc_u.addDomainIdToIgnore( new DomainId( "other_too" ) ); + if ( calc_u.getAllDomains().size() != 4 ) { + return false; + } + if ( calc_u.getDomainsSpecificToGenome0().size() != 1 ) { + return false; + } + if ( calc_u.getDomainsSpecificToGenome1().size() != 0 ) { + return false; + } + if ( calc_u.getDomainsSpecificToGenome0().contains( v.getDomainId() ) ) { + return false; + } + if ( !calc_u.getDomainsSpecificToGenome0().contains( x.getDomainId() ) ) { + return false; + } + if ( calc_u.getSharedDomains().size() != 3 ) { + return false; + } + if ( !calc_u.getSharedDomains().contains( u.getDomainId() ) ) { + return false; + } + if ( calc_u.getSharedDomains().contains( w.getDomainId() ) ) { + return false; + } + if ( !calc_u.getSharedDomains().contains( y.getDomainId() ) ) { + return false; + } + if ( !calc_u.getSharedDomains().contains( z.getDomainId() ) ) { + return false; + } + if ( calc_u.getAllDomains().size() != 4 ) { + return false; + } + if ( !calc_u.getAllDomains().contains( u.getDomainId() ) ) { + return false; + } + if ( calc_u.getAllDomains().contains( w.getDomainId() ) ) { + return false; + } + if ( !calc_u.getAllDomains().contains( y.getDomainId() ) ) { + return false; + } + if ( !calc_u.getAllDomains().contains( z.getDomainId() ) ) { + return false; + } + if ( calc_u.getAllDomains().contains( v.getDomainId() ) ) { + return false; + } + if ( !calc_u.getAllDomains().contains( x.getDomainId() ) ) { + return false; + } + if ( calc_u.getBinaryDomainCombinationsSpecificToGenome0().size() != 0 ) { + return false; + } + if ( calc_u.getBinaryDomainCombinationsSpecificToGenome1().size() != 0 ) { + return false; + } + if ( calc_u.getSharedBinaryDomainCombinations().size() != 1 ) { + return false; + } + if ( calc_u.getAllBinaryDomainCombinations().size() != 1 ) { + return false; + } + if ( !calc_u.getSharedBinaryDomainCombinations().contains( new BasicBinaryDomainCombination( "y", "z" ) ) ) { + return false; + } + if ( !calc_u.getAllBinaryDomainCombinations().contains( new BasicBinaryDomainCombination( "z", "y" ) ) ) { + return false; + } + if ( !isEqual( calc_u.calculateSharedBinaryDomainCombinationBasedGenomeSimilarityScore(), + 1.0 - ( 1.0 - 1.0 ) / 1.0 ) ) { + 
return false; + } + if ( !isEqual( calc_u.calculateSharedDomainsBasedGenomeSimilarityScore(), 1.0 - ( 4.0 - 3.0 ) / 4.0 ) ) { + return false; + } + calc_u.setAllowDomainsToBeIgnored( false ); + if ( !isEqual( calc_u.calculateSharedBinaryDomainCombinationBasedGenomeSimilarityScore(), + 1.0 - ( 5.0 - 2.0 ) / 5.0 ) ) { + return false; + } + if ( !isEqual( calc_u.calculateSharedDomainsBasedGenomeSimilarityScore(), 1.0 - ( 6.0 - 4.0 ) / 6.0 ) ) { + return false; + } + calc_u.setAllowDomainsToBeIgnored( true ); + if ( !isEqual( calc_u.calculateSharedBinaryDomainCombinationBasedGenomeSimilarityScore(), + 1.0 - ( 1.0 - 1.0 ) / 1.0 ) ) { + return false; + } + if ( !isEqual( calc_u.calculateSharedDomainsBasedGenomeSimilarityScore(), 1.0 - ( 4.0 - 3.0 ) / 4.0 ) ) { + return false; + } + calc_u.deleteAllDomainIdsToIgnore(); + if ( !isEqual( calc_u.calculateSharedBinaryDomainCombinationBasedGenomeSimilarityScore(), + 1.0 - ( 5.0 - 2.0 ) / 5.0 ) ) { + return false; + } + if ( !isEqual( calc_u.calculateSharedDomainsBasedGenomeSimilarityScore(), 1.0 - ( 6.0 - 4.0 ) / 6.0 ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testDomainCombinationCounting( final File test_dir ) { + try { + final HmmPfamOutputParser parser = new HmmPfamOutputParser( new File( test_dir + + ForesterUtil.getFileSeparator() + "hmmpfam_output2" ), "human", "ls" ); + parser.setEValueMaximum( 0.2 ); + parser.setIgnoreDufs( true ); + parser.setReturnType( HmmPfamOutputParser.ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ); + final List domain_collections = parser.parse(); + final BasicGenomeWideCombinableDomains cdcc = BasicGenomeWideCombinableDomains + .createInstance( domain_collections, false, new BasicSpecies( "human" ) ); + CombinableDomains cd = cdcc.get( new DomainId( "A" ) ); + if ( cd.getKeyDomainCount() != 9 ) { + return false; + } + if ( cd.getKeyDomainProteinsCount() != 7 ) { + return false; + } + if ( cd.getNumberOfCombinableDomains() != 11 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "A" ).getDomainId() ) != 2 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "B" ).getDomainId() ) != 6 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "C" ).getDomainId() ) != 4 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "D" ).getDomainId() ) != 3 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "E" ).getDomainId() ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "U" ).getDomainId() ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "V" ).getDomainId() ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "W" ).getDomainId() ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "X" ).getDomainId() ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "Y" ).getDomainId() ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "Z" ).getDomainId() ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "NN" ).getDomainId() ) != 0 ) { + return false; + } + if ( 
cd.getKeyDomainCount() != 9 ) { + return false; + } + cd = cdcc.get( new DomainId( "B" ) ); + if ( cd.getKeyDomainCount() != 12 ) { + return false; + } + if ( cd.getKeyDomainProteinsCount() != 7 ) { + return false; + } + if ( cd.getNumberOfCombinableDomains() != 11 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "A" ).getDomainId() ) != 6 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "B" ).getDomainId() ) != 4 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "C" ).getDomainId() ) != 4 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "D" ).getDomainId() ) != 3 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "E" ).getDomainId() ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "U" ).getDomainId() ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "V" ).getDomainId() ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "W" ).getDomainId() ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "X" ).getDomainId() ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "Y" ).getDomainId() ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "Z" ).getDomainId() ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "NN" ).getDomainId() ) != 0 ) { + return false; + } + if ( cd.getKeyDomainCount() != 12 ) { + return false; + } + cd = cdcc.get( new DomainId( "C" ) ); + if ( cd.getKeyDomainCount() != 10 ) { + return false; + } + if ( cd.getKeyDomainProteinsCount() != 7 ) { + return false; + } + if ( cd.getNumberOfCombinableDomains() != 11 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "A" ).getDomainId() ) != 4 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "B" ).getDomainId() ) != 4 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "C" ).getDomainId() ) != 2 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "D" ).getDomainId() ) != 3 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "E" ).getDomainId() ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "U" ).getDomainId() ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "V" ).getDomainId() ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "W" ).getDomainId() ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "X" ).getDomainId() ) != 2 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "Y" ).getDomainId() ) != 2 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "Z" ).getDomainId() ) != 2 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "NN" ).getDomainId() ) != 0 ) { + return false; + } + cd = cdcc.get( new DomainId( "D" ) ); + if ( 
cd.getKeyDomainCount() != 15 ) { + return false; + } + if ( cd.getKeyDomainProteinsCount() != 6 ) { + return false; + } + if ( cd.getNumberOfCombinableDomains() != 11 ) { + return false; + } + cd = cdcc.get( new DomainId( "E" ) ); + if ( cd.getNumberOfCombinableDomains() != 10 ) { + return false; + } + if ( cd.getKeyDomainCount() != 1 ) { + return false; + } + if ( cd.getKeyDomainProteinsCount() != 1 ) { + return false; + } + cd = cdcc.get( new DomainId( "U" ) ); + if ( cd.getNumberOfCombinableDomains() != 11 ) { + return false; + } + if ( cd.getKeyDomainCount() != 6 ) { + return false; + } + if ( cd.getKeyDomainProteinsCount() != 3 ) { + return false; + } + cd = cdcc.get( new DomainId( "V" ) ); + if ( cd.getNumberOfCombinableDomains() != 11 ) { + return false; + } + if ( cd.getKeyDomainCount() != 3 ) { + return false; + } + if ( cd.getKeyDomainProteinsCount() != 2 ) { + return false; + } + cd = cdcc.get( new DomainId( "W" ) ); + if ( cd.getNumberOfCombinableDomains() != 10 ) { + return false; + } + if ( cd.getKeyDomainCount() != 2 ) { + return false; + } + if ( cd.getKeyDomainProteinsCount() != 2 ) { + return false; + } + cd = cdcc.get( new DomainId( "X" ) ); + if ( cd.getNumberOfCombinableDomains() != 10 ) { + return false; + } + if ( cd.getKeyDomainCount() != 2 ) { + return false; + } + cd = cdcc.get( new DomainId( "Y" ) ); + if ( cd.getNumberOfCombinableDomains() != 10 ) { + return false; + } + cd = cdcc.get( new DomainId( "Z" ) ); + if ( cd.getNumberOfCombinableDomains() != 10 ) { + return false; + } + cd = cdcc.get( new DomainId( "NN" ) ); + if ( cd.getKeyDomainCount() != 1 ) { + return false; + } + if ( cd.getKeyDomainProteinsCount() != 1 ) { + return false; + } + if ( cd.getNumberOfCombinableDomains() != 0 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "NN" ).getDomainId() ) != 0 ) { + return false; + } + cd = cdcc.get( new DomainId( "MM" ) ); + if ( cd.getNumberOfCombinableDomains() != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "MM" ).getDomainId() ) != 0 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "OO" ).getDomainId() ) != 1 ) { + return false; + } + cd = cdcc.get( new DomainId( "OO" ) ); + if ( cd.getNumberOfCombinableDomains() != 2 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "OO" ).getDomainId() ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "MM" ).getDomainId() ) != 1 ) { + return false; + } + cd = cdcc.get( new DomainId( "QQ" ) ); + if ( cd.getNumberOfCombinableDomains() != 1 ) { + return false; + } + if ( cd.getKeyDomainCount() != 17 ) { + return false; + } + if ( cd.getKeyDomainProteinsCount() != 4 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "QQ" ).getDomainId() ) != 3 ) { + return false; + } + cd = cdcc.get( new DomainId( "PP" ) ); + if ( cd.getNumberOfCombinableDomains() != 0 ) { + return false; + } + if ( cd.getKeyDomainCount() != 2 ) { + return false; + } + if ( cd.getKeyDomainProteinsCount() != 2 ) { + return false; + } + cd = cdcc.get( new DomainId( "singlet" ) ); + if ( cd.getKeyDomainCount() != 1 ) { + return false; + } + if ( cd.getKeyDomainProteinsCount() != 1 ) { + return false; + } + if ( cd.getNumberOfCombinableDomains() != 0 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "singlet" ).getDomainId() ) != 0 ) { + return 
false; + } + cd = cdcc.get( new DomainId( "three" ) ); + if ( cd.getKeyDomainCount() != 3 ) { + return false; + } + if ( cd.getKeyDomainProteinsCount() != 1 ) { + return false; + } + if ( cd.getNumberOfCombinableDomains() != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "three" ) ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "so_far_so_bad" ) ) != 0 ) { + return false; + } + // Ignore combinations with same: + final BasicGenomeWideCombinableDomains cdcc2 = BasicGenomeWideCombinableDomains + .createInstance( domain_collections, + true, + new BasicSpecies( "human" ), + null, + DomainCombinationType.BASIC ); + cd = cdcc2.get( new DomainId( "A" ) ); + if ( cd.getKeyDomainCount() != 9 ) { + return false; + } + if ( cd.getKeyDomainProteinsCount() != 7 ) { + return false; + } + if ( cd.getNumberOfCombinableDomains() != 10 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "A" ).getDomainId() ) != 0 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "B" ).getDomainId() ) != 6 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "C" ).getDomainId() ) != 4 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "D" ).getDomainId() ) != 3 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new SimpleDomain( "E" ).getDomainId() ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "U" ) ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "V" ) ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "W" ) ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "X" ) ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "Y" ) ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "Z" ) ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "NN" ) ) != 0 ) { + return false; + } + cd = cdcc2.get( new DomainId( "B" ) ); + if ( cd.getKeyDomainCount() != 12 ) { + return false; + } + if ( cd.getKeyDomainProteinsCount() != 7 ) { + return false; + } + if ( cd.getNumberOfCombinableDomains() != 10 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "A" ) ) != 6 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "B" ) ) != 0 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "C" ) ) != 4 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "D" ) ) != 3 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "E" ) ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "U" ) ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "V" ) ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "W" ) ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "X" ) ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "Y" ) ) != 1 ) { + return false; + } + if ( 
cd.getNumberOfProteinsExhibitingCombination( new DomainId( "Z" ) ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "NN" ) ) != 0 ) { + return false; + } + cd = cdcc2.get( new DomainId( "C" ) ); + if ( cd.getKeyDomainCount() != 10 ) { + return false; + } + if ( cd.getKeyDomainProteinsCount() != 7 ) { + return false; + } + if ( cd.getNumberOfCombinableDomains() != 10 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "A" ) ) != 4 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "B" ) ) != 4 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "C" ) ) != 0 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "D" ) ) != 3 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "E" ) ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "U" ) ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "V" ) ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "W" ) ) != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "X" ) ) != 2 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "Y" ) ) != 2 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "Z" ) ) != 2 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "NN" ) ) != 0 ) { + return false; + } + cd = cdcc2.get( new DomainId( "D" ) ); + if ( cd.getNumberOfCombinableDomains() != 10 ) { + return false; + } + cd = cdcc2.get( new DomainId( "E" ) ); + if ( cd.getNumberOfCombinableDomains() != 10 ) { + return false; + } + if ( cd.getKeyDomainCount() != 1 ) { + return false; + } + cd = cdcc2.get( new DomainId( "U" ) ); + if ( cd.getNumberOfCombinableDomains() != 10 ) { + return false; + } + cd = cdcc2.get( new DomainId( "V" ) ); + if ( cd.getNumberOfCombinableDomains() != 10 ) { + return false; + } + cd = cdcc2.get( new DomainId( "W" ) ); + if ( cd.getNumberOfCombinableDomains() != 10 ) { + return false; + } + cd = cdcc2.get( new DomainId( "X" ) ); + if ( cd.getNumberOfCombinableDomains() != 10 ) { + return false; + } + cd = cdcc2.get( new DomainId( "Y" ) ); + if ( cd.getNumberOfCombinableDomains() != 10 ) { + return false; + } + cd = cdcc2.get( new DomainId( "Z" ) ); + if ( cd.getNumberOfCombinableDomains() != 10 ) { + return false; + } + cd = cdcc2.get( new DomainId( "NN" ) ); + if ( cd.getNumberOfCombinableDomains() != 0 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "NN" ) ) != 0 ) { + return false; + } + cd = cdcc2.get( new DomainId( "MM" ) ); + if ( cd.getNumberOfCombinableDomains() != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "MM" ) ) != 0 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "OO" ) ) != 1 ) { + return false; + } + cd = cdcc2.get( new DomainId( "OO" ) ); + if ( cd.getNumberOfCombinableDomains() != 1 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "OO" ) ) != 0 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "MM" ) ) != 1 ) { + return false; + } + cd = cdcc2.get( new DomainId( "QQ" ) ); + if ( 
cd.getNumberOfCombinableDomains() != 0 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "QQ" ) ) != 0 ) { + return false; + } + cd = cdcc2.get( new DomainId( "singlet" ) ); + if ( cd.getKeyDomainCount() != 1 ) { + return false; + } + if ( cd.getKeyDomainProteinsCount() != 1 ) { + return false; + } + if ( cd.getNumberOfCombinableDomains() != 0 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "singlet" ) ) != 0 ) { + return false; + } + cd = cdcc2.get( new DomainId( "three" ) ); + if ( cd.getKeyDomainCount() != 3 ) { + return false; + } + if ( cd.getKeyDomainProteinsCount() != 1 ) { + return false; + } + if ( cd.getNumberOfCombinableDomains() != 0 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "three" ) ) != 0 ) { + return false; + } + if ( cd.getNumberOfProteinsExhibitingCombination( new DomainId( "so_far_so_bad" ) ) != 0 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testDomainId() { + try { + final DomainId id1 = new DomainId( "a" ); + final DomainId id2 = new DomainId( "a" ); + final DomainId id3 = new DomainId( "A" ); + final DomainId id4 = new DomainId( "b" ); + if ( !id1.equals( id1 ) ) { + return false; + } + if ( id1.getId().equals( "x" ) ) { + return false; + } + if ( id1.getId().equals( null ) ) { + return false; + } + if ( !id1.equals( id2 ) ) { + return false; + } + if ( id1.equals( id3 ) ) { + return false; + } + if ( id1.hashCode() != id1.hashCode() ) { + return false; + } + if ( id1.hashCode() != id2.hashCode() ) { + return false; + } + if ( id1.hashCode() == id3.hashCode() ) { + return false; + } + if ( id1.compareTo( id1 ) != 0 ) { + return false; + } + if ( id1.compareTo( id2 ) != 0 ) { + return false; + } + if ( id1.compareTo( id3 ) != 0 ) { + return false; + } + if ( id1.compareTo( id4 ) >= 0 ) { + return false; + } + if ( id4.compareTo( id1 ) <= 0 ) { + return false; + } + if ( !id4.getId().equals( "b" ) ) { + return false; + } + final DomainId id5 = new DomainId( " C " ); + if ( !id5.getId().equals( "C" ) ) { + return false; + } + if ( id5.equals( id1 ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testDomainSorting() { + try { + final Domain A = new BasicDomain( "A", ( short ) 1, ( short ) 2, ( short ) 1, ( short ) 1, 0.1, -12 ); + final Domain B = new BasicDomain( "B", ( short ) 1, ( short ) 2, ( short ) 1, ( short ) 1, 0.1, -12 ); + final Domain C = new BasicDomain( "C", ( short ) 1, ( short ) 2, ( short ) 1, ( short ) 1, 0.2, -12 ); + final Domain D = new BasicDomain( "D", ( short ) 1, ( short ) 2, ( short ) 1, ( short ) 1, 0.3, -12 ); + final Domain E = new BasicDomain( "E", ( short ) 1, ( short ) 2, ( short ) 1, ( short ) 1, 0.4, -12 ); + final Domain F = new BasicDomain( "F", ( short ) 1, ( short ) 2, ( short ) 1, ( short ) 1, 0.5, -12 ); + final Domain G = new BasicDomain( "G", ( short ) 1, ( short ) 2, ( short ) 1, ( short ) 1, 0.6, -12 ); + final Domain H1 = new BasicDomain( "H", ( short ) 100, ( short ) 200, ( short ) 1, ( short ) 5, 0.7, -12 ); + final Domain H2 = new BasicDomain( "H", ( short ) 300, ( short ) 400, ( short ) 2, ( short ) 5, 0.7, -12 ); + final Domain H3 = new BasicDomain( "H", ( short ) 500, ( short ) 600, ( short ) 3, ( short ) 5, 0.7, -12 ); + final Domain H4 = new BasicDomain( "H", ( short 
) 700, ( short ) 800, ( short ) 4, ( short ) 5, 0.7, -12 ); + final Domain H5 = new BasicDomain( "H", ( short ) 700, ( short ) 800, ( short ) 5, ( short ) 5, 0.7, -12 ); + final Domain H6 = new BasicDomain( "H", + ( short ) 1199, + ( short ) 1299, + ( short ) 6, + ( short ) 6, + 0.7, + -0.111 ); + final Domain H7 = new BasicDomain( "H7", ( short ) 700, ( short ) 800, ( short ) 5, ( short ) 5, 0.7, -12 ); + final Domain H8 = new BasicDomain( "H7", ( short ) 700, ( short ) 800, ( short ) 5, ( short ) 200, 0.7, -12 ); + final Protein protein = new BasicProtein( "00", "bat" ); + protein.addProteinDomain( H5 ); + protein.addProteinDomain( H2 ); + protein.addProteinDomain( H7 ); + protein.addProteinDomain( H6 ); + protein.addProteinDomain( A ); + protein.addProteinDomain( G ); + protein.addProteinDomain( H4 ); + protein.addProteinDomain( D ); + protein.addProteinDomain( H1 ); + protein.addProteinDomain( C ); + protein.addProteinDomain( E ); + protein.addProteinDomain( F ); + protein.addProteinDomain( B ); + protein.addProteinDomain( H3 ); + protein.addProteinDomain( H7 ); + protein.addProteinDomain( H7 ); + protein.addProteinDomain( H8 ); + final List sorted = SurfacingUtil.sortDomainsWithAscendingConfidenceValues( protein ); + if ( sorted.size() != 17 ) { + return false; + } + if ( !sorted.get( 0 ).getDomainId().getId().equals( "A" ) ) { + return false; + } + if ( sorted.get( 0 ).getNumber() != 1 ) { + return false; + } + if ( !sorted.get( 1 ).getDomainId().getId().equals( "B" ) ) { + return false; + } + if ( sorted.get( 1 ).getNumber() != 1 ) { + return false; + } + if ( !sorted.get( 2 ).getDomainId().getId().equals( "C" ) ) { + return false; + } + if ( sorted.get( 2 ).getNumber() != 1 ) { + return false; + } + if ( !sorted.get( 3 ).getDomainId().getId().equals( "D" ) ) { + return false; + } + if ( sorted.get( 3 ).getNumber() != 1 ) { + return false; + } + if ( !sorted.get( 4 ).getDomainId().getId().equals( "E" ) ) { + return false; + } + if ( sorted.get( 4 ).getNumber() != 1 ) { + return false; + } + if ( !sorted.get( 5 ).getDomainId().getId().equals( "F" ) ) { + return false; + } + if ( sorted.get( 5 ).getNumber() != 1 ) { + return false; + } + if ( !sorted.get( 6 ).getDomainId().getId().equals( "G" ) ) { + return false; + } + if ( sorted.get( 6 ).getNumber() != 1 ) { + return false; + } + if ( !sorted.get( 7 ).getDomainId().getId().equals( "H" ) ) { + return false; + } + if ( sorted.get( 7 ).getNumber() != 5 ) { + return false; + } + if ( !sorted.get( 8 ).getDomainId().getId().equals( "H" ) ) { + return false; + } + if ( sorted.get( 8 ).getNumber() != 2 ) { + return false; + } + if ( !sorted.get( 9 ).getDomainId().getId().equals( "H" ) ) { + return false; + } + if ( sorted.get( 9 ).getNumber() != 6 ) { + return false; + } + if ( !sorted.get( 10 ).getDomainId().getId().equals( "H" ) ) { + return false; + } + if ( sorted.get( 10 ).getNumber() != 4 ) { + return false; + } + if ( !sorted.get( 11 ).getDomainId().getId().equals( "H" ) ) { + return false; + } + if ( sorted.get( 11 ).getNumber() != 1 ) { + return false; + } + if ( sorted.get( 11 ).getTotalCount() != 5 ) { + return false; + } + if ( !sorted.get( 12 ).getDomainId().getId().equals( "H" ) ) { + return false; + } + if ( sorted.get( 12 ).getNumber() != 3 ) { + return false; + } + if ( !sorted.get( 13 ).getDomainId().getId().equals( "H7" ) ) { + return false; + } + if ( sorted.get( 13 ).getNumber() != 5 ) { + return false; + } + if ( !sorted.get( 14 ).getDomainId().getId().equals( "H7" ) ) { + return false; + } + if ( sorted.get( 14 
).getNumber() != 5 ) { + return false; + } + if ( !sorted.get( 15 ).getDomainId().getId().equals( "H7" ) ) { + return false; + } + if ( sorted.get( 15 ).getNumber() != 5 ) { + return false; + } + // To check if sorting is stable [as claimed by Sun for + // Collections.sort( List )] + if ( !sorted.get( 16 ).getDomainId().getId().equals( "H7" ) ) { + return false; + } + if ( sorted.get( 16 ).getNumber() != 5 ) { + return false; + } + if ( sorted.get( 16 ).getTotalCount() != 200 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testEngulfingOverlapRemoval() { + try { + final Domain d0 = new BasicDomain( "d0", 0, 8, ( short ) 1, ( short ) 1, 0.1, 1 ); + final Domain d1 = new BasicDomain( "d1", 0, 1, ( short ) 1, ( short ) 1, 0.1, 1 ); + final Domain d2 = new BasicDomain( "d2", 0, 2, ( short ) 1, ( short ) 1, 0.1, 1 ); + final Domain d3 = new BasicDomain( "d3", 7, 8, ( short ) 1, ( short ) 1, 0.1, 1 ); + final Domain d4 = new BasicDomain( "d4", 7, 9, ( short ) 1, ( short ) 1, 0.1, 1 ); + final Domain d5 = new BasicDomain( "d4", 0, 9, ( short ) 1, ( short ) 1, 0.1, 1 ); + final Domain d6 = new BasicDomain( "d4", 4, 5, ( short ) 1, ( short ) 1, 0.1, 1 ); + final List covered = new ArrayList(); + covered.add( true ); // 0 + covered.add( false ); // 1 + covered.add( true ); // 2 + covered.add( false ); // 3 + covered.add( true ); // 4 + covered.add( true ); // 5 + covered.add( false ); // 6 + covered.add( true ); // 7 + covered.add( true ); // 8 + if ( SurfacingUtil.isEngulfed( d0, covered ) ) { + return false; + } + if ( SurfacingUtil.isEngulfed( d1, covered ) ) { + return false; + } + if ( SurfacingUtil.isEngulfed( d2, covered ) ) { + return false; + } + if ( !SurfacingUtil.isEngulfed( d3, covered ) ) { + return false; + } + if ( SurfacingUtil.isEngulfed( d4, covered ) ) { + return false; + } + if ( SurfacingUtil.isEngulfed( d5, covered ) ) { + return false; + } + if ( !SurfacingUtil.isEngulfed( d6, covered ) ) { + return false; + } + final Domain a = new BasicDomain( "a", 0, 10, ( short ) 1, ( short ) 1, 0.1, 1 ); + final Domain b = new BasicDomain( "b", 8, 20, ( short ) 1, ( short ) 1, 0.2, 1 ); + final Domain c = new BasicDomain( "c", 15, 16, ( short ) 1, ( short ) 1, 0.3, 1 ); + final Protein abc = new BasicProtein( "abc", "nemve" ); + abc.addProteinDomain( a ); + abc.addProteinDomain( b ); + abc.addProteinDomain( c ); + final Protein abc_r1 = SurfacingUtil.removeOverlappingDomains( 3, false, abc ); + final Protein abc_r2 = SurfacingUtil.removeOverlappingDomains( 3, true, abc ); + if ( abc.getNumberOfProteinDomains() != 3 ) { + return false; + } + if ( abc_r1.getNumberOfProteinDomains() != 3 ) { + return false; + } + if ( abc_r2.getNumberOfProteinDomains() != 2 ) { + return false; + } + if ( !abc_r2.getProteinDomain( 0 ).getDomainId().getId().equals( "a" ) ) { + return false; + } + if ( !abc_r2.getProteinDomain( 1 ).getDomainId().getId().equals( "b" ) ) { + return false; + } + final Domain d = new BasicDomain( "d", 0, 10, ( short ) 1, ( short ) 1, 0.1, 1 ); + final Domain e = new BasicDomain( "e", 8, 20, ( short ) 1, ( short ) 1, 0.3, 1 ); + final Domain f = new BasicDomain( "f", 15, 16, ( short ) 1, ( short ) 1, 0.2, 1 ); + final Protein def = new BasicProtein( "def", "nemve" ); + def.addProteinDomain( d ); + def.addProteinDomain( e ); + def.addProteinDomain( f ); + final Protein def_r1 = SurfacingUtil.removeOverlappingDomains( 5, false, def ); + final Protein def_r2 = 
SurfacingUtil.removeOverlappingDomains( 5, true, def ); + if ( def.getNumberOfProteinDomains() != 3 ) { + return false; + } + if ( def_r1.getNumberOfProteinDomains() != 3 ) { + return false; + } + if ( def_r2.getNumberOfProteinDomains() != 3 ) { + return false; + } + if ( !def_r2.getProteinDomain( 0 ).getDomainId().getId().equals( "d" ) ) { + return false; + } + if ( !def_r2.getProteinDomain( 1 ).getDomainId().getId().equals( "f" ) ) { + return false; + } + if ( !def_r2.getProteinDomain( 2 ).getDomainId().getId().equals( "e" ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testGenomeWideCombinableDomains() { + try { + final Domain a = new BasicDomain( "a", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain b = new BasicDomain( "b", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain c = new BasicDomain( "c", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain d = new BasicDomain( "d", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain e = new BasicDomain( "e", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain f = new BasicDomain( "f", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain g = new BasicDomain( "g", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain h = new BasicDomain( "h", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain x = new BasicDomain( "x", 23, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Protein eel_0 = new BasicProtein( "0", "eel" ); + final Protein eel_1 = new BasicProtein( "1", "eel" ); + final Protein eel_2 = new BasicProtein( "2", "eel" ); + final Protein eel_3 = new BasicProtein( "3", "eel" ); + final Protein eel_4 = new BasicProtein( "4", "eel" ); + final Protein eel_5 = new BasicProtein( "5", "eel" ); + final Protein eel_6 = new BasicProtein( "6", "eel" ); + eel_1.addProteinDomain( a ); + eel_2.addProteinDomain( a ); + eel_2.addProteinDomain( b ); + eel_3.addProteinDomain( a ); + eel_3.addProteinDomain( a ); + eel_3.addProteinDomain( b ); + eel_4.addProteinDomain( a ); + eel_4.addProteinDomain( b ); + eel_4.addProteinDomain( c ); + eel_4.addProteinDomain( d ); + eel_4.addProteinDomain( e ); + eel_5.addProteinDomain( e ); + eel_5.addProteinDomain( e ); + eel_5.addProteinDomain( f ); + eel_5.addProteinDomain( f ); + eel_5.addProteinDomain( f ); + eel_5.addProteinDomain( f ); + eel_6.addProteinDomain( g ); + eel_6.addProteinDomain( h ); + final List protein_list_eel = new ArrayList(); + protein_list_eel.add( eel_0 ); + protein_list_eel.add( eel_1 ); + protein_list_eel.add( eel_2 ); + protein_list_eel.add( eel_3 ); + protein_list_eel.add( eel_4 ); + protein_list_eel.add( eel_5 ); + protein_list_eel.add( eel_6 ); + final BasicGenomeWideCombinableDomains eel_not_ignore = BasicGenomeWideCombinableDomains + .createInstance( protein_list_eel, false, new BasicSpecies( "eel" ) ); + final BasicGenomeWideCombinableDomains eel_ignore = BasicGenomeWideCombinableDomains + .createInstance( protein_list_eel, true, new BasicSpecies( "eel" ) ); + if ( !eel_not_ignore.contains( new DomainId( "a" ) ) ) { + return false; + } + if ( !eel_not_ignore.contains( new DomainId( "b" ) ) ) { + return false; + } + if ( !eel_not_ignore.contains( new DomainId( "c" ) ) ) { + return false; + } + if ( !eel_not_ignore.contains( new DomainId( "d" ) ) ) { + return false; + } + if ( !eel_not_ignore.contains( new DomainId( "e" ) ) ) { + return false; + } + if ( !eel_not_ignore.contains( new DomainId( "f" ) ) 
) { + return false; + } + if ( !eel_not_ignore.contains( new DomainId( "g" ) ) ) { + return false; + } + if ( !eel_not_ignore.contains( new DomainId( "h" ) ) ) { + return false; + } + if ( eel_not_ignore.contains( new DomainId( "x" ) ) ) { + return false; + } + if ( !eel_ignore.contains( new DomainId( "a" ) ) ) { + return false; + } + if ( !eel_ignore.contains( new DomainId( "b" ) ) ) { + return false; + } + if ( !eel_ignore.contains( new DomainId( "c" ) ) ) { + return false; + } + if ( !eel_ignore.contains( new DomainId( "d" ) ) ) { + return false; + } + if ( !eel_ignore.contains( new DomainId( "e" ) ) ) { + return false; + } + if ( !eel_ignore.contains( new DomainId( "f" ) ) ) { + return false; + } + if ( !eel_ignore.contains( new DomainId( "g" ) ) ) { + return false; + } + if ( !eel_ignore.contains( new DomainId( "h" ) ) ) { + return false; + } + if ( eel_ignore.contains( new DomainId( "x" ) ) ) { + return false; + } + if ( eel_not_ignore.getSize() != 8 ) { + return false; + } + if ( eel_ignore.getSize() != 8 ) { + return false; + } + if ( eel_not_ignore.get( new DomainId( "a" ) ).getCombinableDomainsIds().size() != 5 ) { + return false; + } + if ( eel_not_ignore.get( new DomainId( "b" ) ).getCombinableDomainsIds().size() != 4 ) { + return false; + } + if ( eel_not_ignore.get( new DomainId( "c" ) ).getCombinableDomainsIds().size() != 4 ) { + return false; + } + if ( eel_not_ignore.get( new DomainId( "d" ) ).getCombinableDomainsIds().size() != 4 ) { + return false; + } + if ( eel_not_ignore.get( new DomainId( "e" ) ).getCombinableDomainsIds().size() != 6 ) { + return false; + } + if ( eel_not_ignore.get( new DomainId( "f" ) ).getCombinableDomainsIds().size() != 2 ) { + return false; + } + if ( eel_not_ignore.get( new DomainId( "g" ) ).getCombinableDomainsIds().size() != 1 ) { + return false; + } + if ( eel_not_ignore.get( new DomainId( "h" ) ).getCombinableDomainsIds().size() != 1 ) { + return false; + } + if ( eel_ignore.get( new DomainId( "a" ) ).getCombinableDomainsIds().size() != 4 ) { + return false; + } + if ( eel_ignore.get( new DomainId( "b" ) ).getCombinableDomainsIds().size() != 4 ) { + return false; + } + if ( eel_ignore.get( new DomainId( "c" ) ).getCombinableDomainsIds().size() != 4 ) { + return false; + } + if ( eel_ignore.get( new DomainId( "d" ) ).getCombinableDomainsIds().size() != 4 ) { + return false; + } + if ( eel_ignore.get( new DomainId( "e" ) ).getCombinableDomainsIds().size() != 5 ) { + return false; + } + if ( eel_ignore.get( new DomainId( "f" ) ).getCombinableDomainsIds().size() != 1 ) { + return false; + } + if ( eel_ignore.get( new DomainId( "g" ) ).getCombinableDomainsIds().size() != 1 ) { + return false; + } + if ( eel_ignore.get( new DomainId( "h" ) ).getCombinableDomainsIds().size() != 1 ) { + return false; + } + if ( eel_not_ignore.getAllDomainIds().size() != 8 ) { + return false; + } + if ( !eel_not_ignore.getAllDomainIds().contains( a.getDomainId() ) ) { + return false; + } + if ( !eel_not_ignore.getAllDomainIds().contains( b.getDomainId() ) ) { + return false; + } + if ( !eel_not_ignore.getAllDomainIds().contains( c.getDomainId() ) ) { + return false; + } + if ( !eel_not_ignore.getAllDomainIds().contains( d.getDomainId() ) ) { + return false; + } + if ( !eel_not_ignore.getAllDomainIds().contains( e.getDomainId() ) ) { + return false; + } + if ( !eel_not_ignore.getAllDomainIds().contains( f.getDomainId() ) ) { + return false; + } + if ( !eel_not_ignore.getAllDomainIds().contains( g.getDomainId() ) ) { + return false; + } + if ( 
!eel_not_ignore.getAllDomainIds().contains( h.getDomainId() ) ) { + return false; + } + if ( eel_not_ignore.getAllDomainIds().contains( x.getDomainId() ) ) { + return false; + } + if ( eel_ignore.getAllDomainIds().size() != 8 ) { + return false; + } + if ( !eel_ignore.getAllDomainIds().contains( a.getDomainId() ) ) { + return false; + } + if ( !eel_ignore.getAllDomainIds().contains( b.getDomainId() ) ) { + return false; + } + if ( !eel_ignore.getAllDomainIds().contains( c.getDomainId() ) ) { + return false; + } + if ( !eel_ignore.getAllDomainIds().contains( d.getDomainId() ) ) { + return false; + } + if ( !eel_ignore.getAllDomainIds().contains( e.getDomainId() ) ) { + return false; + } + if ( !eel_ignore.getAllDomainIds().contains( f.getDomainId() ) ) { + return false; + } + if ( !eel_ignore.getAllDomainIds().contains( g.getDomainId() ) ) { + return false; + } + if ( !eel_ignore.getAllDomainIds().contains( h.getDomainId() ) ) { + return false; + } + if ( eel_ignore.getAllDomainIds().contains( x.getDomainId() ) ) { + return false; + } + final SortedSet bc0 = eel_not_ignore.toBinaryDomainCombinations(); + if ( bc0.size() != 15 ) { + return false; + } + if ( !bc0.contains( new BasicBinaryDomainCombination( "a", "a" ) ) ) { + return false; + } + if ( !bc0.contains( new BasicBinaryDomainCombination( "a", "b" ) ) ) { + return false; + } + if ( !bc0.contains( new BasicBinaryDomainCombination( "b", "a" ) ) ) { + return false; + } + if ( !bc0.contains( new BasicBinaryDomainCombination( "a", "c" ) ) ) { + return false; + } + if ( !bc0.contains( new BasicBinaryDomainCombination( "a", "d" ) ) ) { + return false; + } + if ( !bc0.contains( new BasicBinaryDomainCombination( "a", "e" ) ) ) { + return false; + } + if ( !bc0.contains( new BasicBinaryDomainCombination( "b", "c" ) ) ) { + return false; + } + if ( !bc0.contains( new BasicBinaryDomainCombination( "b", "d" ) ) ) { + return false; + } + if ( !bc0.contains( new BasicBinaryDomainCombination( "b", "e" ) ) ) { + return false; + } + if ( !bc0.contains( new BasicBinaryDomainCombination( "c", "d" ) ) ) { + return false; + } + if ( !bc0.contains( new BasicBinaryDomainCombination( "c", "e" ) ) ) { + return false; + } + if ( !bc0.contains( new BasicBinaryDomainCombination( "d", "e" ) ) ) { + return false; + } + if ( !bc0.contains( new BasicBinaryDomainCombination( "e", "f" ) ) ) { + return false; + } + if ( !bc0.contains( new BasicBinaryDomainCombination( "e", "e" ) ) ) { + return false; + } + if ( !bc0.contains( new BasicBinaryDomainCombination( "f", "f" ) ) ) { + return false; + } + if ( !bc0.contains( new BasicBinaryDomainCombination( "g", "h" ) ) ) { + return false; + } + if ( bc0.contains( new BasicBinaryDomainCombination( "f", "a" ) ) ) { + return false; + } + if ( bc0.contains( new BasicBinaryDomainCombination( "f", "b" ) ) ) { + return false; + } + if ( bc0.contains( new BasicBinaryDomainCombination( "a", "h" ) ) ) { + return false; + } + if ( bc0.contains( new BasicBinaryDomainCombination( "a", "g" ) ) ) { + return false; + } + final SortedSet bc1 = eel_ignore.toBinaryDomainCombinations(); + if ( bc1.size() != 12 ) { + return false; + } + if ( bc1.contains( new BasicBinaryDomainCombination( "a", "a" ) ) ) { + return false; + } + if ( !bc1.contains( new BasicBinaryDomainCombination( "a", "b" ) ) ) { + return false; + } + if ( !bc1.contains( new BasicBinaryDomainCombination( "b", "a" ) ) ) { + return false; + } + if ( !bc1.contains( new BasicBinaryDomainCombination( "a", "c" ) ) ) { + return false; + } + if ( !bc1.contains( new 
BasicBinaryDomainCombination( "a", "d" ) ) ) { + return false; + } + if ( !bc1.contains( new BasicBinaryDomainCombination( "a", "e" ) ) ) { + return false; + } + if ( !bc1.contains( new BasicBinaryDomainCombination( "b", "c" ) ) ) { + return false; + } + if ( !bc1.contains( new BasicBinaryDomainCombination( "b", "d" ) ) ) { + return false; + } + if ( !bc1.contains( new BasicBinaryDomainCombination( "b", "e" ) ) ) { + return false; + } + if ( !bc1.contains( new BasicBinaryDomainCombination( "c", "d" ) ) ) { + return false; + } + if ( !bc1.contains( new BasicBinaryDomainCombination( "c", "e" ) ) ) { + return false; + } + if ( !bc1.contains( new BasicBinaryDomainCombination( "d", "e" ) ) ) { + return false; + } + if ( !bc1.contains( new BasicBinaryDomainCombination( "e", "f" ) ) ) { + return false; + } + if ( !bc1.contains( new BasicBinaryDomainCombination( "g", "h" ) ) ) { + return false; + } + if ( bc1.contains( new BasicBinaryDomainCombination( "e", "e" ) ) ) { + return false; + } + if ( bc1.contains( new BasicBinaryDomainCombination( "f", "f" ) ) ) { + return false; + } + if ( bc1.contains( new BasicBinaryDomainCombination( "f", "a" ) ) ) { + return false; + } + if ( bc1.contains( new BasicBinaryDomainCombination( "f", "b" ) ) ) { + return false; + } + if ( bc1.contains( new BasicBinaryDomainCombination( "a", "g" ) ) ) { + return false; + } + if ( bc1.contains( new BasicBinaryDomainCombination( "b", "g" ) ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testHmmPfamOutputParser( final File test_dir ) { + try { + final HmmPfamOutputParser parser = new HmmPfamOutputParser( new File( test_dir + + ForesterUtil.getFileSeparator() + "hmmpfam_output" ), "human", "ls" ); + parser.setEValueMaximum( 0.2 ); + parser.setIgnoreDufs( true ); + parser.setReturnType( HmmPfamOutputParser.ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ); + List domain_collections = null; + domain_collections = parser.parse(); + if ( parser.getDomainsEncountered() != 4 ) { + return false; + } + if ( parser.getDomainsIgnoredDueToDuf() != 0 ) { + return false; + } + if ( parser.getDomainsIgnoredDueToEval() != 1 ) { + return false; + } + if ( parser.getDomainsIgnoredDueToOverlap() != 0 ) { + return false; + } + if ( parser.getDomainsStored() != 3 ) { + return false; + } + if ( domain_collections.size() != 1 ) { + return false; + } + final Protein pdc = ( Protein ) domain_collections.get( 0 ); + if ( !pdc.getProteinId().equals( new ProteinId( "ENSP00000285681" ) ) ) { + return false; + } + if ( !pdc.getSpecies().getSpeciesId().equals( "human" ) ) { + return false; + } + if ( pdc.getNumberOfProteinDomains() != 3 ) { + return false; + } + if ( !pdc.getAccession().equals( "acc_ENSP00000285681" ) ) { + return false; + } + if ( !pdc + .getDescription() + .equals( "pep:known chromosome:NCBI36:21:16024215:16174248:1 gene:ENSG00000155313 transcript:ENST00000285681" ) ) { + return false; + } + final List uba = pdc.getProteinDomains( new DomainId( "UBA" ) ); + final List uim = pdc.getProteinDomains( new DomainId( "UIM" ) ); + final List uch = pdc.getProteinDomains( new DomainId( "UCH" ) ); + if ( uba.size() != 1 ) { + return false; + } + if ( uim.size() != 2 ) { + return false; + } + if ( uch.size() != 0 ) { + return false; + } + final BasicDomain uim_domain = ( BasicDomain ) uim.get( 1 ); + if ( !uim_domain.getDomainId().equals( new DomainId( "UIM" ) ) ) { + return false; + } + if ( uim_domain.getTotalCount() != 2 
) { + return false; + } + final BasicDomain uba_domain = ( BasicDomain ) uba.get( 0 ); + if ( !uba_domain.getDomainId().equals( new DomainId( "UBA" ) ) ) { + return false; + } + if ( uba_domain.getNumber() != 1 ) { + return false; + } + if ( uba_domain.getTotalCount() != 1 ) { + return false; + } + if ( uba_domain.getFrom() != 16 ) { + return false; + } + if ( uba_domain.getTo() != 57 ) { + return false; + } + if ( !Test.isEqual( uba_domain.getPerSequenceEvalue(), 0.00084 ) ) { + return false; + } + if ( !Test.isEqual( uba_domain.getPerSequenceScore(), 23.2 ) ) { + return false; + } + final HmmPfamOutputParser parser2 = new HmmPfamOutputParser( new File( test_dir + + ForesterUtil.getFileSeparator() + "hmmpfam_output_short" ), "human", "ls" ); + parser2.setEValueMaximum( 0.2 ); + parser2.setIgnoreDufs( true ); + parser2.setReturnType( HmmPfamOutputParser.ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ); + List domain_collections2 = null; + domain_collections2 = parser2.parse(); + if ( parser2.getDomainsEncountered() != 4 ) { + return false; + } + if ( parser.getDomainsIgnoredDueToDuf() != 0 ) { + return false; + } + if ( parser.getDomainsIgnoredDueToEval() != 1 ) { + return false; + } + if ( parser.getDomainsIgnoredDueToOverlap() != 0 ) { + return false; + } + if ( parser2.getDomainsStored() != 3 ) { + return false; + } + if ( domain_collections2.size() != 1 ) { + return false; + } + final Protein pdc2 = domain_collections2.get( 0 ); + if ( !pdc2.getProteinId().getId().equals( "ENSP00000285681" ) ) { + return false; + } + if ( !pdc2.getSpecies().getSpeciesId().equals( "human" ) ) { + return false; + } + if ( !pdc2.getName().equals( "" ) ) { + return false; + } + if ( !pdc2.getAccession().equals( "223" ) ) { + return false; + } + if ( !pdc2 + .getDescription() + .equals( "pep:known chromosome:NCBI36:21:16024215:16174248:1 gene:ENSG00000155313 transcript:ENST00000285681" ) ) { + return false; + } + if ( pdc2.getNumberOfProteinDomains() != 3 ) { + return false; + } + final List uba2 = pdc2.getProteinDomains( new DomainId( "UBA" ) ); + final List uim2 = pdc2.getProteinDomains( new DomainId( "UIM" ) ); + final List uch2 = pdc2.getProteinDomains( new DomainId( "UCH" ) ); + if ( uba2.size() != 1 ) { + return false; + } + if ( uim2.size() != 2 ) { + return false; + } + if ( uch2.size() != 0 ) { + return false; + } + final BasicDomain uim_domain2 = ( BasicDomain ) uim2.get( 1 ); + if ( !uim_domain2.getDomainId().getId().equals( "UIM" ) ) { + return false; + } + if ( uim_domain2.getTotalCount() != 2 ) { + return false; + } + final BasicDomain uba_domain2 = ( BasicDomain ) uba2.get( 0 ); + if ( !uba_domain2.getDomainId().getId().equals( "UBA" ) ) { + return false; + } + if ( uba_domain2.getNumber() != 1 ) { + return false; + } + if ( uba_domain2.getTotalCount() != 1 ) { + return false; + } + if ( uba_domain2.getFrom() != 16 ) { + return false; + } + if ( uba_domain2.getTo() != 57 ) { + return false; + } + if ( !Test.isEqual( uba_domain2.getPerSequenceEvalue(), 0.00084 ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testHmmPfamOutputParserWithFilter( final File test_dir ) { + try { + HmmPfamOutputParser parser = new HmmPfamOutputParser( new File( test_dir + ForesterUtil.getFileSeparator() + + "hmmpfam_output3" ), "human", "ls" ); + parser.setEValueMaximum( 0.2 ); + parser.setIgnoreDufs( true ); + parser.setReturnType( 
HmmPfamOutputParser.ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ); + List proteins = null; + proteins = parser.parse(); + if ( parser.getProteinsIgnoredDueToFilter() != 0 ) { + return false; + } + if ( proteins.size() != 4 ) { + return false; + } + // + Set filter = new TreeSet(); + filter.add( new DomainId( "beauty" ) ); + filter.add( new DomainId( "strange" ) ); + parser = new HmmPfamOutputParser( new File( test_dir + ForesterUtil.getFileSeparator() + "hmmpfam_output3" ), + "human", + "ls", + filter, + HmmPfamOutputParser.FilterType.NEGATIVE_PROTEIN ); + parser.setEValueMaximum( 0.2 ); + parser.setIgnoreDufs( true ); + parser.setReturnType( HmmPfamOutputParser.ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ); + proteins = null; + proteins = parser.parse(); + if ( parser.getProteinsIgnoredDueToFilter() != 0 ) { + return false; + } + if ( proteins.size() != 4 ) { + return false; + } + // + filter = new TreeSet(); + filter.add( new DomainId( "beauty" ) ); + filter.add( new DomainId( "strange" ) ); + parser = new HmmPfamOutputParser( new File( test_dir + ForesterUtil.getFileSeparator() + "hmmpfam_output3" ), + "human", + "ls", + filter, + HmmPfamOutputParser.FilterType.POSITIVE_PROTEIN ); + parser.setEValueMaximum( 0.2 ); + parser.setIgnoreDufs( true ); + parser.setReturnType( HmmPfamOutputParser.ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ); + proteins = null; + proteins = parser.parse(); + if ( parser.getProteinsIgnoredDueToFilter() != 4 ) { + return false; + } + if ( proteins.size() != 0 ) { + return false; + } + // + filter = new TreeSet(); + filter.add( new DomainId( "UIM" ) ); + filter.add( new DomainId( "A" ) ); + filter.add( new DomainId( "C" ) ); + parser = new HmmPfamOutputParser( new File( test_dir + ForesterUtil.getFileSeparator() + "hmmpfam_output3" ), + "human", + "ls", + filter, + HmmPfamOutputParser.FilterType.POSITIVE_PROTEIN ); + parser.setEValueMaximum( 0.2 ); + parser.setIgnoreDufs( true ); + parser.setReturnType( HmmPfamOutputParser.ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ); + proteins = null; + proteins = parser.parse(); + if ( parser.getProteinsIgnoredDueToFilter() != 0 ) { + return false; + } + if ( proteins.size() != 4 ) { + return false; + } + // + filter = new TreeSet(); + filter.add( new DomainId( "UIM" ) ); + filter.add( new DomainId( "A" ) ); + filter.add( new DomainId( "C" ) ); + filter.add( new DomainId( "X" ) ); + parser = new HmmPfamOutputParser( new File( test_dir + ForesterUtil.getFileSeparator() + "hmmpfam_output3" ), + "human", + "ls", + filter, + HmmPfamOutputParser.FilterType.NEGATIVE_DOMAIN ); + parser.setEValueMaximum( 0.2 ); + parser.setIgnoreDufs( true ); + parser.setReturnType( HmmPfamOutputParser.ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ); + proteins = null; + proteins = parser.parse(); + if ( parser.getDomainsIgnoredDueToNegativeDomainFilter() != 7 ) { + return false; + } + if ( proteins.size() != 3 ) { + return false; + } + // + filter = new TreeSet(); + filter.add( new DomainId( "UIM" ) ); + filter.add( new DomainId( "A" ) ); + filter.add( new DomainId( "C" ) ); + parser = new HmmPfamOutputParser( new File( test_dir + ForesterUtil.getFileSeparator() + "hmmpfam_output3" ), + "human", + "ls", + filter, + HmmPfamOutputParser.FilterType.NEGATIVE_PROTEIN ); + parser.setEValueMaximum( 0.2 ); + parser.setIgnoreDufs( true ); + parser.setReturnType( HmmPfamOutputParser.ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ); + proteins = null; + proteins = 
parser.parse(); + if ( parser.getProteinsIgnoredDueToFilter() != 4 ) { + return false; + } + if ( proteins.size() != 0 ) { + return false; + } + // + filter = new TreeSet(); + filter.add( new DomainId( "UIM" ) ); + parser = new HmmPfamOutputParser( new File( test_dir + ForesterUtil.getFileSeparator() + "hmmpfam_output3" ), + "human", + "ls", + filter, + HmmPfamOutputParser.FilterType.NEGATIVE_PROTEIN ); + parser.setEValueMaximum( 0.2 ); + parser.setIgnoreDufs( true ); + parser.setReturnType( HmmPfamOutputParser.ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ); + proteins = null; + proteins = parser.parse(); + if ( parser.getProteinsIgnoredDueToFilter() != 1 ) { + return false; + } + if ( parser.getProteinsStored() != 3 ) { + return false; + } + if ( proteins.size() != 3 ) { + return false; + } + // + filter = new TreeSet(); + filter.add( new DomainId( "UIM" ) ); + parser = new HmmPfamOutputParser( new File( test_dir + ForesterUtil.getFileSeparator() + "hmmpfam_output3" ), + "human", + "ls", + filter, + HmmPfamOutputParser.FilterType.POSITIVE_PROTEIN ); + parser.setEValueMaximum( 0.2 ); + parser.setIgnoreDufs( true ); + parser.setReturnType( HmmPfamOutputParser.ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ); + proteins = null; + proteins = parser.parse(); + if ( parser.getProteinsIgnoredDueToFilter() != 3 ) { + return false; + } + if ( parser.getProteinsStored() != 1 ) { + return false; + } + if ( proteins.size() != 1 ) { + return false; + } + // + filter = new TreeSet(); + filter.add( new DomainId( "A" ) ); + filter.add( new DomainId( "C" ) ); + parser = new HmmPfamOutputParser( new File( test_dir + ForesterUtil.getFileSeparator() + "hmmpfam_output3" ), + "human", + "ls", + filter, + HmmPfamOutputParser.FilterType.POSITIVE_PROTEIN ); + parser.setEValueMaximum( 0.2 ); + parser.setIgnoreDufs( true ); + parser.setReturnType( HmmPfamOutputParser.ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN ); + proteins = null; + proteins = parser.parse(); + if ( parser.getDomainsEncountered() != 11 ) { + return false; + } + if ( parser.getProteinsEncountered() != 4 ) { + return false; + } + if ( parser.getProteinsIgnoredDueToFilter() != 1 ) { + return false; + } + if ( parser.getProteinsStored() != 3 ) { + return false; + } + if ( proteins.size() != 3 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testOverlapRemoval() { + try { + final Domain d0 = new BasicDomain( "d0", ( short ) 2, ( short ) 5, ( short ) 1, ( short ) 1, 0.1, 1 ); + final Domain d1 = new BasicDomain( "d1", ( short ) 7, ( short ) 10, ( short ) 1, ( short ) 1, 0.1, 1 ); + final Domain d2 = new BasicDomain( "d2", ( short ) 0, ( short ) 20, ( short ) 1, ( short ) 1, 0.1, 1 ); + final Domain d3 = new BasicDomain( "d3", ( short ) 9, ( short ) 10, ( short ) 1, ( short ) 1, 0.1, 1 ); + final Domain d4 = new BasicDomain( "d4", ( short ) 7, ( short ) 8, ( short ) 1, ( short ) 1, 0.1, 1 ); + final List covered = new ArrayList(); + covered.add( true ); // 0 + covered.add( false ); // 1 + covered.add( true ); // 2 + covered.add( false ); // 3 + covered.add( true ); // 4 + covered.add( true ); // 5 + covered.add( false ); // 6 + covered.add( true ); // 7 + covered.add( true ); // 8 + if ( SurfacingUtil.calculateOverlap( d0, covered ) != 3 ) { + return false; + } + if ( SurfacingUtil.calculateOverlap( d1, covered ) != 2 ) { + return false; + } + if ( SurfacingUtil.calculateOverlap( d2, covered ) != 6 ) { + 
return false; + } + if ( SurfacingUtil.calculateOverlap( d3, covered ) != 0 ) { + return false; + } + if ( SurfacingUtil.calculateOverlap( d4, covered ) != 2 ) { + return false; + } + final Domain a = new BasicDomain( "a", ( short ) 2, ( short ) 5, ( short ) 1, ( short ) 1, 0.01, 1 ); + final Domain b = new BasicDomain( "b", ( short ) 2, ( short ) 10, ( short ) 1, ( short ) 1, 0.1, 1 ); + final Protein ab = new BasicProtein( "ab", "varanus" ); + ab.addProteinDomain( a ); + ab.addProteinDomain( b ); + final Protein ab_s0 = SurfacingUtil.removeOverlappingDomains( 3, false, ab ); + if ( ab.getNumberOfProteinDomains() != 2 ) { + return false; + } + if ( ab_s0.getNumberOfProteinDomains() != 1 ) { + return false; + } + if ( !ab_s0.getProteinDomain( 0 ).getDomainId().getId().equals( "a" ) ) { + return false; + } + final Protein ab_s1 = SurfacingUtil.removeOverlappingDomains( 4, false, ab ); + if ( ab.getNumberOfProteinDomains() != 2 ) { + return false; + } + if ( ab_s1.getNumberOfProteinDomains() != 2 ) { + return false; + } + final Domain c = new BasicDomain( "c", ( short ) 20000, ( short ) 20500, ( short ) 1, ( short ) 1, 10, 1 ); + final Domain d = new BasicDomain( "d", + ( short ) 10000, + ( short ) 10500, + ( short ) 1, + ( short ) 1, + 0.0000001, + 1 ); + final Domain e = new BasicDomain( "e", ( short ) 5000, ( short ) 5500, ( short ) 1, ( short ) 1, 0.0001, 1 ); + final Protein cde = new BasicProtein( "cde", "varanus" ); + cde.addProteinDomain( c ); + cde.addProteinDomain( d ); + cde.addProteinDomain( e ); + final Protein cde_s0 = SurfacingUtil.removeOverlappingDomains( 0, false, cde ); + if ( cde.getNumberOfProteinDomains() != 3 ) { + return false; + } + if ( cde_s0.getNumberOfProteinDomains() != 3 ) { + return false; + } + final Domain f = new BasicDomain( "f", ( short ) 10, ( short ) 20, ( short ) 1, ( short ) 1, 10, 1 ); + final Domain g = new BasicDomain( "g", ( short ) 10, ( short ) 20, ( short ) 1, ( short ) 1, 0.01, 1 ); + final Domain h = new BasicDomain( "h", ( short ) 10, ( short ) 20, ( short ) 1, ( short ) 1, 0.0001, 1 ); + final Domain i = new BasicDomain( "i", ( short ) 10, ( short ) 20, ( short ) 1, ( short ) 1, 0.5, 1 ); + final Domain i2 = new BasicDomain( "i", ( short ) 5, ( short ) 30, ( short ) 1, ( short ) 1, 0.5, 10 ); + final Protein fghi = new BasicProtein( "fghi", "varanus" ); + fghi.addProteinDomain( f ); + fghi.addProteinDomain( g ); + fghi.addProteinDomain( h ); + fghi.addProteinDomain( i ); + fghi.addProteinDomain( i ); + fghi.addProteinDomain( i ); + fghi.addProteinDomain( i2 ); + final Protein fghi_s0 = SurfacingUtil.removeOverlappingDomains( 10, false, fghi ); + if ( fghi.getNumberOfProteinDomains() != 7 ) { + return false; + } + if ( fghi_s0.getNumberOfProteinDomains() != 1 ) { + return false; + } + if ( !fghi_s0.getProteinDomain( 0 ).getDomainId().getId().equals( "h" ) ) { + return false; + } + final Protein fghi_s1 = SurfacingUtil.removeOverlappingDomains( 11, false, fghi ); + if ( fghi.getNumberOfProteinDomains() != 7 ) { + return false; + } + if ( fghi_s1.getNumberOfProteinDomains() != 7 ) { + return false; + } + final Domain j = new BasicDomain( "j", ( short ) 10, ( short ) 20, ( short ) 1, ( short ) 1, 10, 1 ); + final Domain k = new BasicDomain( "k", ( short ) 10, ( short ) 20, ( short ) 1, ( short ) 1, 0.01, 1 ); + final Domain l = new BasicDomain( "l", ( short ) 10, ( short ) 20, ( short ) 1, ( short ) 1, 0.0001, 1 ); + final Domain m = new BasicDomain( "m", ( short ) 10, ( short ) 20, ( short ) 1, ( short ) 4, 0.5, 1 ); + final Domain m0 = new 
BasicDomain( "m", ( short ) 10, ( short ) 20, ( short ) 2, ( short ) 4, 0.5, 1 ); + final Domain m1 = new BasicDomain( "m", ( short ) 10, ( short ) 20, ( short ) 3, ( short ) 4, 0.5, 1 ); + final Domain m2 = new BasicDomain( "m", ( short ) 5, ( short ) 30, ( short ) 4, ( short ) 4, 0.5, 10 ); + final Protein jklm = new BasicProtein( "jklm", "varanus" ); + jklm.addProteinDomain( j ); + jklm.addProteinDomain( k ); + jklm.addProteinDomain( l ); + jklm.addProteinDomain( m ); + jklm.addProteinDomain( m0 ); + jklm.addProteinDomain( m1 ); + jklm.addProteinDomain( m2 ); + final Protein jklm_s0 = SurfacingUtil.removeOverlappingDomains( 10, false, jklm ); + if ( jklm.getNumberOfProteinDomains() != 7 ) { + return false; + } + if ( jklm_s0.getNumberOfProteinDomains() != 1 ) { + return false; + } + if ( !jklm_s0.getProteinDomain( 0 ).getDomainId().getId().equals( "l" ) ) { + return false; + } + final Protein jklm_s1 = SurfacingUtil.removeOverlappingDomains( 11, false, jklm ); + if ( jklm.getNumberOfProteinDomains() != 7 ) { + return false; + } + if ( jklm_s1.getNumberOfProteinDomains() != 7 ) { + return false; + } + final Domain only = new BasicDomain( "only", ( short ) 5, ( short ) 30, ( short ) 4, ( short ) 4, 0.5, 10 ); + final Protein od = new BasicProtein( "od", "varanus" ); + od.addProteinDomain( only ); + final Protein od_s0 = SurfacingUtil.removeOverlappingDomains( 0, false, od ); + if ( od.getNumberOfProteinDomains() != 1 ) { + return false; + } + if ( od_s0.getNumberOfProteinDomains() != 1 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testParsimony() { + try { + final BinaryStates X = BinaryStates.PRESENT; + final BinaryStates O = BinaryStates.ABSENT; + final GainLossStates G = GainLossStates.GAIN; + final GainLossStates L = GainLossStates.LOSS; + final GainLossStates A = GainLossStates.UNCHANGED_ABSENT; + final GainLossStates P = GainLossStates.UNCHANGED_PRESENT; + final Domain a = new BasicDomain( "A", 1, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain b = new BasicDomain( "B", 2, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain c = new BasicDomain( "C", 3, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain d = new BasicDomain( "D", 1, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain e = new BasicDomain( "E", 2, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain f = new BasicDomain( "F", 3, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain g = new BasicDomain( "G", 1, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain h = new BasicDomain( "H", 2, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain i = new BasicDomain( "I", 3, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain j = new BasicDomain( "J", 1, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain l = new BasicDomain( "L", 3, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain m = new BasicDomain( "M", 1, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain n = new BasicDomain( "N", 2, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain o = new BasicDomain( "O", 3, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain p = new BasicDomain( "P", 1, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain q = new BasicDomain( "Q", 2, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain r = new BasicDomain( "R", 3, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + // 1 a-a a-b a-c e-f-g-h l-m + // 2 a-b a-c e-f-g-i n-o + // 3 a-b a-d e-f-g-j 
p-q + // 4 a-b a-d p-r + // 1 a-a a-b a-c e-f e-g e-h f-g f-h g-h l-m + // 2 a-b a-c e-f e-g e-i f-g f-i g-i n-o + // 3 a-b a-d e-f e-g e-j f-g f-j g-j p-q + // 4 a-b a-d p-r + // 1 a b c e f g h l m + // 2 a b c e f g i n o + // 3 a b d e f g j p q + // 4 a b d p r + final Protein aa1 = new BasicProtein( "aa1", "one" ); + aa1.addProteinDomain( a ); + aa1.addProteinDomain( a ); + final Protein ab1 = new BasicProtein( "ab1", "one" ); + ab1.addProteinDomain( a ); + ab1.addProteinDomain( b ); + final Protein ac1 = new BasicProtein( "ac1", "one" ); + ac1.addProteinDomain( a ); + ac1.addProteinDomain( c ); + final Protein efgh1 = new BasicProtein( "efgh1", "one" ); + efgh1.addProteinDomain( e ); + efgh1.addProteinDomain( f ); + efgh1.addProteinDomain( g ); + efgh1.addProteinDomain( h ); + final Protein lm1 = new BasicProtein( "lm1", "one" ); + lm1.addProteinDomain( l ); + lm1.addProteinDomain( m ); + final Protein ab2 = new BasicProtein( "ab2", "two" ); + ab2.addProteinDomain( a ); + ab2.addProteinDomain( b ); + final Protein ac2 = new BasicProtein( "ac2", "two" ); + ac2.addProteinDomain( a ); + ac2.addProteinDomain( c ); + final Protein efgi2 = new BasicProtein( "efgi2", "two" ); + efgi2.addProteinDomain( e ); + efgi2.addProteinDomain( f ); + efgi2.addProteinDomain( g ); + efgi2.addProteinDomain( i ); + final Protein no2 = new BasicProtein( "no2", "two" ); + no2.addProteinDomain( n ); + no2.addProteinDomain( o ); + final Protein ab3 = new BasicProtein( "ab3", "three" ); + ab3.addProteinDomain( a ); + ab3.addProteinDomain( b ); + final Protein ad3 = new BasicProtein( "ad3", "three" ); + ad3.addProteinDomain( a ); + ad3.addProteinDomain( d ); + final Protein efgj3 = new BasicProtein( "efgj3", "three" ); + efgj3.addProteinDomain( e ); + efgj3.addProteinDomain( f ); + efgj3.addProteinDomain( g ); + efgj3.addProteinDomain( j ); + final Protein pq3 = new BasicProtein( "pq3", "three" ); + pq3.addProteinDomain( p ); + pq3.addProteinDomain( q ); + final Protein ab4 = new BasicProtein( "ab4", "four" ); + ab4.addProteinDomain( a ); + ab4.addProteinDomain( b ); + final Protein ad4 = new BasicProtein( "ad4", "four" ); + ad4.addProteinDomain( a ); + ad4.addProteinDomain( d ); + final Protein pr4 = new BasicProtein( "pr4", "four" ); + pr4.addProteinDomain( p ); + pr4.addProteinDomain( r ); + final List one_list = new ArrayList(); + one_list.add( aa1 ); + one_list.add( ab1 ); + one_list.add( ac1 ); + one_list.add( efgh1 ); + one_list.add( lm1 ); + final List two_list = new ArrayList(); + two_list.add( ab2 ); + two_list.add( ac2 ); + two_list.add( efgi2 ); + two_list.add( no2 ); + final List three_list = new ArrayList(); + three_list.add( ab3 ); + three_list.add( ad3 ); + three_list.add( efgj3 ); + three_list.add( pq3 ); + final List four_list = new ArrayList(); + four_list.add( ab4 ); + four_list.add( ad4 ); + four_list.add( pr4 ); + final GenomeWideCombinableDomains one = BasicGenomeWideCombinableDomains + .createInstance( one_list, false, new BasicSpecies( "one" ) ); + final GenomeWideCombinableDomains two = BasicGenomeWideCombinableDomains + .createInstance( two_list, false, new BasicSpecies( "two" ) ); + final GenomeWideCombinableDomains three = BasicGenomeWideCombinableDomains + .createInstance( three_list, false, new BasicSpecies( "three" ) ); + final GenomeWideCombinableDomains four = BasicGenomeWideCombinableDomains + .createInstance( four_list, false, new BasicSpecies( "four" ) ); + final List gwcd_list = new ArrayList(); + gwcd_list.add( one ); + gwcd_list.add( two ); + gwcd_list.add( three ); + 
gwcd_list.add( four ); + final CharacterStateMatrix matrix_d = DomainParsimonyCalculator + .createMatrixOfDomainPresenceOrAbsence( gwcd_list ); + final CharacterStateMatrix matrix_bc = DomainParsimonyCalculator + .createMatrixOfBinaryDomainCombinationPresenceOrAbsence( gwcd_list ); + // 1 a b c e f g h l m + // 2 a b c e f g i n o + // 3 a b d e f g j p q + // 4 a b d p r + if ( matrix_d.getState( 0, 0 ) != X ) { + return false; + } + if ( matrix_d.getState( 0, 1 ) != X ) { + return false; + } + if ( matrix_d.getState( 0, 2 ) != X ) { + return false; + } + if ( matrix_d.getState( 0, 3 ) != O ) { + return false; + } + if ( matrix_d.getState( 0, 4 ) != X ) { + return false; + } + if ( matrix_d.getState( 0, 5 ) != X ) { + return false; + } + if ( matrix_d.getState( 0, 6 ) != X ) { + return false; + } + if ( matrix_d.getState( 0, 7 ) != X ) { + return false; + } + if ( matrix_d.getState( 0, 8 ) != O ) { + return false; + } + // 1 a-a a-b a-c e-f e-g e-h f-g f-h g-h l-m + // 2 a-b a-c e-f e-g e-i f-g f-i g-i n-o + // 3 a-b a-d e-f e-g e-j f-g f-j g-j p-q + // 4 a-b a-d p-r + if ( matrix_bc.getState( 0, 0 ) != X ) { + return false; + } + if ( matrix_bc.getState( 0, 1 ) != X ) { + return false; + } + if ( matrix_bc.getState( 0, 2 ) != X ) { + return false; + } + if ( matrix_bc.getState( 0, 3 ) != O ) { + return false; + } + if ( matrix_bc.getState( 0, 4 ) != X ) { + return false; + } + if ( matrix_bc.getState( 1, 0 ) != O ) { + return false; + } + if ( matrix_bc.getState( 1, 1 ) != X ) { + return false; + } + if ( matrix_bc.getState( 1, 2 ) != X ) { + return false; + } + if ( matrix_bc.getState( 1, 3 ) != O ) { + return false; + } + if ( matrix_bc.getState( 1, 4 ) != X ) { + return false; + } + if ( matrix_bc.getState( 2, 0 ) != O ) { + return false; + } + if ( matrix_bc.getState( 2, 1 ) != X ) { + return false; + } + if ( matrix_bc.getState( 2, 2 ) != O ) { + return false; + } + if ( matrix_bc.getState( 2, 3 ) != X ) { + return false; + } + if ( matrix_bc.getState( 2, 4 ) != X ) { + return false; + } + final PhylogenyFactory factory0 = ParserBasedPhylogenyFactory.getInstance(); + final String p0_str = "((one,two)1-2,(three,four)3-4)root"; + final Phylogeny p0 = factory0.create( p0_str, new NHXParser() )[ 0 ]; + final DomainParsimonyCalculator dp0 = DomainParsimonyCalculator.createInstance( p0, gwcd_list ); + dp0.executeDolloParsimonyOnDomainPresence(); + final CharacterStateMatrix gl_matrix_d = dp0.getGainLossMatrix(); + final CharacterStateMatrix is_matrix_d = dp0.getInternalStatesMatrix(); + dp0.executeDolloParsimonyOnBinaryDomainCombintionPresence(); + final CharacterStateMatrix gl_matrix_bc = dp0.getGainLossMatrix(); + final CharacterStateMatrix is_matrix_bc = dp0.getInternalStatesMatrix(); + if ( is_matrix_d.getState( "root", "A" ) != X ) { + return false; + } + if ( is_matrix_d.getState( "root", "B" ) != X ) { + return false; + } + if ( is_matrix_d.getState( "root", "C" ) != O ) { + return false; + } + if ( is_matrix_d.getState( "root", "D" ) != O ) { + return false; + } + if ( is_matrix_d.getState( "root", "E" ) != X ) { + return false; + } + if ( is_matrix_bc.getState( "root", "A=A" ) != O ) { + return false; + } + if ( is_matrix_bc.getState( "root", "A=B" ) != X ) { + return false; + } + if ( is_matrix_bc.getState( "root", "A=C" ) != O ) { + return false; + } + if ( is_matrix_bc.getState( "root", "A=D" ) != O ) { + return false; + } + if ( is_matrix_bc.getState( "root", "G=H" ) != O ) { + return false; + } + if ( is_matrix_bc.getState( "1-2", "G=H" ) != O ) { + return false; + } + if ( 
is_matrix_bc.getState( "root", "E=F" ) != X ) { + return false; + } + if ( gl_matrix_bc.getState( "root", "E=F" ) != P ) { + return false; + } + if ( gl_matrix_bc.getState( "root", "A=A" ) != A ) { + return false; + } + if ( gl_matrix_bc.getState( "one", "A=A" ) != G ) { + return false; + } + if ( gl_matrix_bc.getState( "root", "A=B" ) != P ) { + return false; + } + if ( gl_matrix_bc.getState( "3-4", "A=D" ) != G ) { + return false; + } + if ( gl_matrix_bc.getState( "four", "E=F" ) != L ) { + return false; + } + if ( gl_matrix_d.getState( "3-4", "P" ) != G ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testParsimonyOnSecondaryFeatures() { + try { + final BinaryStates X = BinaryStates.PRESENT; + final BinaryStates O = BinaryStates.ABSENT; + final GainLossStates G = GainLossStates.GAIN; + final GainLossStates L = GainLossStates.LOSS; + final GainLossStates A = GainLossStates.UNCHANGED_ABSENT; + final GainLossStates P = GainLossStates.UNCHANGED_PRESENT; + final Domain a = new BasicDomain( "A", 1, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain b = new BasicDomain( "B", 2, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain c = new BasicDomain( "C", 3, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain d = new BasicDomain( "D", 1, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain e = new BasicDomain( "E", 2, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain f = new BasicDomain( "F", 3, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain g = new BasicDomain( "G", 1, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain h = new BasicDomain( "H", 2, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain i = new BasicDomain( "I", 3, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain j = new BasicDomain( "J", 1, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain l = new BasicDomain( "L", 3, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain m = new BasicDomain( "M", 1, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain n = new BasicDomain( "N", 2, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain o = new BasicDomain( "O", 3, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain p = new BasicDomain( "P", 1, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain q = new BasicDomain( "Q", 2, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + final Domain r = new BasicDomain( "R", 3, 25, ( short ) 1, ( short ) 4, 0.1, -12 ); + // 1 a-a a-b a-c e-f-g-h l-m + // 2 a-b a-c e-f-g-i n-o + // 3 a-b a-d e-f-g-j p-q + // 4 a-b a-d p-r + // 1 a-a a-b a-c e-f e-g e-h f-g f-h g-h l-m + // 2 a-b a-c e-f e-g e-i f-g f-i g-i n-o + // 3 a-b a-d e-f e-g e-j f-g f-j g-j p-q + // 4 a-b a-d p-r + // 1 a b c e f g h l m + // 2 a b c e f g i n o + // 3 a b d e f g j p q + // 4 a b d p r + final Protein aa1 = new BasicProtein( "aa1", "one" ); + aa1.addProteinDomain( a ); + aa1.addProteinDomain( a ); + final Protein ab1 = new BasicProtein( "ab1", "one" ); + ab1.addProteinDomain( a ); + ab1.addProteinDomain( b ); + final Protein ac1 = new BasicProtein( "ac1", "one" ); + ac1.addProteinDomain( a ); + ac1.addProteinDomain( c ); + final Protein efgh1 = new BasicProtein( "efgh1", "one" ); + efgh1.addProteinDomain( e ); + efgh1.addProteinDomain( f ); + efgh1.addProteinDomain( g ); + efgh1.addProteinDomain( h ); + final Protein lm1 = new BasicProtein( "lm1", "one" ); + lm1.addProteinDomain( l ); + lm1.addProteinDomain( m ); + final Protein ab2 = new 
BasicProtein( "ab2", "two" ); + ab2.addProteinDomain( a ); + ab2.addProteinDomain( b ); + final Protein ac2 = new BasicProtein( "ac2", "two" ); + ac2.addProteinDomain( a ); + ac2.addProteinDomain( c ); + final Protein efgi2 = new BasicProtein( "efgi2", "two" ); + efgi2.addProteinDomain( e ); + efgi2.addProteinDomain( f ); + efgi2.addProteinDomain( g ); + efgi2.addProteinDomain( i ); + final Protein no2 = new BasicProtein( "no2", "two" ); + no2.addProteinDomain( n ); + no2.addProteinDomain( o ); + final Protein ab3 = new BasicProtein( "ab3", "three" ); + ab3.addProteinDomain( a ); + ab3.addProteinDomain( b ); + final Protein ad3 = new BasicProtein( "ad3", "three" ); + ad3.addProteinDomain( a ); + ad3.addProteinDomain( d ); + final Protein efgj3 = new BasicProtein( "efgj3", "three" ); + efgj3.addProteinDomain( e ); + efgj3.addProteinDomain( f ); + efgj3.addProteinDomain( g ); + efgj3.addProteinDomain( j ); + final Protein pq3 = new BasicProtein( "pq3", "three" ); + pq3.addProteinDomain( p ); + pq3.addProteinDomain( q ); + final Protein ab4 = new BasicProtein( "ab4", "four" ); + ab4.addProteinDomain( a ); + ab4.addProteinDomain( b ); + final Protein ad4 = new BasicProtein( "ad4", "four" ); + ad4.addProteinDomain( a ); + ad4.addProteinDomain( d ); + final Protein pr4 = new BasicProtein( "pr4", "four" ); + pr4.addProteinDomain( p ); + pr4.addProteinDomain( r ); + final List one_list = new ArrayList(); + one_list.add( aa1 ); + one_list.add( ab1 ); + one_list.add( ac1 ); + one_list.add( efgh1 ); + one_list.add( lm1 ); + final List two_list = new ArrayList(); + two_list.add( ab2 ); + two_list.add( ac2 ); + two_list.add( efgi2 ); + two_list.add( no2 ); + final List three_list = new ArrayList(); + three_list.add( ab3 ); + three_list.add( ad3 ); + three_list.add( efgj3 ); + three_list.add( pq3 ); + final List four_list = new ArrayList(); + four_list.add( ab4 ); + four_list.add( ad4 ); + four_list.add( pr4 ); + final GenomeWideCombinableDomains one = BasicGenomeWideCombinableDomains + .createInstance( one_list, false, new BasicSpecies( "one" ) ); + final GenomeWideCombinableDomains two = BasicGenomeWideCombinableDomains + .createInstance( two_list, false, new BasicSpecies( "two" ) ); + final GenomeWideCombinableDomains three = BasicGenomeWideCombinableDomains + .createInstance( three_list, false, new BasicSpecies( "three" ) ); + final GenomeWideCombinableDomains four = BasicGenomeWideCombinableDomains + .createInstance( four_list, false, new BasicSpecies( "four" ) ); + final List gwcd_list = new ArrayList(); + gwcd_list.add( one ); + gwcd_list.add( two ); + gwcd_list.add( three ); + gwcd_list.add( four ); + final Map> map_same = new HashMap>(); + final HashSet a_s = new HashSet(); + a_s.add( "AAA" ); + final HashSet b_s = new HashSet(); + b_s.add( "BBB" ); + final HashSet c_s = new HashSet(); + c_s.add( "CCC" ); + final HashSet d_s = new HashSet(); + d_s.add( "DDD" ); + final HashSet e_s = new HashSet(); + e_s.add( "EEE" ); + final HashSet f_s = new HashSet(); + f_s.add( "FFF" ); + final HashSet g_s = new HashSet(); + g_s.add( "GGG" ); + final HashSet h_s = new HashSet(); + h_s.add( "HHH" ); + final HashSet i_s = new HashSet(); + i_s.add( "III" ); + final HashSet j_s = new HashSet(); + j_s.add( "JJJ" ); + final HashSet l_s = new HashSet(); + l_s.add( "LLL" ); + final HashSet m_s = new HashSet(); + m_s.add( "MMM" ); + final HashSet n_s = new HashSet(); + n_s.add( "NNN" ); + final HashSet o_s = new HashSet(); + o_s.add( "OOO" ); + final HashSet p_s = new HashSet(); + p_s.add( "PPP" ); + final HashSet 
q_s = new HashSet(); + q_s.add( "QQQ" ); + final HashSet r_s = new HashSet(); + r_s.add( "RRR" ); + map_same.put( a.getDomainId(), a_s ); + map_same.put( b.getDomainId(), b_s ); + map_same.put( c.getDomainId(), c_s ); + map_same.put( d.getDomainId(), d_s ); + map_same.put( e.getDomainId(), e_s ); + map_same.put( f.getDomainId(), f_s ); + map_same.put( g.getDomainId(), g_s ); + map_same.put( h.getDomainId(), h_s ); + map_same.put( i.getDomainId(), i_s ); + map_same.put( j.getDomainId(), j_s ); + map_same.put( l.getDomainId(), l_s ); + map_same.put( m.getDomainId(), m_s ); + map_same.put( n.getDomainId(), n_s ); + map_same.put( o.getDomainId(), o_s ); + map_same.put( p.getDomainId(), p_s ); + map_same.put( q.getDomainId(), q_s ); + map_same.put( r.getDomainId(), r_s ); + final CharacterStateMatrix matrix_s = DomainParsimonyCalculator + .createMatrixOfSecondaryFeaturePresenceOrAbsence( gwcd_list, map_same, null ); + // 1 a b c e f g h l m + // 2 a b c e f g i n o + // 3 a b d e f g j p q + // 4 a b d p r + if ( matrix_s.getState( 0, 0 ) != X ) { + return false; + } + if ( matrix_s.getState( 0, 1 ) != X ) { + return false; + } + if ( matrix_s.getState( 0, 2 ) != X ) { + return false; + } + if ( matrix_s.getState( 0, 3 ) != O ) { + return false; + } + if ( matrix_s.getState( 0, 4 ) != X ) { + return false; + } + if ( matrix_s.getState( 0, 5 ) != X ) { + return false; + } + if ( matrix_s.getState( 0, 6 ) != X ) { + return false; + } + if ( matrix_s.getState( 0, 7 ) != X ) { + return false; + } + if ( matrix_s.getState( 0, 8 ) != O ) { + return false; + } + final PhylogenyFactory factory0 = ParserBasedPhylogenyFactory.getInstance(); + final String p0_str = "((one,two)1-2,(three,four)3-4)root"; + final Phylogeny p0 = factory0.create( p0_str, new NHXParser() )[ 0 ]; + final DomainParsimonyCalculator dp0 = DomainParsimonyCalculator.createInstance( p0, gwcd_list, map_same ); + dp0.executeDolloParsimonyOnSecondaryFeatures( null ); + final CharacterStateMatrix gl_matrix_d = dp0.getGainLossMatrix(); + final CharacterStateMatrix is_matrix_d = dp0.getInternalStatesMatrix(); + if ( is_matrix_d.getState( "root", "AAA" ) != X ) { + return false; + } + if ( is_matrix_d.getState( "root", "BBB" ) != X ) { + return false; + } + if ( is_matrix_d.getState( "root", "CCC" ) != O ) { + return false; + } + if ( is_matrix_d.getState( "root", "DDD" ) != O ) { + return false; + } + if ( is_matrix_d.getState( "root", "EEE" ) != X ) { + return false; + } + if ( gl_matrix_d.getState( "3-4", "PPP" ) != G ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testPaupLogParser( final File test_dir ) { + try { + final PaupLogParser parser = new PaupLogParser(); + parser.setSource( new File( test_dir + ForesterUtil.getFileSeparator() + "paup_log_test_1" ) ); + final CharacterStateMatrix matrix = parser.parse(); + if ( matrix.getNumberOfIdentifiers() != 8 ) { + return false; + } + if ( !matrix.getIdentifier( 0 ).equals( "MOUSE" ) ) { + return false; + } + if ( !matrix.getIdentifier( 1 ).equals( "NEMVE" ) ) { + return false; + } + if ( !matrix.getIdentifier( 2 ).equals( "MONBE" ) ) { + return false; + } + if ( !matrix.getIdentifier( 3 ).equals( "DICDI" ) ) { + return false; + } + if ( !matrix.getIdentifier( 4 ).equals( "ARATH" ) ) { + return false; + } + if ( !matrix.getIdentifier( 5 ).equals( "6" ) ) { + return false; + } + if ( !matrix.getIdentifier( 6 ).equals( "7" ) ) { + return false; + } + if ( !matrix.getIdentifier( 7 
).equals( "8" ) ) { + return false; + } + if ( matrix.getNumberOfCharacters() != ( 66 + 66 + 28 ) ) { + return false; + } + if ( matrix.getState( 0, 4 ) != BinaryStates.ABSENT ) { + return false; + } + if ( matrix.getState( 0, 5 ) != BinaryStates.PRESENT ) { + return false; + } + if ( matrix.getState( 1, 5 ) != BinaryStates.PRESENT ) { + return false; + } + if ( matrix.getState( 7, 154 ) != BinaryStates.ABSENT ) { + return false; + } + if ( matrix.getState( 7, 155 ) != BinaryStates.PRESENT ) { + return false; + } + if ( matrix.getState( 7, 156 ) != BinaryStates.PRESENT ) { + return false; + } + if ( matrix.getState( 7, 157 ) != BinaryStates.ABSENT ) { + return false; + } + if ( matrix.getState( 7, 158 ) != BinaryStates.PRESENT ) { + return false; + } + if ( matrix.getState( 7, 159 ) != BinaryStates.ABSENT ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testProteinId() { + try { + final ProteinId id1 = new ProteinId( "a" ); + final ProteinId id2 = new ProteinId( "a" ); + final ProteinId id3 = new ProteinId( "A" ); + final ProteinId id4 = new ProteinId( "b" ); + if ( !id1.equals( id1 ) ) { + return false; + } + if ( id1.getId().equals( "x" ) ) { + return false; + } + if ( id1.getId().equals( null ) ) { + return false; + } + if ( !id1.equals( id2 ) ) { + return false; + } + if ( id1.equals( id3 ) ) { + return false; + } + if ( id1.hashCode() != id1.hashCode() ) { + return false; + } + if ( id1.hashCode() != id2.hashCode() ) { + return false; + } + if ( id1.hashCode() == id3.hashCode() ) { + return false; + } + if ( id1.compareTo( id1 ) != 0 ) { + return false; + } + if ( id1.compareTo( id2 ) != 0 ) { + return false; + } + if ( id1.compareTo( id3 ) != 0 ) { + return false; + } + if ( id1.compareTo( id4 ) >= 0 ) { + return false; + } + if ( id4.compareTo( id1 ) <= 0 ) { + return false; + } + if ( !id4.getId().equals( "b" ) ) { + return false; + } + final ProteinId id5 = new ProteinId( " C " ); + if ( !id5.getId().equals( "C" ) ) { + return false; + } + if ( id5.equals( id1 ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testSpecies() { + try { + final Species s1 = new BasicSpecies( "a" ); + final Species s2 = new BasicSpecies( "a" ); + final Species s3 = new BasicSpecies( "A" ); + final Species s4 = new BasicSpecies( "b" ); + if ( !s1.equals( s1 ) ) { + return false; + } + if ( s1.getSpeciesId().equals( "x" ) ) { + return false; + } + if ( s1.getSpeciesId().equals( null ) ) { + return false; + } + if ( !s1.equals( s2 ) ) { + return false; + } + if ( s1.equals( s3 ) ) { + return false; + } + if ( s1.hashCode() != s1.hashCode() ) { + return false; + } + if ( s1.hashCode() != s2.hashCode() ) { + return false; + } + if ( s1.hashCode() == s3.hashCode() ) { + return false; + } + if ( s1.compareTo( s1 ) != 0 ) { + return false; + } + if ( s1.compareTo( s2 ) != 0 ) { + return false; + } + if ( s1.compareTo( s3 ) != 0 ) { + return false; + } + if ( s1.compareTo( s4 ) >= 0 ) { + return false; + } + if ( s4.compareTo( s1 ) <= 0 ) { + return false; + } + if ( !s4.getSpeciesId().equals( "b" ) ) { + return false; + } + final Species s5 = new BasicSpecies( " C " ); + if ( !s5.getSpeciesId().equals( "C" ) ) { + return false; + } + if ( s5.equals( s1 ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } +} 
diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java new file mode 100644 index 0000000..d27c23c --- /dev/null +++ b/forester/java/src/org/forester/test/Test.java @@ -0,0 +1,7939 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.test; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Set; + +import org.forester.application.support_transfer; +import org.forester.development.DevelopmentTools; +import org.forester.evoinference.TestPhylogenyReconstruction; +import org.forester.evoinference.matrix.character.CharacterStateMatrix; +import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates; +import org.forester.go.TestGo; +import org.forester.io.parsers.FastaParser; +import org.forester.io.parsers.GeneralMsaParser; +import org.forester.io.parsers.HmmscanPerDomainTableParser; +import org.forester.io.parsers.HmmscanPerDomainTableParser.INDIVIDUAL_SCORE_CUTOFF; +import org.forester.io.parsers.nexus.NexusBinaryStatesMatrixParser; +import org.forester.io.parsers.nexus.NexusCharactersParser; +import org.forester.io.parsers.nexus.NexusPhylogeniesParser; +import org.forester.io.parsers.nhx.NHXParser; +import org.forester.io.parsers.phyloxml.PhyloXmlParser; +import org.forester.io.parsers.tol.TolParser; +import org.forester.io.writers.PhylogenyWriter; +import org.forester.msa.Mafft; +import org.forester.msa.Msa; +import org.forester.msa.MsaInferrer; +import org.forester.pccx.TestPccx; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyBranch; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.BinaryCharacters; +import org.forester.phylogeny.data.BranchWidth; +import org.forester.phylogeny.data.Confidence; +import org.forester.phylogeny.data.Distribution; +import org.forester.phylogeny.data.DomainArchitecture; +import org.forester.phylogeny.data.Event; +import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.data.PhylogenyData; +import org.forester.phylogeny.data.Polygon; +import org.forester.phylogeny.data.PropertiesMap; +import org.forester.phylogeny.data.Property; +import 
org.forester.phylogeny.data.ProteinDomain; +import org.forester.phylogeny.data.Taxonomy; +import org.forester.phylogeny.data.Property.AppliesTo; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.sdi.SDI; +import org.forester.sdi.SDIR; +import org.forester.sdi.SDIse; +import org.forester.sdi.TaxonomyAssigner; +import org.forester.sdi.TestGSDI; +import org.forester.sequence.BasicSequence; +import org.forester.sequence.Sequence; +import org.forester.surfacing.Protein; +import org.forester.surfacing.TestSurfacing; +import org.forester.tools.ConfidenceAssessor; +import org.forester.tools.SupportCount; +import org.forester.tools.TreeSplitMatrix; +import org.forester.util.AsciiHistogram; +import org.forester.util.BasicDescriptiveStatistics; +import org.forester.util.BasicTable; +import org.forester.util.BasicTableParser; +import org.forester.util.DescriptiveStatistics; +import org.forester.util.ForesterConstants; +import org.forester.util.ForesterUtil; +import org.forester.util.GeneralTable; +import org.forester.ws.uniprot.UniProtTaxonomy; +import org.forester.ws.uniprot.UniProtWsTools; +import org.forester.ws.wabi.TxSearch; +import org.forester.ws.wabi.TxSearch.RANKS; +import org.forester.ws.wabi.TxSearch.TAX_NAME_CLASS; +import org.forester.ws.wabi.TxSearch.TAX_RANK; + +@SuppressWarnings( "unused") +public final class Test { + + private final static double ZERO_DIFF = 1.0E-9; + private final static String PATH_TO_TEST_DATA = System.getProperty( "user.dir" ) + + ForesterUtil.getFileSeparator() + "test_data" + + ForesterUtil.getFileSeparator(); + private final static String PATH_TO_RESOURCES = System.getProperty( "user.dir" ) + + ForesterUtil.getFileSeparator() + "resources" + + ForesterUtil.getFileSeparator(); + private final static boolean USE_LOCAL_PHYLOXML_SCHEMA = true; + private static final String PHYLOXML_REMOTE_XSD = ForesterConstants.PHYLO_XML_LOCATION + "/" + + ForesterConstants.PHYLO_XML_VERSION + "/" + + ForesterConstants.PHYLO_XML_XSD; + private static final String PHYLOXML_LOCAL_XSD = PATH_TO_RESOURCES + "phyloxml_schema/" + + ForesterConstants.PHYLO_XML_VERSION + "/" + + ForesterConstants.PHYLO_XML_XSD; + + private final static Phylogeny createPhylogeny( final String nhx ) throws IOException { + final Phylogeny p = ParserBasedPhylogenyFactory.getInstance().create( nhx, new NHXParser() )[ 0 ]; + return p; + } + + private final static Event getEvent( final Phylogeny p, final String n1, final String n2 ) { + final PhylogenyMethods pm = PhylogenyMethods.getInstance(); + return pm.obtainLCA( p.getNode( n1 ), p.getNode( n2 ) ).getNodeData().getEvent(); + } + + public static boolean isEqual( final double a, final double b ) { + return ( ( Math.abs( a - b ) ) < Test.ZERO_DIFF ); + } + + public static void main( final String[] args ) { + System.out.println( "[Java version: " + ForesterUtil.JAVA_VERSION + " " + ForesterUtil.JAVA_VENDOR + "]" ); + System.out.println( "[OS: " + ForesterUtil.OS_NAME + " " + ForesterUtil.OS_ARCH + " " + ForesterUtil.OS_VERSION + + "]" ); + Locale.setDefault( Locale.US ); + System.out.println( "[Locale: " + Locale.getDefault() + "]" ); + int failed = 0; + int succeeded = 0; + System.out.print( "[Test if directory with files for testing exists/is readable: " ); + if ( Test.testDir( PATH_TO_TEST_DATA ) ) { + System.out.println( "OK.]" ); + } + else { + System.out.println( "could not find/read from 
directory \"" + PATH_TO_TEST_DATA + "\".]" ); + System.out.println( "Testing aborted." ); + System.exit( -1 ); + } + System.out.print( "[Test if resources directory exists/is readable: " ); + if ( testDir( PATH_TO_RESOURCES ) ) { + System.out.println( "OK.]" ); + } + else { + System.out.println( "could not find/read from directory \"" + Test.PATH_TO_RESOURCES + "\".]" ); + System.out.println( "Testing aborted." ); + System.exit( -1 ); + } + final long start_time = new Date().getTime(); + System.out.print( "Hmmscan output parser: " ); + if ( testHmmscanOutputParser() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Basic node methods: " ); + if ( Test.testBasicNodeMethods() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Basic node construction and parsing of NHX (node level): " ); + if ( Test.testNHXNodeParsing() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "NH parsing: " ); + if ( Test.testNHParsing() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Conversion to NHX (node level): " ); + if ( Test.testNHXconversion() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "NHX parsing: " ); + if ( Test.testNHXParsing() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "NHX parsing with quotes: " ); + if ( Test.testNHXParsingQuotes() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Nexus characters parsing: " ); + if ( Test.testNexusCharactersParsing() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Nexus tree parsing: " ); + if ( Test.testNexusTreeParsing() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Nexus tree parsing (translating): " ); + if ( Test.testNexusTreeParsingTranslating() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Nexus matrix parsing: " ); + if ( Test.testNexusMatrixParsing() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Basic phyloXML parsing: " ); + if ( Test.testBasicPhyloXMLparsing() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Basic phyloXML parsing (validating against schema): " ); + if ( testBasicPhyloXMLparsingValidating() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Roundtrip phyloXML parsing (validating against schema): " ); + if ( Test.testBasicPhyloXMLparsingRoundtrip() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "phyloXML Distribution Element: " ); + if ( Test.testPhyloXMLparsingOfDistributionElement() ) { + System.out.println( "OK." 
); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Tol XML parsing: " ); + if ( Test.testBasicTolXMLparsing() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Copying of node data: " ); + if ( Test.testCopyOfNodeData() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Basic tree methods: " ); + if ( Test.testBasicTreeMethods() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Postorder Iterator: " ); + if ( Test.testPostOrderIterator() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Preorder Iterator: " ); + if ( Test.testPreOrderIterator() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Levelorder Iterator: " ); + if ( Test.testLevelOrderIterator() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Re-id methods: " ); + if ( Test.testReIdMethods() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Methods on last external nodes: " ); + if ( Test.testLastExternalNodeMethods() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Methods on external nodes: " ); + if ( Test.testExternalNodeRelatedMethods() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Deletion of external nodes: " ); + if ( Test.testDeletionOfExternalNodes() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Subtree deletion: " ); + if ( Test.testSubtreeDeletion() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Phylogeny branch: " ); + if ( Test.testPhylogenyBranch() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Rerooting: " ); + if ( Test.testRerooting() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Mipoint rooting: " ); + if ( Test.testMidpointrooting() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Support count: " ); + if ( Test.testSupportCount() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Support transfer: " ); + if ( Test.testSupportTransfer() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Finding of LCA: " ); + if ( Test.testGetLCA() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Calculation of distance between nodes: " ); + if ( Test.testGetDistance() ) { + System.out.println( "OK." 
); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "SDIse: " ); + if ( Test.testSDIse() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Taxonomy assigner: " ); + if ( Test.testTaxonomyAssigner() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "SDIunrooted: " ); + if ( Test.testSDIunrooted() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "GSDI: " ); + if ( TestGSDI.test() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Descriptive statistics: " ); + if ( Test.testDescriptiveStatistics() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Data objects and methods: " ); + if ( Test.testDataObjects() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Properties map: " ); + if ( Test.testPropertiesMap() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Phylogeny reconstruction:" ); + System.out.println(); + if ( TestPhylogenyReconstruction.test( new File( PATH_TO_TEST_DATA ) ) ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Analysis of domain architectures: " ); + System.out.println(); + if ( TestSurfacing.test( new File( PATH_TO_TEST_DATA ) ) ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "GO: " ); + System.out.println(); + if ( TestGo.test( new File( PATH_TO_TEST_DATA ) ) ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Modeling tools: " ); + if ( TestPccx.test() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Split Matrix strict: " ); + if ( Test.testSplitStrict() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Split Matrix: " ); + if ( Test.testSplit() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Confidence Assessor: " ); + if ( Test.testConfidenceAssessor() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Basic table: " ); + if ( Test.testBasicTable() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "General table: " ); + if ( Test.testGeneralTable() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Amino acid sequence: " ); + if ( Test.testAminoAcidSequence() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "General MSA parser: " ); + if ( Test.testGeneralMsaParser() ) { + System.out.println( "OK." 
); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Fasta parser for msa: " ); + if ( Test.testFastaParser() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Creation of balanced phylogeny: " ); + if ( Test.testCreateBalancedPhylogeny() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Uniprot Taxonomy Search: " ); + if ( Test.testUniprotTaxonomySearch() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out + .println( "failed [will not count towards failed tests since it might be due to absence internet connection]" ); + } + if ( Mafft.isInstalled() ) { + System.out.print( "MAFFT (external program): " ); + if ( Test.testMafft() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed [will not count towards failed tests]" ); + } + } + // System.out.print( "WABI TxSearch: " ); + // if ( Test.testWabiTxSearch() ) { + // System.out.println( "OK." ); + // succeeded++; + // } + // else { + // System.out + // .println( "failed [will not count towards failed tests since it might be due to absence internet connection]" ); + // } + System.out.println(); + final Runtime rt = java.lang.Runtime.getRuntime(); + final long free_memory = rt.freeMemory() / 1000000; + final long total_memory = rt.totalMemory() / 1000000; + System.out.println( "Running time : " + ( new Date().getTime() - start_time ) + "ms " + "(free memory: " + + free_memory + "MB, total memory: " + total_memory + "MB)" ); + System.out.println(); + System.out.println( "Successful tests: " + succeeded ); + System.out.println( "Failed tests: " + failed ); + System.out.println(); + if ( failed < 1 ) { + System.out.println( "OK." ); + } + else { + System.out.println( "Not OK." 
); + } + // System.out.println(); + // Development.setTime( true ); + //try { + // final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + // final String clc = System.getProperty( "user.dir" ) + ForesterUtil.getFileSeparator() + // + "examples" + ForesterUtil.getFileSeparator() + "CLC.nhx"; + // final String multi = Test.PATH_TO_EXAMPLE_FILES + + // "multifurcations_ex_1.nhx"; + // final String domains = Test.PATH_TO_EXAMPLE_FILES + "domains1.nhx"; + // final Phylogeny t1 = factory.create( new File( domains ), new + // NHXParser() )[ 0 ]; + // final Phylogeny t2 = factory.create( new File( clc ), new NHXParser() )[ 0 ]; + // } + // catch ( final Exception e ) { + // e.printStackTrace(); + // } + // t1.getRoot().preorderPrint(); + // final PhylogenyFactory factory = ParserBasedPhylogenyFactory + // .getInstance(); + // try { + // + // Helper.readNHtree( new File( PATH_TO_EXAMPLE_FILES + // + "\\AtNBSpos.nhx" ) ); + // factory.create( + // new File( PATH_TO_EXAMPLE_FILES + "\\AtNBSpos.nhx" ), + // new NHXParser() ); + // Helper.readNHtree( new File( PATH_TO_EXAMPLE_FILES + // + "\\AtNBSpos.nhx" ) ); + // factory.create( + // new File( PATH_TO_EXAMPLE_FILES + "\\AtNBSpos.nhx" ), + // new NHXParser() ); + // + // + // Helper.readNHtree( new File( PATH_TO_EXAMPLE_FILES + // + "\\big_tree.nhx" ) ); + // Helper.readNHtree( new File( PATH_TO_EXAMPLE_FILES + // + "\\big_tree.nhx" ) ); + // factory.create( + // new File( PATH_TO_EXAMPLE_FILES + "\\big_tree.nhx" ), + // new NHXParser() ); + // factory.create( + // new File( PATH_TO_EXAMPLE_FILES + "\\big_tree.nhx" ), + // new NHXParser() ); + // + // Helper.readNHtree( new File( PATH_TO_EXAMPLE_FILES + // + "\\big_tree.nhx" ) ); + // Helper.readNHtree( new File( PATH_TO_EXAMPLE_FILES + // + "\\big_tree.nhx" ) ); + // + // factory.create( + // new File( PATH_TO_EXAMPLE_FILES + "\\big_tree.nhx" ), + // new NHXParser() ); + // factory.create( + // new File( PATH_TO_EXAMPLE_FILES + "\\big_tree.nhx" ), + // new NHXParser() ); + // + // Helper.readNHtree( new File( PATH_TO_EXAMPLE_FILES + // + "\\AtNBSpos.nhx" ) ); + // factory.create( + // new File( PATH_TO_EXAMPLE_FILES + "\\AtNBSpos.nhx" ), + // new NHXParser() ); + // + // } + // catch ( IOException e ) { + // // TODO Auto-generated catch block + // e.printStackTrace(); + // } + } + + private static boolean testBasicNodeMethods() { + try { + if ( PhylogenyNode.getNodeCount() != 0 ) { + return false; + } + final PhylogenyNode n1 = new PhylogenyNode(); + final PhylogenyNode n2 = new PhylogenyNode( "", ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + final PhylogenyNode n3 = new PhylogenyNode( "n3", ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + final PhylogenyNode n4 = new PhylogenyNode( "n4:0.01", ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( n1.isHasAssignedEvent() ) { + return false; + } + if ( PhylogenyNode.getNodeCount() != 4 ) { + return false; + } + if ( n3.getIndicator() != 0 ) { + return false; + } + if ( n3.getNumberOfExternalNodes() != 1 ) { + return false; + } + if ( !n3.isExternal() ) { + return false; + } + if ( !n3.isRoot() ) { + return false; + } + if ( !n4.getName().equals( "n4" ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testBasicPhyloXMLparsing() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final PhyloXmlParser xml_parser = new PhyloXmlParser(); + final Phylogeny[] 
phylogenies_0 = factory.create( Test.PATH_TO_TEST_DATA + "phyloxml_test_t1.xml", + xml_parser ); + if ( xml_parser.getErrorCount() > 0 ) { + System.out.println( xml_parser.getErrorMessages().toString() ); + return false; + } + if ( phylogenies_0.length != 4 ) { + return false; + } + final Phylogeny t1 = phylogenies_0[ 0 ]; + final Phylogeny t2 = phylogenies_0[ 1 ]; + final Phylogeny t3 = phylogenies_0[ 2 ]; + final Phylogeny t4 = phylogenies_0[ 3 ]; + if ( t1.getNumberOfExternalNodes() != 1 ) { + return false; + } + if ( !t1.isRooted() ) { + return false; + } + if ( t1.isRerootable() ) { + return false; + } + if ( !t1.getType().equals( "gene_tree" ) ) { + return false; + } + if ( t2.getNumberOfExternalNodes() != 2 ) { + return false; + } + if ( !isEqual( t2.getNode( "node a" ).getDistanceToParent(), 1.0 ) ) { + return false; + } + if ( !isEqual( t2.getNode( "node b" ).getDistanceToParent(), 2.0 ) ) { + return false; + } + if ( t2.getNode( "node a" ).getNodeData().getTaxonomies().size() != 2 ) { + return false; + } + if ( !t2.getNode( "node a" ).getNodeData().getTaxonomy( 0 ).getCommonName().equals( "some parasite" ) ) { + return false; + } + if ( !t2.getNode( "node a" ).getNodeData().getTaxonomy( 1 ).getCommonName().equals( "the host" ) ) { + return false; + } + if ( t2.getNode( "node a" ).getNodeData().getSequences().size() != 2 ) { + return false; + } + if ( !t2.getNode( "node a" ).getNodeData().getSequence( 0 ).getMolecularSequence() + .startsWith( "actgtgggggt" ) ) { + return false; + } + if ( !t2.getNode( "node a" ).getNodeData().getSequence( 1 ).getMolecularSequence() + .startsWith( "ctgtgatgcat" ) ) { + return false; + } + if ( t3.getNumberOfExternalNodes() != 4 ) { + return false; + } + if ( !t1.getName().equals( "t1" ) ) { + return false; + } + if ( !t2.getName().equals( "t2" ) ) { + return false; + } + if ( !t3.getName().equals( "t3" ) ) { + return false; + } + if ( !t4.getName().equals( "t4" ) ) { + return false; + } + if ( !t3.getIdentifier().getValue().equals( "1-1" ) ) { + return false; + } + if ( !t3.getIdentifier().getProvider().equals( "treebank" ) ) { + return false; + } + if ( !t3.getNode( "root node" ).getNodeData().getSequence().getType().equals( "protein" ) ) { + return false; + } + if ( !t3.getNode( "root node" ).getNodeData().getSequence().getName() + .equals( "Apoptosis facilitator Bcl-2-like 14 protein" ) ) { + return false; + } + if ( !t3.getNode( "root node" ).getNodeData().getSequence().getSymbol().equals( "BCL2L14" ) ) { + return false; + } + if ( !t3.getNode( "root node" ).getNodeData().getSequence().getAccession().getValue().equals( "Q9BZR8" ) ) { + return false; + } + if ( !t3.getNode( "root node" ).getNodeData().getSequence().getAccession().getSource().equals( "UniProtKB" ) ) { + return false; + } + if ( !( t3.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getDesc() + .equals( "apoptosis" ) ) { + return false; + } + if ( !( t3.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getRef() + .equals( "GO:0006915" ) ) { + return false; + } + if ( !( t3.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getSource() + .equals( "UniProtKB" ) ) { + return false; + } + if ( !( t3.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getEvidence() + .equals( "experimental" ) ) { + return false; + } + if ( !( t3.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getType() + .equals( "function" ) ) { + return false; + } + if ( ( t3.getNode( "root node" 
).getNodeData().getSequence().getAnnotation( 1 ) ).getConfidence() + .getValue() != 1 ) { + return false; + } + if ( !( t3.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getConfidence() + .getType().equals( "ml" ) ) { + return false; + } + if ( !( t3.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getDesc() + .equals( "apoptosis" ) ) { + return false; + } + if ( ( t3.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getProperties() + .getProperty( "AFFY:expression" ).getAppliesTo() != AppliesTo.ANNOTATION ) { + return false; + } + if ( !( t3.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getProperties() + .getProperty( "AFFY:expression" ).getDataType().equals( "xsd:double" ) ) { + return false; + } + if ( !( t3.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getProperties() + .getProperty( "AFFY:expression" ).getRef().equals( "AFFY:expression" ) ) { + return false; + } + if ( !( t3.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getProperties() + .getProperty( "AFFY:expression" ).getUnit().equals( "AFFY:x" ) ) { + return false; + } + if ( !( t3.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getProperties() + .getProperty( "AFFY:expression" ).getValue().equals( "0.2" ) ) { + return false; + } + if ( !( t3.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getProperties() + .getProperty( "MED:disease" ).getValue().equals( "lymphoma" ) ) { + return false; + } + if ( !( t3.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 0 ) ).getRef() + .equals( "GO:0005829" ) ) { + return false; + } + if ( !( t3.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 2 ) ).getDesc() + .equals( "intracellular organelle" ) ) { + return false; + } + if ( !( t3.getNode( "root node" ).getNodeData().getSequence().getUri( 0 ).getType().equals( "source" ) ) ) { + return false; + } + if ( !( t3.getNode( "root node" ).getNodeData().getSequence().getUri( 0 ).getDescription() + .equals( "UniProt link" ) ) ) { + return false; + } + if ( !( t3.getNode( "root node" ).getNodeData().getSequence().getLocation().equals( "12p13-p12" ) ) ) { + return false; + } + //if ( !( t3.getNode( "root node" ).getNodeData().getDistribution().getDesc().equals( "irgendwo" ) ) ) { + // return false; + //} + // if ( !( t3.getNode( "root node" ).getNodeData().getReference().getDoi().equals( "10.1074/jbc.M005889200" ) ) ) { + // return false; + // } + // if ( !t3.getNode( "root node" ).getNodeData().getTaxonomy().getType().equals( "host" ) ) { + // return false; + // } + // if ( !t3.getNode( "root node" ).getNodeData().getTaxonomy().getTaxonomyCode().equals( "ECDYS" ) ) { + // return false; + // } + // if ( !t3.getNode( "root node" ).getNodeData().getTaxonomy().getScientificName().equals( "ecdysozoa" ) ) { + // return false; + // } + // if ( !t3.getNode( "root node" ).getNodeData().getTaxonomy().getCommonName().equals( "molting animals" ) ) { + // return false; + // } + // if ( !t3.getNode( "root node" ).getNodeData().getTaxonomy().getIdentifier().getValue().equals( "1" ) ) { + // return false; + // } + // if ( !t3.getNode( "root node" ).getNodeData().getTaxonomy().getIdentifier().getType().equals( "ncbi" ) ) { + // return false; + // } + // if ( t3.getNode( "node bc" ).getNodeData().getSequence().getDomainArchitecture().getTotalLength() != 124 ) { + // return false; + // } + // if ( !t3.getNode( "node bc" 
).getNodeData().getSequence().getDomainArchitecture().getDomain( 0 ).getName() + // .equals( "B" ) ) { + // return false; + // } + // if ( t3.getNode( "node bc" ).getNodeData().getSequence().getDomainArchitecture().getDomain( 0 ).getFrom() != 21 ) { + // return false; + // } + // if ( t3.getNode( "node bc" ).getNodeData().getSequence().getDomainArchitecture().getDomain( 0 ).getTo() != 44 ) { + // return false; + // } + // if ( t3.getNode( "node bc" ).getNodeData().getSequence().getDomainArchitecture().getDomain( 0 ).getLength() != 24 ) { + // return false; + // } + // if ( t3.getNode( "node bc" ).getNodeData().getSequence().getDomainArchitecture().getDomain( 0 ) + // .getConfidence() != 2144 ) { + // return false; + // } + // if ( !t3.getNode( "node bc" ).getNodeData().getSequence().getDomainArchitecture().getDomain( 0 ).getId() + // .equals( "pfam" ) ) { + // return false; + // } + // if ( t3.getNode( "node bb" ).getNodeData().getBinaryCharacters().getGainedCharacters().size() != 3 ) { + // return false; + // } + // if ( t3.getNode( "node bb" ).getNodeData().getBinaryCharacters().getPresentCharacters().size() != 2 ) { + // return false; + // } + // if ( t3.getNode( "node bb" ).getNodeData().getBinaryCharacters().getLostCharacters().size() != 1 ) { + // return false; + // } + // if ( !t3.getNode( "node bb" ).getNodeData().getBinaryCharacters().getType().equals( "domains" ) ) { + // return false; + // } + // if ( ( ( BinaryCharacters ) t3.getNode( "node bb" ).getNodeData().getBinaryCharacters().copy() ) + // .getLostCount() != BinaryCharacters.COUNT_DEFAULT ) { + // ; + // return false; + // } + // if ( t3.getNode( "node b" ).getNodeData().getBinaryCharacters().getGainedCount() != 1 ) { + // return false; + // } + // if ( t3.getNode( "node b" ).getNodeData().getBinaryCharacters().getGainedCharacters().size() != 1 ) { + // return false; + // } + // if ( t3.getNode( "node b" ).getNodeData().getBinaryCharacters().getLostCount() != 3 ) { + // return false; + // } + // if ( t3.getNode( "node b" ).getNodeData().getBinaryCharacters().getLostCharacters().size() != 3 ) { + // return false; + // } + // if ( t3.getNode( "node b" ).getNodeData().getBinaryCharacters().getPresentCount() != 2 ) { + // return false; + // } + // if ( t3.getNode( "node b" ).getNodeData().getBinaryCharacters().getPresentCharacters().size() != 2 ) { + // return false; + // } + // if ( !t3.getNode( "node b" ).getNodeData().getBinaryCharacters().getType().equals( "characters" ) ) { + // return false; + // } + // final Phylogeny[] phylogenies_1 = factory.create( Test.PATH_TO_TEST_DATA + "phyloxml_test_t4.xml", + // xml_parser ); + // if ( xml_parser.getErrorCount() > 0 ) { + // System.out.println( xml_parser.getErrorMessages().toString() ); + // return false; + // } + // if ( phylogenies_1.length != 2 ) { + // return false; + // } + // final Phylogeny a = phylogenies_1[ 0 ]; + // if ( !a.getName().equals( "tree 4" ) ) { + // return false; + // } + // if ( a.getNumberOfExternalNodes() != 3 ) { + // return false; + // } + // if ( !a.getNode( "node b1" ).getNodeData().getSequence().getName().equals( "b1 gene" ) ) { + // return false; + // } + // if ( !a.getNode( "node b1" ).getNodeData().getTaxonomy().getCommonName().equals( "b1 species" ) ) { + // return false; + // } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testBasicPhyloXMLparsingRoundtrip() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final 
PhyloXmlParser xml_parser = new PhyloXmlParser(); + if ( USE_LOCAL_PHYLOXML_SCHEMA ) { + xml_parser.setValidateAgainstSchema( PHYLOXML_LOCAL_XSD ); + } + else { + xml_parser.setValidateAgainstSchema( PHYLOXML_REMOTE_XSD ); + } + final Phylogeny[] phylogenies_0 = factory.create( Test.PATH_TO_TEST_DATA + "phyloxml_test_t1.xml", + xml_parser ); + if ( xml_parser.getErrorCount() > 0 ) { + System.out.println( xml_parser.getErrorMessages().toString() ); + return false; + } + if ( phylogenies_0.length != 4 ) { + return false; + } + final StringBuffer t1_sb = new StringBuffer( phylogenies_0[ 0 ].toPhyloXML( 0 ) ); + final Phylogeny[] phylogenies_t1 = factory.create( t1_sb, xml_parser ); + if ( phylogenies_t1.length != 1 ) { + return false; + } + final Phylogeny t1_rt = phylogenies_t1[ 0 ]; + if ( !t1_rt.getDistanceUnit().equals( "cc" ) ) { + return false; + } + if ( !t1_rt.isRooted() ) { + return false; + } + if ( t1_rt.isRerootable() ) { + return false; + } + if ( !t1_rt.getType().equals( "gene_tree" ) ) { + return false; + } + final StringBuffer t2_sb = new StringBuffer( phylogenies_0[ 1 ].toPhyloXML( 0 ) ); + final Phylogeny[] phylogenies_t2 = factory.create( t2_sb, xml_parser ); + final Phylogeny t2_rt = phylogenies_t2[ 0 ]; + if ( t2_rt.getNode( "node a" ).getNodeData().getTaxonomies().size() != 2 ) { + return false; + } + if ( !t2_rt.getNode( "node a" ).getNodeData().getTaxonomy( 0 ).getCommonName().equals( "some parasite" ) ) { + return false; + } + if ( !t2_rt.getNode( "node a" ).getNodeData().getTaxonomy( 1 ).getCommonName().equals( "the host" ) ) { + return false; + } + if ( t2_rt.getNode( "node a" ).getNodeData().getSequences().size() != 2 ) { + return false; + } + if ( !t2_rt.getNode( "node a" ).getNodeData().getSequence( 0 ).getMolecularSequence() + .startsWith( "actgtgggggt" ) ) { + return false; + } + if ( !t2_rt.getNode( "node a" ).getNodeData().getSequence( 1 ).getMolecularSequence() + .startsWith( "ctgtgatgcat" ) ) { + return false; + } + final StringBuffer t3_sb_0 = new StringBuffer( phylogenies_0[ 2 ].toPhyloXML( 0 ) ); + final Phylogeny[] phylogenies_1_0 = factory.create( t3_sb_0, xml_parser ); + final StringBuffer t3_sb = new StringBuffer( phylogenies_1_0[ 0 ].toPhyloXML( 0 ) ); + final Phylogeny[] phylogenies_1 = factory.create( t3_sb, xml_parser ); + if ( phylogenies_1.length != 1 ) { + return false; + } + final Phylogeny t3_rt = phylogenies_1[ 0 ]; + if ( !t3_rt.getName().equals( "t3" ) ) { + return false; + } + if ( t3_rt.getNumberOfExternalNodes() != 4 ) { + return false; + } + if ( !t3_rt.getIdentifier().getValue().equals( "1-1" ) ) { + return false; + } + if ( !t3_rt.getIdentifier().getProvider().equals( "treebank" ) ) { + return false; + } + if ( !t3_rt.getNode( "root node" ).getNodeData().getSequence().getType().equals( "protein" ) ) { + return false; + } + if ( !t3_rt.getNode( "root node" ).getNodeData().getSequence().getName() + .equals( "Apoptosis facilitator Bcl-2-like 14 protein" ) ) { + return false; + } + if ( !t3_rt.getNode( "root node" ).getNodeData().getSequence().getSymbol().equals( "BCL2L14" ) ) { + return false; + } + if ( !t3_rt.getNode( "root node" ).getNodeData().getSequence().getAccession().getValue().equals( "Q9BZR8" ) ) { + return false; + } + if ( !t3_rt.getNode( "root node" ).getNodeData().getSequence().getAccession().getSource() + .equals( "UniProtKB" ) ) { + return false; + } + if ( !( t3_rt.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getDesc() + .equals( "apoptosis" ) ) { + return false; + } + if ( !( t3_rt.getNode( 
"root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getRef() + .equals( "GO:0006915" ) ) { + return false; + } + if ( !( t3_rt.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getSource() + .equals( "UniProtKB" ) ) { + return false; + } + if ( !( t3_rt.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getEvidence() + .equals( "experimental" ) ) { + return false; + } + if ( !( t3_rt.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getType() + .equals( "function" ) ) { + return false; + } + if ( ( t3_rt.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getConfidence() + .getValue() != 1 ) { + return false; + } + if ( !( t3_rt.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getConfidence() + .getType().equals( "ml" ) ) { + return false; + } + if ( !( t3_rt.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getDesc() + .equals( "apoptosis" ) ) { + return false; + } + if ( ( t3_rt.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getProperties() + .getProperty( "AFFY:expression" ).getAppliesTo() != AppliesTo.ANNOTATION ) { + return false; + } + if ( !( t3_rt.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getProperties() + .getProperty( "AFFY:expression" ).getDataType().equals( "xsd:double" ) ) { + return false; + } + if ( !( t3_rt.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getProperties() + .getProperty( "AFFY:expression" ).getRef().equals( "AFFY:expression" ) ) { + return false; + } + if ( !( t3_rt.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getProperties() + .getProperty( "AFFY:expression" ).getUnit().equals( "AFFY:x" ) ) { + return false; + } + if ( !( t3_rt.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getProperties() + .getProperty( "AFFY:expression" ).getValue().equals( "0.2" ) ) { + return false; + } + if ( !( t3_rt.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 1 ) ).getProperties() + .getProperty( "MED:disease" ).getValue().equals( "lymphoma" ) ) { + return false; + } + if ( !( t3_rt.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 0 ) ).getRef() + .equals( "GO:0005829" ) ) { + return false; + } + if ( !( t3_rt.getNode( "root node" ).getNodeData().getSequence().getAnnotation( 2 ) ).getDesc() + .equals( "intracellular organelle" ) ) { + return false; + } + if ( !( t3_rt.getNode( "root node" ).getNodeData().getSequence().getUri( 0 ).getType().equals( "source" ) ) ) { + return false; + } + if ( !( t3_rt.getNode( "root node" ).getNodeData().getSequence().getUri( 0 ).getDescription() + .equals( "UniProt link" ) ) ) { + return false; + } + if ( !( t3_rt.getNode( "root node" ).getNodeData().getSequence().getLocation().equals( "12p13-p12" ) ) ) { + return false; + } + if ( !( t3_rt.getNode( "root node" ).getNodeData().getReference().getDoi().equals( "10.1038/387489a0" ) ) ) { + return false; + } + if ( !( t3_rt.getNode( "root node" ).getNodeData().getReference().getDescription() + .equals( "Aguinaldo, A. M. A.; J. M. Turbeville, L. S. Linford, M. C. Rivera, J. R. Garey, R. A. Raff, & J. A. Lake (1997). \"Evidence for a clade of nematodes, arthropods and other moulting animals\". Nature 387 (6632): 489–493." 
) ) ) { + return false; + } + if ( !t3_rt.getNode( "root node" ).getNodeData().getTaxonomy().getTaxonomyCode().equals( "ECDYS" ) ) { + return false; + } + if ( !t3_rt.getNode( "root node" ).getNodeData().getTaxonomy().getScientificName().equals( "ecdysozoa" ) ) { + return false; + } + if ( !t3_rt.getNode( "root node" ).getNodeData().getTaxonomy().getCommonName().equals( "molting animals" ) ) { + return false; + } + if ( !t3_rt.getNode( "root node" ).getNodeData().getTaxonomy().getIdentifier().getValue().equals( "1" ) ) { + return false; + } + if ( !t3_rt.getNode( "root node" ).getNodeData().getTaxonomy().getIdentifier().getProvider() + .equals( "ncbi" ) ) { + return false; + } + if ( t3_rt.getNode( "node bc" ).getNodeData().getSequence().getDomainArchitecture().getTotalLength() != 124 ) { + return false; + } + if ( !t3_rt.getNode( "node bc" ).getNodeData().getSequence().getDomainArchitecture().getDomain( 0 ) + .getName().equals( "B" ) ) { + return false; + } + if ( t3_rt.getNode( "node bc" ).getNodeData().getSequence().getDomainArchitecture().getDomain( 0 ) + .getFrom() != 21 ) { + return false; + } + if ( t3_rt.getNode( "node bc" ).getNodeData().getSequence().getDomainArchitecture().getDomain( 0 ).getTo() != 44 ) { + return false; + } + if ( t3_rt.getNode( "node bc" ).getNodeData().getSequence().getDomainArchitecture().getDomain( 0 ) + .getLength() != 24 ) { + return false; + } + if ( t3_rt.getNode( "node bc" ).getNodeData().getSequence().getDomainArchitecture().getDomain( 0 ) + .getConfidence() != 2144 ) { + return false; + } + if ( !t3_rt.getNode( "node bc" ).getNodeData().getSequence().getDomainArchitecture().getDomain( 0 ).getId() + .equals( "pfam" ) ) { + return false; + } + if ( t3_rt.getNode( "node bb" ).getNodeData().getBinaryCharacters().getGainedCharacters().size() != 3 ) { + return false; + } + if ( t3_rt.getNode( "node bb" ).getNodeData().getBinaryCharacters().getPresentCharacters().size() != 2 ) { + return false; + } + if ( t3_rt.getNode( "node bb" ).getNodeData().getBinaryCharacters().getLostCharacters().size() != 1 ) { + return false; + } + if ( !t3_rt.getNode( "node bb" ).getNodeData().getBinaryCharacters().getType().equals( "domains" ) ) { + return false; + } + final Taxonomy taxbb = t3_rt.getNode( "node bb" ).getNodeData().getTaxonomy(); + if ( !taxbb.getAuthority().equals( "Stephenson, 1935" ) ) { + return false; + } + if ( !taxbb.getCommonName().equals( "starlet sea anemone" ) ) { + return false; + } + if ( !taxbb.getIdentifier().getProvider().equals( "EOL" ) ) { + return false; + } + if ( !taxbb.getIdentifier().getValue().equals( "704294" ) ) { + return false; + } + if ( !taxbb.getTaxonomyCode().equals( "NEMVE" ) ) { + return false; + } + if ( !taxbb.getScientificName().equals( "Nematostella vectensis" ) ) { + return false; + } + if ( taxbb.getSynonyms().size() != 2 ) { + return false; + } + if ( !taxbb.getSynonyms().contains( "Nematostella vectensis Stephenson1935" ) ) { + return false; + } + if ( !taxbb.getSynonyms().contains( "See Anemone" ) ) { + return false; + } + if ( !taxbb.getUri( 0 ).getDescription().equals( "EOL" ) ) { + return false; + } + if ( !taxbb.getUri( 0 ).getType().equals( "linkout" ) ) { + return false; + } + if ( !taxbb.getUri( 0 ).getValue().toString().equals( "http://www.eol.org/pages/704294" ) ) { + return false; + } + if ( ( ( BinaryCharacters ) t3_rt.getNode( "node bb" ).getNodeData().getBinaryCharacters().copy() ) + .getLostCount() != BinaryCharacters.COUNT_DEFAULT ) { + ; + return false; + } + if ( t3_rt.getNode( "node b" 
).getNodeData().getBinaryCharacters().getGainedCount() != 1 ) { + return false; + } + if ( t3_rt.getNode( "node b" ).getNodeData().getBinaryCharacters().getGainedCharacters().size() != 1 ) { + return false; + } + if ( t3_rt.getNode( "node b" ).getNodeData().getBinaryCharacters().getLostCount() != 3 ) { + return false; + } + if ( t3_rt.getNode( "node b" ).getNodeData().getBinaryCharacters().getLostCharacters().size() != 3 ) { + return false; + } + if ( t3_rt.getNode( "node b" ).getNodeData().getBinaryCharacters().getPresentCount() != 2 ) { + return false; + } + if ( t3_rt.getNode( "node b" ).getNodeData().getBinaryCharacters().getPresentCharacters().size() != 2 ) { + return false; + } + if ( !t3_rt.getNode( "node b" ).getNodeData().getBinaryCharacters().getType().equals( "characters" ) ) { + return false; + } + // + if ( !t3_rt.getNode( "node ba" ).getNodeData().getDate().getDesc().equals( "Silurian" ) ) { + return false; + } + if ( !t3_rt.getNode( "node ba" ).getNodeData().getDate().getValue().toPlainString() + .equalsIgnoreCase( "435" ) ) { + return false; + } + if ( !t3_rt.getNode( "node ba" ).getNodeData().getDate().getMin().toPlainString().equalsIgnoreCase( "416" ) ) { + return false; + } + if ( !t3_rt.getNode( "node ba" ).getNodeData().getDate().getMax().toPlainString() + .equalsIgnoreCase( "443.7" ) ) { + return false; + } + if ( !t3_rt.getNode( "node ba" ).getNodeData().getDate().getUnit().equals( "mya" ) ) { + return false; + } + if ( !t3_rt.getNode( "node bb" ).getNodeData().getDate().getDesc().equals( "Triassic" ) ) { + return false; + } + if ( !t3_rt.getNode( "node bc" ).getNodeData().getDate().getValue().toPlainString() + .equalsIgnoreCase( "433" ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testBasicPhyloXMLparsingValidating() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + PhyloXmlParser xml_parser = null; + try { + xml_parser = PhyloXmlParser.createPhyloXmlParserXsdValidating(); + } + catch ( final Exception e ) { + // Do nothing -- means we're not running from a jar.
+ } + if ( xml_parser == null ) { + xml_parser = new PhyloXmlParser(); + if ( USE_LOCAL_PHYLOXML_SCHEMA ) { + xml_parser.setValidateAgainstSchema( PHYLOXML_LOCAL_XSD ); + } + else { + xml_parser.setValidateAgainstSchema( PHYLOXML_REMOTE_XSD ); + } + } + final Phylogeny[] phylogenies_0 = factory.create( Test.PATH_TO_TEST_DATA + "phyloxml_test_t1.xml", + xml_parser ); + if ( xml_parser.getErrorCount() > 0 ) { + System.out.println( xml_parser.getErrorMessages().toString() ); + return false; + } + if ( phylogenies_0.length != 4 ) { + return false; + } + final Phylogeny t1 = phylogenies_0[ 0 ]; + final Phylogeny t2 = phylogenies_0[ 1 ]; + final Phylogeny t3 = phylogenies_0[ 2 ]; + final Phylogeny t4 = phylogenies_0[ 3 ]; + if ( !t1.getName().equals( "t1" ) ) { + return false; + } + if ( !t2.getName().equals( "t2" ) ) { + return false; + } + if ( !t3.getName().equals( "t3" ) ) { + return false; + } + if ( !t4.getName().equals( "t4" ) ) { + return false; + } + if ( t1.getNumberOfExternalNodes() != 1 ) { + return false; + } + if ( t2.getNumberOfExternalNodes() != 2 ) { + return false; + } + if ( t3.getNumberOfExternalNodes() != 4 ) { + return false; + } + final String x2 = Test.PATH_TO_TEST_DATA + "phyloxml_test_t1.xml"; + final Phylogeny[] phylogenies_1 = factory.create( x2, xml_parser ); + if ( xml_parser.getErrorCount() > 0 ) { + System.out.println( "errors:" ); + System.out.println( xml_parser.getErrorMessages().toString() ); + return false; + } + if ( phylogenies_1.length != 4 ) { + return false; + } + final Phylogeny[] phylogenies_2 = factory.create( Test.PATH_TO_TEST_DATA + "phyloxml_test_t3.xml", + xml_parser ); + if ( xml_parser.getErrorCount() > 0 ) { + System.out.println( "errors:" ); + System.out.println( xml_parser.getErrorMessages().toString() ); + return false; + } + if ( phylogenies_2.length != 1 ) { + return false; + } + if ( phylogenies_2[ 0 ].getNumberOfExternalNodes() != 2 ) { + return false; + } + final Phylogeny[] phylogenies_3 = factory.create( Test.PATH_TO_TEST_DATA + "phyloxml_test_t4.xml", + xml_parser ); + if ( xml_parser.getErrorCount() > 0 ) { + System.out.println( xml_parser.getErrorMessages().toString() ); + return false; + } + if ( phylogenies_3.length != 2 ) { + return false; + } + final Phylogeny a = phylogenies_3[ 0 ]; + if ( !a.getName().equals( "tree 4" ) ) { + return false; + } + if ( a.getNumberOfExternalNodes() != 3 ) { + return false; + } + if ( !a.getNode( "node b1" ).getNodeData().getSequence().getName().equals( "b1 gene" ) ) { + return false; + } + if ( !a.getNode( "node b1" ).getNodeData().getTaxonomy().getCommonName().equals( "b1 species" ) ) { + return false; + } + final Phylogeny[] phylogenies_4 = factory.create( Test.PATH_TO_TEST_DATA + "special_characters.xml", + xml_parser ); + if ( xml_parser.getErrorCount() > 0 ) { + System.out.println( xml_parser.getErrorMessages().toString() ); + return false; + } + if ( phylogenies_4.length != 1 ) { + return false; + } + final Phylogeny s = phylogenies_4[ 0 ]; + if ( s.getNumberOfExternalNodes() != 6 ) { + return false; + } + s.getNode( "first" ); + s.getNode( "<>" ); + s.getNode( "\"\"" ); + s.getNode( "'''\"" ); + s.getNode( "\"\"\"" ); + s.getNode( "dick & doof" ); + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testBasicTable() { + try { + final BasicTable t0 = new BasicTable(); + if ( t0.getNumberOfColumns() != 0 ) { + return false; + } + if ( t0.getNumberOfRows() != 0 ) { + return false; + } + t0.setValue( 3, 2, 
"23" ); + t0.setValue( 10, 1, "error" ); + t0.setValue( 10, 1, "110" ); + t0.setValue( 9, 1, "19" ); + t0.setValue( 1, 10, "101" ); + t0.setValue( 10, 10, "1010" ); + t0.setValue( 100, 10, "10100" ); + t0.setValue( 0, 0, "00" ); + if ( !t0.getValue( 3, 2 ).equals( "23" ) ) { + return false; + } + if ( !t0.getValue( 10, 1 ).equals( "110" ) ) { + return false; + } + if ( !t0.getValueAsString( 1, 10 ).equals( "101" ) ) { + return false; + } + if ( !t0.getValueAsString( 10, 10 ).equals( "1010" ) ) { + return false; + } + if ( !t0.getValueAsString( 100, 10 ).equals( "10100" ) ) { + return false; + } + if ( !t0.getValueAsString( 9, 1 ).equals( "19" ) ) { + return false; + } + if ( !t0.getValueAsString( 0, 0 ).equals( "00" ) ) { + return false; + } + if ( t0.getNumberOfColumns() != 101 ) { + return false; + } + if ( t0.getNumberOfRows() != 11 ) { + return false; + } + if ( t0.getValueAsString( 49, 4 ) != null ) { + return false; + } + final String l = ForesterUtil.getLineSeparator(); + final StringBuffer source = new StringBuffer(); + source.append( "" + l ); + source.append( "# 1 1 1 1 1 1 1 1" + l ); + source.append( " 00 01 02 03" + l ); + source.append( " 10 11 12 13 " + l ); + source.append( "20 21 22 23 " + l ); + source.append( " 30 31 32 33" + l ); + source.append( "40 41 42 43" + l ); + source.append( " # 1 1 1 1 1 " + l ); + source.append( "50 51 52 53 54" + l ); + final BasicTable t1 = BasicTableParser.parse( source.toString(), " " ); + if ( t1.getNumberOfColumns() != 5 ) { + return false; + } + if ( t1.getNumberOfRows() != 6 ) { + return false; + } + if ( !t1.getValueAsString( 0, 0 ).equals( "00" ) ) { + return false; + } + if ( !t1.getValueAsString( 1, 0 ).equals( "01" ) ) { + return false; + } + if ( !t1.getValueAsString( 3, 0 ).equals( "03" ) ) { + return false; + } + if ( !t1.getValueAsString( 4, 5 ).equals( "54" ) ) { + return false; + } + final StringBuffer source1 = new StringBuffer(); + source1.append( "" + l ); + source1.append( "# 1; 1; 1; 1 ;1 ;1; 1 ;1;" + l ); + source1.append( " 00; 01 ;02;03" + l ); + source1.append( " 10; 11; 12; 13 " + l ); + source1.append( "20; 21; 22; 23 " + l ); + source1.append( " 30; 31; 32; 33" + l ); + source1.append( "40;41;42;43" + l ); + source1.append( " # 1 1 1 1 1 " + l ); + source1.append( ";;;50 ; ;52; 53;;54 " + l ); + final BasicTable t2 = BasicTableParser.parse( source1.toString(), ";" ); + if ( t2.getNumberOfColumns() != 5 ) { + return false; + } + if ( t2.getNumberOfRows() != 6 ) { + return false; + } + if ( !t2.getValueAsString( 0, 0 ).equals( "00" ) ) { + return false; + } + if ( !t2.getValueAsString( 1, 0 ).equals( "01" ) ) { + return false; + } + if ( !t2.getValueAsString( 3, 0 ).equals( "03" ) ) { + return false; + } + if ( !t2.getValueAsString( 3, 3 ).equals( "33" ) ) { + return false; + } + if ( !t2.getValueAsString( 3, 5 ).equals( "53" ) ) { + return false; + } + if ( !t2.getValueAsString( 1, 5 ).equals( "" ) ) { + return false; + } + final StringBuffer source2 = new StringBuffer(); + source2.append( "" + l ); + source2.append( "comment: 1; 1; 1; 1 ;1 ;1; 1 ;1;" + l ); + source2.append( " 00; 01 ;02;03" + l ); + source2.append( " 10; 11; 12; 13 " + l ); + source2.append( "20; 21; 22; 23 " + l ); + source2.append( " " + l ); + source2.append( " 30; 31; 32; 33" + l ); + source2.append( "40;41;42;43" + l ); + source2.append( " comment: 1 1 1 1 1 " + l ); + source2.append( ";;;50 ; 52; 53;;54 " + l ); + final List> tl = BasicTableParser.parse( source2.toString(), + ";", + false, + "comment:", + false ); + if ( tl.size() != 2 ) 
{ + return false; + } + final BasicTable t3 = tl.get( 0 ); + final BasicTable t4 = tl.get( 1 ); + if ( t3.getNumberOfColumns() != 4 ) { + return false; + } + if ( t3.getNumberOfRows() != 3 ) { + return false; + } + if ( t4.getNumberOfColumns() != 4 ) { + return false; + } + if ( t4.getNumberOfRows() != 3 ) { + return false; + } + if ( !t3.getValueAsString( 0, 0 ).equals( "00" ) ) { + return false; + } + if ( !t4.getValueAsString( 0, 0 ).equals( "30" ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testBasicTolXMLparsing() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final TolParser parser = new TolParser(); + final Phylogeny[] phylogenies_0 = factory.create( Test.PATH_TO_TEST_DATA + "tol_2484.tol", parser ); + if ( parser.getErrorCount() > 0 ) { + System.out.println( parser.getErrorMessages().toString() ); + return false; + } + if ( phylogenies_0.length != 1 ) { + return false; + } + final Phylogeny t1 = phylogenies_0[ 0 ]; + if ( t1.getNumberOfExternalNodes() != 5 ) { + return false; + } + if ( !t1.isRooted() ) { + return false; + } + if ( !t1.getRoot().getNodeData().getTaxonomy().getScientificName().equals( "Mesozoa" ) ) { + return false; + } + if ( !t1.getRoot().getNodeData().getTaxonomy().getIdentifier().getValue().equals( "2484" ) ) { + return false; + } + if ( !t1.getRoot().getChildNode( 0 ).getNodeData().getTaxonomy().getScientificName().equals( "Rhombozoa" ) ) { + return false; + } + if ( t1.getRoot().getChildNode( 0 ).getNumberOfDescendants() != 3 ) { + return false; + } + final Phylogeny[] phylogenies_1 = factory.create( Test.PATH_TO_TEST_DATA + "tol_2.tol", parser ); + if ( parser.getErrorCount() > 0 ) { + System.out.println( parser.getErrorMessages().toString() ); + return false; + } + if ( phylogenies_1.length != 1 ) { + return false; + } + final Phylogeny t2 = phylogenies_1[ 0 ]; + if ( t2.getNumberOfExternalNodes() != 664 ) { + return false; + } + if ( !t2.isRooted() ) { + return false; + } + if ( !t2.getRoot().getNodeData().getTaxonomy().getScientificName().equals( "Eubacteria" ) ) { + return false; + } + if ( !t2.getRoot().getNodeData().getTaxonomy().getIdentifier().getValue().equals( "2" ) ) { + return false; + } + if ( t2.getRoot().getNumberOfDescendants() != 24 ) { + return false; + } + if ( t2.getRoot().getNumberOfDescendants() != 24 ) { + return false; + } + if ( !t2.getRoot().getChildNode( 0 ).getNodeData().getTaxonomy().getScientificName().equals( "Aquificae" ) ) { + return false; + } + if ( !t2.getRoot().getChildNode( 0 ).getChildNode( 0 ).getNodeData().getTaxonomy().getScientificName() + .equals( "Aquifex" ) ) { + return false; + } + final Phylogeny[] phylogenies_2 = factory.create( Test.PATH_TO_TEST_DATA + "tol_5.tol", parser ); + if ( parser.getErrorCount() > 0 ) { + System.out.println( parser.getErrorMessages().toString() ); + return false; + } + if ( phylogenies_2.length != 1 ) { + return false; + } + final Phylogeny t3 = phylogenies_2[ 0 ]; + if ( t3.getNumberOfExternalNodes() != 184 ) { + return false; + } + if ( !t3.getRoot().getNodeData().getTaxonomy().getScientificName().equals( "Viruses" ) ) { + return false; + } + if ( !t3.getRoot().getNodeData().getTaxonomy().getIdentifier().getValue().equals( "5" ) ) { + return false; + } + if ( t3.getRoot().getNumberOfDescendants() != 6 ) { + return false; + } + final Phylogeny[] phylogenies_3 = factory.create( Test.PATH_TO_TEST_DATA + "tol_4567.tol", parser ); 
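+ // tol_4567.tol: single-leaf tree (Marpissa decorata, identifier 4567); expect exactly one external node and a root without descendants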
+ if ( parser.getErrorCount() > 0 ) { + System.out.println( parser.getErrorMessages().toString() ); + return false; + } + if ( phylogenies_3.length != 1 ) { + return false; + } + final Phylogeny t4 = phylogenies_3[ 0 ]; + if ( t4.getNumberOfExternalNodes() != 1 ) { + return false; + } + if ( !t4.getRoot().getNodeData().getTaxonomy().getScientificName().equals( "Marpissa decorata" ) ) { + return false; + } + if ( !t4.getRoot().getNodeData().getTaxonomy().getIdentifier().getValue().equals( "4567" ) ) { + return false; + } + if ( t4.getRoot().getNumberOfDescendants() != 0 ) { + return false; + } + final Phylogeny[] phylogenies_4 = factory.create( Test.PATH_TO_TEST_DATA + "tol_16299.tol", parser ); + if ( parser.getErrorCount() > 0 ) { + System.out.println( parser.getErrorMessages().toString() ); + return false; + } + if ( phylogenies_4.length != 1 ) { + return false; + } + final Phylogeny t5 = phylogenies_4[ 0 ]; + if ( t5.getNumberOfExternalNodes() != 13 ) { + return false; + } + if ( !t5.getRoot().getNodeData().getTaxonomy().getScientificName().equals( "Hominidae" ) ) { + return false; + } + if ( !t5.getRoot().getNodeData().getTaxonomy().getIdentifier().getValue().equals( "16299" ) ) { + return false; + } + if ( t5.getRoot().getNumberOfDescendants() != 2 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testBasicTreeMethods() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny t1 = factory.create(); + if ( !t1.isEmpty() ) { + return false; + } + final Phylogeny t2 = factory.create( "((A:1,B:2)AB:1,(C:3,D:5)CD:3)ABCD:0.5", new NHXParser() )[ 0 ]; + if ( t2.getNumberOfExternalNodes() != 4 ) { + return false; + } + if ( t2.getHeight() != 8.5 ) { + return false; + } + if ( !t2.isCompletelyBinary() ) { + return false; + } + if ( t2.isEmpty() ) { + return false; + } + final Phylogeny t3 = factory.create( "((A:1,B:2,C:10)ABC:1,(D:3,E:5)DE:3)", new NHXParser() )[ 0 ]; + if ( t3.getNumberOfExternalNodes() != 5 ) { + return false; + } + if ( t3.getHeight() != 11 ) { + return false; + } + if ( t3.isCompletelyBinary() ) { + return false; + } + final PhylogenyNode n = t3.getNode( "ABC" ); + PhylogenyNodeIterator it; + for( it = n.iterateChildNodesForward(); it.hasNext(); ) { + it.next(); + } + for( it.reset(); it.hasNext(); ) { + it.next(); + } + final PhylogenyNodeIterator it2 = n.iterateChildNodesForward(); + if ( !it2.next().getName().equals( "A" ) ) { + return false; + } + if ( !it2.next().getName().equals( "B" ) ) { + return false; + } + if ( !it2.next().getName().equals( "C" ) ) { + return false; + } + if ( it2.hasNext() ) { + return false; + } + final Phylogeny t4 = factory.create( "((A:1,B:2,C:10)ABC:1,(D:3,E:5)DE:3,(F,G,H,I))", new NHXParser() )[ 0 ]; + if ( t4.getNumberOfExternalNodes() != 9 ) { + return false; + } + if ( t4.getHeight() != 11 ) { + return false; + } + if ( t4.isCompletelyBinary() ) { + return false; + } + final StringBuffer sb5 = new StringBuffer( "(((A11:2)A1:2,(A21:1,A22:2,A23)A2:11,A3:2)A:2,B:10,C:3,D:8)" ); + final Phylogeny t5 = factory.create( sb5, new NHXParser() )[ 0 ]; + if ( t5.getNumberOfExternalNodes() != 8 ) { + return false; + } + if ( t5.getHeight() != 15 ) { + return false; + } + final StringBuffer sb6 = new StringBuffer( "(X,Y,Z,(((A111)A11:2)A1:2,(X,Y,Z,A21:1,A22:2,A23)A2:11,A3:2)A:2,B:10,C:3,D:8)" ); + final Phylogeny t6 = factory.create( sb6, new NHXParser() )[ 0 ]; + if ( t6.getHeight() != 15 ) 
{ + return false; + } + final StringBuffer sb7 = new StringBuffer( "(((A11:2)A1:2,(A21:1,A22:2,A23)A2:11,A3:2)A:2,B:10,C:15,D:8)" ); + final Phylogeny t7 = factory.create( sb7, new NHXParser() )[ 0 ]; + if ( t7.getHeight() != 15 ) { + return false; + } + final StringBuffer sb8 = new StringBuffer( "(((A11:11)A1:2,(A21:2,A22:2,A23,A24,AA:)A2:11,A3:2)A:2,B:15,C:15,D:15)" ); + final Phylogeny t8 = factory.create( sb8, new NHXParser() )[ 0 ]; + if ( t8.getNumberOfExternalNodes() != 10 ) { + return false; + } + if ( t8.getHeight() != 15 ) { + return false; + } + final char[] a9 = new char[] {}; + final Phylogeny t9 = factory.create( a9, new NHXParser() )[ 0 ]; + if ( t9.getHeight() != 0 ) { + return false; + } + final char[] a10 = new char[] { 'a', ':', '6' }; + final Phylogeny t10 = factory.create( a10, new NHXParser() )[ 0 ]; + if ( t10.getHeight() != 6 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testConfidenceAssessor() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny t0 = factory.create( "((((A,B)ab,C)abc,D)abcd,E)abcde", new NHXParser() )[ 0 ]; + final Phylogeny[] ev0 = factory + .create( "((((A,B),C),D),E);((((A,B),C),D),E);((((A,B),C),D),E);((((A,B),C),D),E);", + new NHXParser() ); + ConfidenceAssessor.evaluate( "bootstrap", ev0, t0, false, 1, 0, 2 ); + if ( !isEqual( t0.getNode( "ab" ).getBranchData().getConfidence( 0 ).getValue(), 3 ) ) { + return false; + } + if ( !isEqual( t0.getNode( "abc" ).getBranchData().getConfidence( 0 ).getValue(), 3 ) ) { + return false; + } + final Phylogeny t1 = factory.create( "((((A,B)ab[&&NHX:B=50],C)abc,D)abcd,E)abcde", new NHXParser() )[ 0 ]; + final Phylogeny[] ev1 = factory + .create( "((((A,B),C),D),E);((A,B),((E,D),C));(((A,B),C),(E,D));(A,(((E,D),C),B));(B,(A,((E,D),C)));(C,((E,D),(A,B)));(D,(E,((A,B),C)));", + new NHXParser() ); + ConfidenceAssessor.evaluate( "bootstrap", ev1, t1, false, 1 ); + if ( !isEqual( t1.getNode( "ab" ).getBranchData().getConfidence( 1 ).getValue(), 7 ) ) { + return false; + } + if ( !isEqual( t1.getNode( "abc" ).getBranchData().getConfidence( 0 ).getValue(), 7 ) ) { + return false; + } + final Phylogeny t_b = factory.create( "((((A,C)ac,D)acd,E)acde,B)abcde", new NHXParser() )[ 0 ]; + final Phylogeny[] ev_b = factory + .create( "((A,C),X);((A,X),C);(A,C);((((A,B),C),D),E);((A,B),((E,D),C));(((A,B),C),(E,D));(A,(((E,D),C),B));(B,(A,((E,D),C)));(C,((E,D),(A,B)));(D,(E,((A,B),C)));((((A,C)ac,D)acd,E)acde,B)abcd", + new NHXParser() ); + ConfidenceAssessor.evaluate( "bootstrap", ev_b, t_b, false, 1 ); + // Archaeopteryx.createApplication( t_b ); //TODO use me again me working here... 
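+ // t_b groups A with C ("ac"); the assertions below check the support counts computed from the evidence trees in ev_b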
+ if ( !isEqual( t_b.getNode( "ac" ).getBranchData().getConfidence( 0 ).getValue(), 4 ) ) { + return false; + } + if ( !isEqual( t_b.getNode( "acd" ).getBranchData().getConfidence( 0 ).getValue(), 1 ) ) { + return false; + } + // + final Phylogeny t1x = factory.create( "((((A,B)ab,C)abc,D)abcd,E)abcde", new NHXParser() )[ 0 ]; + final Phylogeny[] ev1x = factory + .create( "((((A,B),C),D),E);((A,B),((E,D),C));(((A,B),C),(E,D));(A,(((E,D),C),B));(B,(A,((E,D),C)));(C,((E,D),(A,B)));(D,(E,((A,B),C)));", + new NHXParser() ); + ConfidenceAssessor.evaluate( "bootstrap", ev1x, t1x, true, 1 ); + if ( !isEqual( t1x.getNode( "ab" ).getBranchData().getConfidence( 0 ).getValue(), 7 ) ) { + return false; + } + if ( !isEqual( t1x.getNode( "abc" ).getBranchData().getConfidence( 0 ).getValue(), 7 ) ) { + return false; + } + final Phylogeny t_bx = factory.create( "((((A,C)ac,D)acd,E)acde,B)abcde", new NHXParser() )[ 0 ]; + final Phylogeny[] ev_bx = factory + .create( "((((A,B),C),D),E);((A,B),((E,D),C));(((A,B),C),(E,D));(A,(((E,D),C),B));(B,(A,((E,D),C)));(C,((E,D),(A,B)));(D,(E,((A,B),C)));((((A,C)ac,D)acd,E)acde,B)abcd", + new NHXParser() ); + ConfidenceAssessor.evaluate( "bootstrap", ev_bx, t_bx, true, 1 ); + if ( !isEqual( t_bx.getNode( "ac" ).getBranchData().getConfidence( 0 ).getValue(), 1 ) ) { + return false; + } + if ( !isEqual( t_bx.getNode( "acd" ).getBranchData().getConfidence( 0 ).getValue(), 1 ) ) { + return false; + } + // + final Phylogeny[] t2 = factory + .create( "((((a,b),c),d),e);(((a,b),c),(d,e));(((((a,b),c),d),e),f);((((a,b),c),(d,e)),f);(((a,b),c),d,e);((a,b,c),d,e);", + new NHXParser() ); + final Phylogeny[] ev2 = factory + .create( "((((a,b),c),d),e);((((a,b),c),d),e);((((a,b),e),d),c);((((a,b),e),d),c);(((a,b),(c,d)),e);((a,b),x);((a,b),(x,y));(a,b);(a,e);(a,b,c);", + new NHXParser() ); + for( final Phylogeny target : t2 ) { + ConfidenceAssessor.evaluate( "bootstrap", ev2, target, false, 1 ); + } + // + final Phylogeny t4 = factory.create( "((((((A,B)ab,C)abc,D)abcd,E)abcde,F)abcdef,G)abcdefg", + new NHXParser() )[ 0 ]; + final Phylogeny[] ev4 = factory.create( "(((A,B),C),(X,Y));((F,G),((A,B,C),(D,E)))", new NHXParser() ); + ConfidenceAssessor.evaluate( "bootstrap", ev4, t4, false, 1 ); + if ( !isEqual( t4.getNode( "ab" ).getBranchData().getConfidence( 0 ).getValue(), 1 ) ) { + return false; + } + if ( !isEqual( t4.getNode( "abc" ).getBranchData().getConfidence( 0 ).getValue(), 2 ) ) { + return false; + } + if ( !isEqual( t4.getNode( "abcde" ).getBranchData().getConfidence( 0 ).getValue(), 1 ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace(); + return false; + } + return true; + } + + private static boolean testCopyOfNodeData() { + try { + final PhylogenyNode n1 = new PhylogenyNode( "n5:0.1[&&NHX:S=Ecoli:E=1.1.1.1:D=Y:Co=Y:B=56:T=1:O=22:SO=33:SN=44:W=2:C=10.20.30:XN=S=tag1=value1=unit1]" ); + final PhylogenyNode n2 = n1.copyNodeData(); + if ( !n1.toNewHampshireX().equals( n2.toNewHampshireX() ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace(); + return false; + } + return true; + } + + private static boolean testDataObjects() { + try { + final Confidence s0 = new Confidence(); + final Confidence s1 = new Confidence(); + if ( !s0.isEqual( s1 ) ) { + return false; + } + final Confidence s2 = new Confidence( 0.23, "bootstrap" ); + final Confidence s3 = new Confidence( 0.23, "bootstrap" ); + if ( s2.isEqual( s1 ) ) { + return false; + } + if ( !s2.isEqual( s3 ) ) { + return false; + } + final Confidence s4 = ( 
Confidence ) s3.copy(); + if ( !s4.isEqual( s3 ) ) { + return false; + } + s3.asSimpleText(); + s3.asText(); + // Taxonomy + // ---------- + final Taxonomy t1 = new Taxonomy(); + final Taxonomy t2 = new Taxonomy(); + final Taxonomy t3 = new Taxonomy(); + final Taxonomy t4 = new Taxonomy(); + final Taxonomy t5 = new Taxonomy(); + t1.setIdentifier( new Identifier( "ecoli" ) ); + t1.setTaxonomyCode( "ECOLI" ); + t1.setScientificName( "E. coli" ); + t1.setCommonName( "coli" ); + final Taxonomy t0 = ( Taxonomy ) t1.copy(); + if ( !t1.isEqual( t0 ) ) { + return false; + } + t2.setIdentifier( new Identifier( "ecoli" ) ); + t2.setTaxonomyCode( "other" ); + t2.setScientificName( "what" ); + t2.setCommonName( "something" ); + if ( !t1.isEqual( t2 ) ) { + return false; + } + t2.setIdentifier( new Identifier( "nemve" ) ); + if ( t1.isEqual( t2 ) ) { + return false; + } + t1.setIdentifier( null ); + t3.setTaxonomyCode( "ECOLI" ); + t3.setScientificName( "what" ); + t3.setCommonName( "something" ); + if ( !t1.isEqual( t3 ) ) { + return false; + } + t1.setIdentifier( null ); + t1.setTaxonomyCode( "" ); + t4.setScientificName( "E. ColI" ); + t4.setCommonName( "something" ); + if ( !t1.isEqual( t4 ) ) { + return false; + } + t4.setScientificName( "B. subtilis" ); + t4.setCommonName( "something" ); + if ( t1.isEqual( t4 ) ) { + return false; + } + t1.setIdentifier( null ); + t1.setTaxonomyCode( "" ); + t1.setScientificName( "" ); + t5.setCommonName( "COLI" ); + if ( !t1.isEqual( t5 ) ) { + return false; + } + t5.setCommonName( "vibrio" ); + if ( t1.isEqual( t5 ) ) { + return false; + } + // Identifier + // ---------- + final Identifier id0 = new Identifier( "123", "pfam" ); + final Identifier id1 = ( Identifier ) id0.copy(); + if ( !id1.isEqual( id1 ) ) { + return false; + } + if ( !id1.isEqual( id0 ) ) { + return false; + } + if ( !id0.isEqual( id1 ) ) { + return false; + } + id1.asSimpleText(); + id1.asText(); + // ProteinDomain + // --------------- + final ProteinDomain pd0 = new ProteinDomain( "abc", 100, 200 ); + final ProteinDomain pd1 = ( ProteinDomain ) pd0.copy(); + if ( !pd1.isEqual( pd1 ) ) { + return false; + } + if ( !pd1.isEqual( pd0 ) ) { + return false; + } + pd1.asSimpleText(); + pd1.asText(); + final ProteinDomain pd2 = new ProteinDomain( pd0.getName(), pd0.getFrom(), pd0.getTo(), "id" ); + final ProteinDomain pd3 = ( ProteinDomain ) pd2.copy(); + if ( !pd3.isEqual( pd3 ) ) { + return false; + } + if ( !pd2.isEqual( pd3 ) ) { + return false; + } + if ( !pd0.isEqual( pd3 ) ) { + return false; + } + pd3.asSimpleText(); + pd3.asText(); + // DomainArchitecture + // ------------------ + final ProteinDomain d0 = new ProteinDomain( "domain0", 10, 20 ); + final ProteinDomain d1 = new ProteinDomain( "domain1", 30, 40 ); + final ProteinDomain d2 = new ProteinDomain( "domain2", 50, 60 ); + final ProteinDomain d3 = new ProteinDomain( "domain3", 70, 80 ); + final ProteinDomain d4 = new ProteinDomain( "domain4", 90, 100 ); + final ArrayList domains0 = new ArrayList(); + domains0.add( d2 ); + domains0.add( d0 ); + domains0.add( d3 ); + domains0.add( d1 ); + final DomainArchitecture ds0 = new DomainArchitecture( domains0, 110 ); + if ( ds0.getNumberOfDomains() != 4 ) { + return false; + } + final DomainArchitecture ds1 = ( DomainArchitecture ) ds0.copy(); + if ( !ds0.isEqual( ds0 ) ) { + return false; + } + if ( !ds0.isEqual( ds1 ) ) { + return false; + } + if ( ds1.getNumberOfDomains() != 4 ) { + return false; + } + final ArrayList domains1 = new ArrayList(); + domains1.add( d1 ); + domains1.add( d2 ); + 
domains1.add( d4 ); + domains1.add( d0 ); + final DomainArchitecture ds2 = new DomainArchitecture( domains1, 200 ); + if ( ds0.isEqual( ds2 ) ) { + return false; + } + ds1.asSimpleText(); + ds1.asText(); + ds1.toNHX(); + final DomainArchitecture ds3 = new DomainArchitecture( "120>30>40>0.9>b>50>60>0.4>c>10>20>0.1>a" ); + if ( !ds3.toNHX().toString().equals( ":DS=120>10>20>0.1>a>30>40>0.9>b>50>60>0.4>c" ) ) { + System.out.println( ds3.toNHX() ); + return false; + } + if ( ds3.getNumberOfDomains() != 3 ) { + return false; + } + // Event + // ----- + final Event e1 = new Event( Event.EventType.fusion ); + if ( e1.isDuplication() ) { + return false; + } + if ( !e1.isFusion() ) { + return false; + } + if ( !e1.asText().toString().equals( "fusion" ) ) { + return false; + } + if ( !e1.asSimpleText().toString().equals( "fusion" ) ) { + return false; + } + final Event e11 = new Event( Event.EventType.fusion ); + if ( !e11.isEqual( e1 ) ) { + return false; + } + if ( !e11.toNHX().toString().equals( "" ) ) { + return false; + } + final Event e2 = new Event( Event.EventType.speciation_or_duplication ); + if ( e2.isDuplication() ) { + return false; + } + if ( !e2.isSpeciationOrDuplication() ) { + return false; + } + if ( !e2.asText().toString().equals( "speciation_or_duplication" ) ) { + return false; + } + if ( !e2.asSimpleText().toString().equals( "?" ) ) { + return false; + } + if ( !e2.toNHX().toString().equals( ":D=?" ) ) { + return false; + } + if ( e11.isEqual( e2 ) ) { + return false; + } + final Event e2c = ( Event ) e2.copy(); + if ( !e2c.isEqual( e2 ) ) { + return false; + } + Event e3 = new Event( 1, 2, 3 ); + if ( e3.isDuplication() ) { + return false; + } + if ( e3.isSpeciation() ) { + return false; + } + if ( e3.isGeneLoss() ) { + return false; + } + if ( !e3.asText().toString().equals( "duplications [1] speciations [2] gene-losses [3]" ) ) { + return false; + } + final Event e3c = ( Event ) e3.copy(); + final Event e3cc = ( Event ) e3c.copy(); + if ( !e3c.asSimpleText().toString().equals( "D2S3L" ) ) { + return false; + } + e3 = null; + if ( !e3c.isEqual( e3cc ) ) { + return false; + } + Event e4 = new Event( 1, 2, 3 ); + if ( !e4.asText().toString().equals( "duplications [1] speciations [2] gene-losses [3]" ) ) { + return false; + } + if ( !e4.asSimpleText().toString().equals( "D2S3L" ) ) { + return false; + } + final Event e4c = ( Event ) e4.copy(); + e4 = null; + final Event e4cc = ( Event ) e4c.copy(); + if ( !e4cc.asText().toString().equals( "duplications [1] speciations [2] gene-losses [3]" ) ) { + return false; + } + if ( !e4c.isEqual( e4cc ) ) { + return false; + } + final Event e5 = new Event(); + if ( !e5.isUnassigned() ) { + return false; + } + if ( !e5.asText().toString().equals( "unassigned" ) ) { + return false; + } + if ( !e5.asSimpleText().toString().equals( "" ) ) { + return false; + } + final Event e6 = new Event( 1, 0, 0 ); + if ( !e6.asText().toString().equals( "duplication" ) ) { + return false; + } + if ( !e6.asSimpleText().toString().equals( "D" ) ) { + return false; + } + final Event e7 = new Event( 0, 1, 0 ); + if ( !e7.asText().toString().equals( "speciation" ) ) { + return false; + } + if ( !e7.asSimpleText().toString().equals( "S" ) ) { + return false; + } + final Event e8 = new Event( 0, 0, 1 ); + if ( !e8.asText().toString().equals( "gene-loss" ) ) { + return false; + } + if ( !e8.asSimpleText().toString().equals( "L" ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + 
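+ // The next test exercises Phylogeny.deleteSubtree(); examples of the transformations asserted below (second argument set to true):
+ // ((A,((B11,B12),B2)),(C,D)) minus "B2" -> ((A,(B11,B12)),(C,D))
+ // (A,B,C,(D:1.0,E:2.0):3.0) minus "D" -> (A,B,C,E:5.0) (branch lengths 2.0 and 3.0 are summed)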
private static boolean testDeletionOfExternalNodes() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny t0 = factory.create( "A", new NHXParser() )[ 0 ]; + final PhylogenyWriter w = new PhylogenyWriter(); + if ( t0.isEmpty() ) { + return false; + } + if ( t0.getNumberOfExternalNodes() != 1 ) { + return false; + } + t0.deleteSubtree( t0.getNode( "A" ), false ); + if ( t0.getNumberOfExternalNodes() != 0 ) { + return false; + } + if ( !t0.isEmpty() ) { + return false; + } + final Phylogeny t1 = factory.create( "(A,B)r", new NHXParser() )[ 0 ]; + if ( t1.getNumberOfExternalNodes() != 2 ) { + return false; + } + t1.deleteSubtree( t1.getNode( "A" ), false ); + if ( t1.getNumberOfExternalNodes() != 1 ) { + return false; + } + if ( !t1.getNode( "B" ).getName().equals( "B" ) ) { + return false; + } + t1.deleteSubtree( t1.getNode( "B" ), false ); + if ( t1.getNumberOfExternalNodes() != 1 ) { + return false; + } + t1.deleteSubtree( t1.getNode( "r" ), false ); + if ( !t1.isEmpty() ) { + return false; + } + final Phylogeny t2 = factory.create( "((A,B),C)", new NHXParser() )[ 0 ]; + if ( t2.getNumberOfExternalNodes() != 3 ) { + return false; + } + t2.deleteSubtree( t2.getNode( "B" ), false ); + if ( t2.getNumberOfExternalNodes() != 2 ) { + return false; + } + t2.toNewHampshireX(); + PhylogenyNode n = t2.getNode( "A" ); + if ( !n.getNextExternalNode().getName().equals( "C" ) ) { + return false; + } + t2.deleteSubtree( t2.getNode( "A" ), false ); + if ( t2.getNumberOfExternalNodes() != 2 ) { + return false; + } + t2.deleteSubtree( t2.getNode( "C" ), true ); + if ( t2.getNumberOfExternalNodes() != 1 ) { + return false; + } + final Phylogeny t3 = factory.create( "((A,B),(C,D))", new NHXParser() )[ 0 ]; + if ( t3.getNumberOfExternalNodes() != 4 ) { + return false; + } + t3.deleteSubtree( t3.getNode( "B" ), true ); + if ( t3.getNumberOfExternalNodes() != 3 ) { + return false; + } + n = t3.getNode( "A" ); + if ( !n.getNextExternalNode().getName().equals( "C" ) ) { + return false; + } + n = n.getNextExternalNode(); + if ( !n.getNextExternalNode().getName().equals( "D" ) ) { + return false; + } + t3.deleteSubtree( t3.getNode( "A" ), true ); + if ( t3.getNumberOfExternalNodes() != 2 ) { + return false; + } + n = t3.getNode( "C" ); + if ( !n.getNextExternalNode().getName().equals( "D" ) ) { + return false; + } + t3.deleteSubtree( t3.getNode( "C" ), true ); + if ( t3.getNumberOfExternalNodes() != 1 ) { + return false; + } + t3.deleteSubtree( t3.getNode( "D" ), true ); + if ( t3.getNumberOfExternalNodes() != 0 ) { + return false; + } + final Phylogeny t4 = factory.create( "((A,((B11,B12),B2)),(C,D))", new NHXParser() )[ 0 ]; + if ( t4.getNumberOfExternalNodes() != 6 ) { + return false; + } + t4.deleteSubtree( t4.getNode( "B2" ), true ); + if ( t4.getNumberOfExternalNodes() != 5 ) { + return false; + } + String s = w.toNewHampshire( t4, false, true ).toString(); + if ( !s.equals( "((A,(B11,B12)),(C,D));" ) ) { + return false; + } + t4.deleteSubtree( t4.getNode( "B11" ), true ); + if ( t4.getNumberOfExternalNodes() != 4 ) { + return false; + } + t4.deleteSubtree( t4.getNode( "C" ), true ); + if ( t4.getNumberOfExternalNodes() != 3 ) { + return false; + } + n = t4.getNode( "A" ); + n = n.getNextExternalNode(); + if ( !n.getName().equals( "B12" ) ) { + return false; + } + n = n.getNextExternalNode(); + if ( !n.getName().equals( "D" ) ) { + return false; + } + s = w.toNewHampshire( t4, false, true ).toString(); + if ( !s.equals( "((A,B12),D);" ) ) { + return false; + 
} + final Phylogeny t5 = factory.create( "((A,((B11,B12),B2)),(C,D))", new NHXParser() )[ 0 ]; + t5.deleteSubtree( t5.getNode( "A" ), true ); + if ( t5.getNumberOfExternalNodes() != 5 ) { + return false; + } + s = w.toNewHampshire( t5, false, true ).toString(); + if ( !s.equals( "(((B11,B12),B2),(C,D));" ) ) { + return false; + } + final Phylogeny t6 = factory.create( "((A,((B11,B12),B2)),(C,D))", new NHXParser() )[ 0 ]; + t6.deleteSubtree( t6.getNode( "B11" ), true ); + if ( t6.getNumberOfExternalNodes() != 5 ) { + return false; + } + s = w.toNewHampshire( t6, false, false ).toString(); + if ( !s.equals( "((A,(B12,B2)),(C,D));" ) ) { + return false; + } + final Phylogeny t7 = factory.create( "((A,((B11,B12),B2)),(C,D))", new NHXParser() )[ 0 ]; + t7.deleteSubtree( t7.getNode( "B12" ), true ); + if ( t7.getNumberOfExternalNodes() != 5 ) { + return false; + } + s = w.toNewHampshire( t7, false, true ).toString(); + if ( !s.equals( "((A,(B11,B2)),(C,D));" ) ) { + return false; + } + final Phylogeny t8 = factory.create( "((A,((B11,B12),B2)),(C,D))", new NHXParser() )[ 0 ]; + t8.deleteSubtree( t8.getNode( "B2" ), true ); + if ( t8.getNumberOfExternalNodes() != 5 ) { + return false; + } + s = w.toNewHampshire( t8, false, false ).toString(); + if ( !s.equals( "((A,(B11,B12)),(C,D));" ) ) { + return false; + } + final Phylogeny t9 = factory.create( "((A,((B11,B12),B2)),(C,D))", new NHXParser() )[ 0 ]; + t9.deleteSubtree( t9.getNode( "C" ), true ); + if ( t9.getNumberOfExternalNodes() != 5 ) { + return false; + } + s = w.toNewHampshire( t9, false, true ).toString(); + if ( !s.equals( "((A,((B11,B12),B2)),D);" ) ) { + return false; + } + final Phylogeny t10 = factory.create( "((A,((B11,B12),B2)),(C,D))", new NHXParser() )[ 0 ]; + t10.deleteSubtree( t10.getNode( "D" ), true ); + if ( t10.getNumberOfExternalNodes() != 5 ) { + return false; + } + s = w.toNewHampshire( t10, false, true ).toString(); + if ( !s.equals( "((A,((B11,B12),B2)),C);" ) ) { + return false; + } + final Phylogeny t11 = factory.create( "(A,B,C)", new NHXParser() )[ 0 ]; + t11.deleteSubtree( t11.getNode( "A" ), true ); + if ( t11.getNumberOfExternalNodes() != 2 ) { + return false; + } + s = w.toNewHampshire( t11, false, true ).toString(); + if ( !s.equals( "(B,C);" ) ) { + return false; + } + t11.deleteSubtree( t11.getNode( "C" ), true ); + if ( t11.getNumberOfExternalNodes() != 1 ) { + return false; + } + s = w.toNewHampshire( t11, false, false ).toString(); + if ( !s.equals( "B;" ) ) { + return false; + } + final Phylogeny t12 = factory.create( "((A1,A2,A3),(B1,B2,B3),(C1,C2,C3))", new NHXParser() )[ 0 ]; + t12.deleteSubtree( t12.getNode( "B2" ), true ); + if ( t12.getNumberOfExternalNodes() != 8 ) { + return false; + } + s = w.toNewHampshire( t12, false, true ).toString(); + if ( !s.equals( "((A1,A2,A3),(B1,B3),(C1,C2,C3));" ) ) { + return false; + } + t12.deleteSubtree( t12.getNode( "B3" ), true ); + if ( t12.getNumberOfExternalNodes() != 7 ) { + return false; + } + s = w.toNewHampshire( t12, false, true ).toString(); + if ( !s.equals( "((A1,A2,A3),B1,(C1,C2,C3));" ) ) { + return false; + } + t12.deleteSubtree( t12.getNode( "C3" ), true ); + if ( t12.getNumberOfExternalNodes() != 6 ) { + return false; + } + s = w.toNewHampshire( t12, false, true ).toString(); + if ( !s.equals( "((A1,A2,A3),B1,(C1,C2));" ) ) { + return false; + } + t12.deleteSubtree( t12.getNode( "A1" ), true ); + if ( t12.getNumberOfExternalNodes() != 5 ) { + return false; + } + s = w.toNewHampshire( t12, false, true ).toString(); + if ( !s.equals( 
"((A2,A3),B1,(C1,C2));" ) ) { + return false; + } + t12.deleteSubtree( t12.getNode( "B1" ), true ); + if ( t12.getNumberOfExternalNodes() != 4 ) { + return false; + } + s = w.toNewHampshire( t12, false, true ).toString(); + if ( !s.equals( "((A2,A3),(C1,C2));" ) ) { + return false; + } + t12.deleteSubtree( t12.getNode( "A3" ), true ); + if ( t12.getNumberOfExternalNodes() != 3 ) { + return false; + } + s = w.toNewHampshire( t12, false, true ).toString(); + if ( !s.equals( "(A2,(C1,C2));" ) ) { + return false; + } + t12.deleteSubtree( t12.getNode( "A2" ), true ); + if ( t12.getNumberOfExternalNodes() != 2 ) { + return false; + } + s = w.toNewHampshire( t12, false, true ).toString(); + if ( !s.equals( "(C1,C2);" ) ) { + return false; + } + final Phylogeny t13 = factory.create( "(A,B,C,(D:1.0,E:2.0):3.0)", new NHXParser() )[ 0 ]; + t13.deleteSubtree( t13.getNode( "D" ), true ); + if ( t13.getNumberOfExternalNodes() != 4 ) { + return false; + } + s = w.toNewHampshire( t13, false, true ).toString(); + if ( !s.equals( "(A,B,C,E:5.0);" ) ) { + return false; + } + final Phylogeny t14 = factory.create( "((A,B,C,(D:0.1,E:0.4):1.0),F)", new NHXParser() )[ 0 ]; + t14.deleteSubtree( t14.getNode( "E" ), true ); + if ( t14.getNumberOfExternalNodes() != 5 ) { + return false; + } + s = w.toNewHampshire( t14, false, true ).toString(); + if ( !s.equals( "((A,B,C,D:1.1),F);" ) ) { + return false; + } + final Phylogeny t15 = factory.create( "((A1,A2,A3,A4),(B1,B2,B3,B4),(C1,C2,C3,C4))", new NHXParser() )[ 0 ]; + t15.deleteSubtree( t15.getNode( "B2" ), true ); + if ( t15.getNumberOfExternalNodes() != 11 ) { + return false; + } + t15.deleteSubtree( t15.getNode( "B1" ), true ); + if ( t15.getNumberOfExternalNodes() != 10 ) { + return false; + } + t15.deleteSubtree( t15.getNode( "B3" ), true ); + if ( t15.getNumberOfExternalNodes() != 9 ) { + return false; + } + t15.deleteSubtree( t15.getNode( "B4" ), true ); + if ( t15.getNumberOfExternalNodes() != 8 ) { + return false; + } + t15.deleteSubtree( t15.getNode( "A1" ), true ); + if ( t15.getNumberOfExternalNodes() != 7 ) { + return false; + } + t15.deleteSubtree( t15.getNode( "C4" ), true ); + if ( t15.getNumberOfExternalNodes() != 6 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testDescriptiveStatistics() { + try { + final DescriptiveStatistics dss1 = new BasicDescriptiveStatistics(); + dss1.addValue( 82 ); + dss1.addValue( 78 ); + dss1.addValue( 70 ); + dss1.addValue( 58 ); + dss1.addValue( 42 ); + if ( dss1.getN() != 5 ) { + return false; + } + if ( !Test.isEqual( dss1.getMin(), 42 ) ) { + return false; + } + if ( !Test.isEqual( dss1.getMax(), 82 ) ) { + return false; + } + if ( !Test.isEqual( dss1.arithmeticMean(), 66 ) ) { + return false; + } + if ( !Test.isEqual( dss1.sampleStandardDeviation(), 16.24807680927192 ) ) { + return false; + } + if ( !Test.isEqual( dss1.median(), 70 ) ) { + return false; + } + if ( !Test.isEqual( dss1.midrange(), 62 ) ) { + return false; + } + if ( !Test.isEqual( dss1.sampleVariance(), 264 ) ) { + return false; + } + if ( !Test.isEqual( dss1.pearsonianSkewness(), -0.7385489458759964 ) ) { + return false; + } + if ( !Test.isEqual( dss1.coefficientOfVariation(), 0.24618298195866547 ) ) { + return false; + } + if ( !Test.isEqual( dss1.sampleStandardUnit( 66 - 16.24807680927192 ), -1.0 ) ) { + return false; + } + if ( !Test.isEqual( dss1.getValue( 1 ), 78 ) ) { + return false; + } + dss1.addValue( 123 ); + if ( 
!Test.isEqual( dss1.arithmeticMean(), 75.5 ) ) { + return false; + } + if ( !Test.isEqual( dss1.getMax(), 123 ) ) { + return false; + } + if ( !Test.isEqual( dss1.standardErrorOfMean(), 11.200446419674531 ) ) { + return false; + } + final DescriptiveStatistics dss2 = new BasicDescriptiveStatistics(); + dss2.addValue( -1.85 ); + dss2.addValue( 57.5 ); + dss2.addValue( 92.78 ); + dss2.addValue( 57.78 ); + if ( !Test.isEqual( dss2.median(), 57.64 ) ) { + return false; + } + if ( !Test.isEqual( dss2.sampleStandardDeviation(), 39.266984753946495 ) ) { + return false; + } + final double[] a = dss2.getDataAsDoubleArray(); + if ( !Test.isEqual( a[ 3 ], 57.78 ) ) { + return false; + } + dss2.addValue( -100 ); + if ( !Test.isEqual( dss2.sampleStandardDeviation(), 75.829111296388 ) ) { + return false; + } + if ( !Test.isEqual( dss2.sampleVariance(), 5750.05412 ) ) { + return false; + } + final double[] ds = new double[ 14 ]; + ds[ 0 ] = 34; + ds[ 1 ] = 23; + ds[ 2 ] = 1; + ds[ 3 ] = 32; + ds[ 4 ] = 11; + ds[ 5 ] = 2; + ds[ 6 ] = 12; + ds[ 7 ] = 33; + ds[ 8 ] = 13; + ds[ 9 ] = 22; + ds[ 10 ] = 21; + ds[ 11 ] = 35; + ds[ 12 ] = 24; + ds[ 13 ] = 31; + final int[] bins = BasicDescriptiveStatistics.performBinning( ds, 0, 40, 4 ); + if ( bins.length != 4 ) { + return false; + } + if ( bins[ 0 ] != 2 ) { + return false; + } + if ( bins[ 1 ] != 3 ) { + return false; + } + if ( bins[ 2 ] != 4 ) { + return false; + } + if ( bins[ 3 ] != 5 ) { + return false; + } + final double[] ds1 = new double[ 9 ]; + ds1[ 0 ] = 10.0; + ds1[ 1 ] = 19.0; + ds1[ 2 ] = 9.999; + ds1[ 3 ] = 0.0; + ds1[ 4 ] = 39.9; + ds1[ 5 ] = 39.999; + ds1[ 6 ] = 30.0; + ds1[ 7 ] = 19.999; + ds1[ 8 ] = 30.1; + final int[] bins1 = BasicDescriptiveStatistics.performBinning( ds1, 0, 40, 4 ); + if ( bins1.length != 4 ) { + return false; + } + if ( bins1[ 0 ] != 2 ) { + return false; + } + if ( bins1[ 1 ] != 3 ) { + return false; + } + if ( bins1[ 2 ] != 0 ) { + return false; + } + if ( bins1[ 3 ] != 4 ) { + return false; + } + final int[] bins1_1 = BasicDescriptiveStatistics.performBinning( ds1, 0, 40, 3 ); + if ( bins1_1.length != 3 ) { + return false; + } + if ( bins1_1[ 0 ] != 3 ) { + return false; + } + if ( bins1_1[ 1 ] != 2 ) { + return false; + } + if ( bins1_1[ 2 ] != 4 ) { + return false; + } + final int[] bins1_2 = BasicDescriptiveStatistics.performBinning( ds1, 1, 39, 3 ); + if ( bins1_2.length != 3 ) { + return false; + } + if ( bins1_2[ 0 ] != 2 ) { + return false; + } + if ( bins1_2[ 1 ] != 2 ) { + return false; + } + if ( bins1_2[ 2 ] != 2 ) { + return false; + } + final DescriptiveStatistics dss3 = new BasicDescriptiveStatistics(); + dss3.addValue( 1 ); + dss3.addValue( 1 ); + dss3.addValue( 1 ); + dss3.addValue( 2 ); + dss3.addValue( 3 ); + dss3.addValue( 4 ); + dss3.addValue( 5 ); + dss3.addValue( 5 ); + dss3.addValue( 5 ); + dss3.addValue( 6 ); + dss3.addValue( 7 ); + dss3.addValue( 8 ); + dss3.addValue( 9 ); + dss3.addValue( 10 ); + dss3.addValue( 10 ); + dss3.addValue( 10 ); + final AsciiHistogram histo = new AsciiHistogram( dss3 ); + histo.toStringBuffer( 10, '=', 40, 5 ); + histo.toStringBuffer( 3, 8, 10, '=', 40, 5 ); + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testDir( final String file ) { + try { + final File f = new File( file ); + if ( !f.exists() ) { + return false; + } + if ( !f.isDirectory() ) { + return false; + } + if ( !f.canRead() ) { + return false; + } + } + catch ( final Exception e ) { + return false; + } + return 
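For reference, a small sketch of the descriptive-statistics API checked by testDescriptiveStatistics above, using the same input values; the helper name is hypothetical and imports are omitted.

    // Sketch only: basic summary statistics and binning.
    static void statsSketch() {
        final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
        stats.addValue( 82 );
        stats.addValue( 78 );
        stats.addValue( 70 );
        stats.addValue( 58 );
        stats.addValue( 42 );
        System.out.println( stats.arithmeticMean() );          // 66.0
        System.out.println( stats.median() );                  // 70.0
        System.out.println( stats.sampleStandardDeviation() ); // ~16.248
        // performBinning( values, min, max, number_of_bins ) returns one count per bin.
        final double[] values = { 34, 23, 1, 32, 11, 2, 12, 33, 13, 22, 21, 35, 24, 31 };
        final int[] bins = BasicDescriptiveStatistics.performBinning( values, 0, 40, 4 );
        System.out.println( java.util.Arrays.toString( bins ) ); // [2, 3, 4, 5]
    }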
true; + } + + private static boolean testExternalNodeRelatedMethods() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny t1 = factory.create( "((A,B),(C,D))", new NHXParser() )[ 0 ]; + PhylogenyNode n = t1.getNode( "A" ); + n = n.getNextExternalNode(); + if ( !n.getName().equals( "B" ) ) { + return false; + } + n = n.getNextExternalNode(); + if ( !n.getName().equals( "C" ) ) { + return false; + } + n = n.getNextExternalNode(); + if ( !n.getName().equals( "D" ) ) { + return false; + } + n = t1.getNode( "B" ); + while ( !n.isLastExternalNode() ) { + n = n.getNextExternalNode(); + } + final Phylogeny t2 = factory.create( "(((A,B),C),D)", new NHXParser() )[ 0 ]; + n = t2.getNode( "A" ); + n = n.getNextExternalNode(); + if ( !n.getName().equals( "B" ) ) { + return false; + } + n = n.getNextExternalNode(); + if ( !n.getName().equals( "C" ) ) { + return false; + } + n = n.getNextExternalNode(); + if ( !n.getName().equals( "D" ) ) { + return false; + } + n = t2.getNode( "B" ); + while ( !n.isLastExternalNode() ) { + n = n.getNextExternalNode(); + } + final Phylogeny t3 = factory.create( "(((A,B),(C,D)),((E,F),(G,H)))", new NHXParser() )[ 0 ]; + n = t3.getNode( "A" ); + n = n.getNextExternalNode(); + if ( !n.getName().equals( "B" ) ) { + return false; + } + n = n.getNextExternalNode(); + if ( !n.getName().equals( "C" ) ) { + return false; + } + n = n.getNextExternalNode(); + if ( !n.getName().equals( "D" ) ) { + return false; + } + n = n.getNextExternalNode(); + if ( !n.getName().equals( "E" ) ) { + return false; + } + n = n.getNextExternalNode(); + if ( !n.getName().equals( "F" ) ) { + return false; + } + n = n.getNextExternalNode(); + if ( !n.getName().equals( "G" ) ) { + return false; + } + n = n.getNextExternalNode(); + if ( !n.getName().equals( "H" ) ) { + return false; + } + n = t3.getNode( "B" ); + while ( !n.isLastExternalNode() ) { + n = n.getNextExternalNode(); + } + final Phylogeny t4 = factory.create( "((A,B),(C,D))", new NHXParser() )[ 0 ]; + for( final PhylogenyNodeIterator iter = t4.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + } + final Phylogeny t5 = factory.create( "(((A,B),(C,D)),((E,F),(G,H)))", new NHXParser() )[ 0 ]; + for( final PhylogenyNodeIterator iter = t5.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testGeneralTable() { + try { + final GeneralTable t0 = new GeneralTable(); + t0.setValue( 3, 2, "23" ); + t0.setValue( 10, 1, "error" ); + t0.setValue( 10, 1, "110" ); + t0.setValue( 9, 1, "19" ); + t0.setValue( 1, 10, "101" ); + t0.setValue( 10, 10, "1010" ); + t0.setValue( 100, 10, "10100" ); + t0.setValue( 0, 0, "00" ); + if ( !t0.getValue( 3, 2 ).equals( "23" ) ) { + return false; + } + if ( !t0.getValue( 10, 1 ).equals( "110" ) ) { + return false; + } + if ( !t0.getValueAsString( 1, 10 ).equals( "101" ) ) { + return false; + } + if ( !t0.getValueAsString( 10, 10 ).equals( "1010" ) ) { + return false; + } + if ( !t0.getValueAsString( 100, 10 ).equals( "10100" ) ) { + return false; + } + if ( !t0.getValueAsString( 9, 1 ).equals( "19" ) ) { + return false; + } + if ( !t0.getValueAsString( 0, 0 ).equals( "00" ) ) { + return false; + } + if ( !t0.getValueAsString( 49, 4 ).equals( "" ) ) { + return false; + } + if ( !t0.getValueAsString( 22349, 3434344 ).equals( "" ) ) { + return false; + } + 
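A short sketch of the external-node traversal tested by testExternalNodeRelatedMethods above; the helper name is hypothetical and imports are omitted.

    // Sketch only: walk the leaves left to right, then do the same with the iterator.
    static void externalTraversalSketch() throws Exception {
        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
        final Phylogeny t = factory.create( "((A,B),(C,D))", new NHXParser() )[ 0 ];
        PhylogenyNode n = t.getNode( "A" );
        while ( !n.isLastExternalNode() ) {
            System.out.println( n.getName() ); // A, B, C
            n = n.getNextExternalNode();
        }
        System.out.println( n.getName() );     // D
        for( final PhylogenyNodeIterator it = t.iteratorExternalForward(); it.hasNext(); ) {
            System.out.println( it.next().getName() ); // A, B, C, D again
        }
    }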
final GeneralTable t1 = new GeneralTable(); + t1.setValue( "3", "2", "23" ); + t1.setValue( "10", "1", "error" ); + t1.setValue( "10", "1", "110" ); + t1.setValue( "9", "1", "19" ); + t1.setValue( "1", "10", "101" ); + t1.setValue( "10", "10", "1010" ); + t1.setValue( "100", "10", "10100" ); + t1.setValue( "0", "0", "00" ); + t1.setValue( "qwerty", "zxcvbnm", "asdef" ); + if ( !t1.getValue( "3", "2" ).equals( "23" ) ) { + return false; + } + if ( !t1.getValue( "10", "1" ).equals( "110" ) ) { + return false; + } + if ( !t1.getValueAsString( "1", "10" ).equals( "101" ) ) { + return false; + } + if ( !t1.getValueAsString( "10", "10" ).equals( "1010" ) ) { + return false; + } + if ( !t1.getValueAsString( "100", "10" ).equals( "10100" ) ) { + return false; + } + if ( !t1.getValueAsString( "9", "1" ).equals( "19" ) ) { + return false; + } + if ( !t1.getValueAsString( "0", "0" ).equals( "00" ) ) { + return false; + } + if ( !t1.getValueAsString( "qwerty", "zxcvbnm" ).equals( "asdef" ) ) { + return false; + } + if ( !t1.getValueAsString( "49", "4" ).equals( "" ) ) { + return false; + } + if ( !t1.getValueAsString( "22349", "3434344" ).equals( "" ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testGetDistance() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny p1 = factory.create( "(((A:1,B:2,X:100)ab:3,C:4)abc:5,(D:7,(E:9,F:10)ef:8)def:6)r", + new NHXParser() )[ 0 ]; + final PhylogenyMethods pm = PhylogenyMethods.getInstance(); + if ( pm.calculateDistance( p1.getNode( "C" ), p1.getNode( "C" ) ) != 0 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "def" ), p1.getNode( "def" ) ) != 0 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "ef" ), p1.getNode( "ef" ) ) != 0 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "r" ), p1.getNode( "r" ) ) != 0 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "A" ), p1.getNode( "A" ) ) != 0 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "A" ), p1.getNode( "B" ) ) != 3 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "B" ), p1.getNode( "A" ) ) != 3 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "A" ), p1.getNode( "C" ) ) != 8 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "C" ), p1.getNode( "A" ) ) != 8 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "A" ), p1.getNode( "D" ) ) != 22 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "A" ), p1.getNode( "E" ) ) != 32 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "E" ), p1.getNode( "A" ) ) != 32 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "A" ), p1.getNode( "F" ) ) != 33 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "F" ), p1.getNode( "A" ) ) != 33 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "A" ), p1.getNode( "ab" ) ) != 1 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "ab" ), p1.getNode( "A" ) ) != 1 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "A" ), p1.getNode( "abc" ) ) != 4 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "abc" ), p1.getNode( "A" ) ) != 4 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "A" ), p1.getNode( "r" ) ) != 9 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "r" ), p1.getNode( "A" ) ) != 9 ) { + 
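A small sketch of the GeneralTable lookups verified by testGeneralTable above; the raw construction mirrors the test (the class may carry type parameters in the actual source), the helper name is hypothetical, and imports are omitted.

    // Sketch only: sparse table writes and reads; unset cells read back as "".
    static void generalTableSketch() {
        final GeneralTable table = new GeneralTable();
        table.setValue( 3, 2, "23" );
        table.setValue( 10, 1, "error" );
        table.setValue( 10, 1, "110" ); // a later write to the same cell overwrites the earlier one
        System.out.println( table.getValue( 3, 2 ) );          // 23
        System.out.println( table.getValueAsString( 10, 1 ) ); // 110
        System.out.println( table.getValueAsString( 49, 4 ) ); // "" (never set)
    }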
return false; + } + if ( pm.calculateDistance( p1.getNode( "A" ), p1.getNode( "def" ) ) != 15 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "def" ), p1.getNode( "A" ) ) != 15 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "A" ), p1.getNode( "ef" ) ) != 23 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "ef" ), p1.getNode( "A" ) ) != 23 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "ef" ), p1.getNode( "def" ) ) != 8 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "def" ), p1.getNode( "ef" ) ) != 8 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "ef" ), p1.getNode( "r" ) ) != 14 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "ef" ), p1.getNode( "abc" ) ) != 19 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "ef" ), p1.getNode( "ab" ) ) != 22 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "ab" ), p1.getNode( "ef" ) ) != 22 ) { + return false; + } + if ( pm.calculateDistance( p1.getNode( "def" ), p1.getNode( "abc" ) ) != 11 ) { + return false; + } + final Phylogeny p2 = factory.create( "((A:4,B:5,C:6)abc:1,(D:7,E:8,F:9)def:2,(G:10,H:11,I:12)ghi:3)r", + new NHXParser() )[ 0 ]; + if ( pm.calculateDistance( p2.getNode( "A" ), p2.getNode( "B" ) ) != 9 ) { + return false; + } + if ( pm.calculateDistance( p2.getNode( "A" ), p2.getNode( "C" ) ) != 10 ) { + return false; + } + if ( pm.calculateDistance( p2.getNode( "A" ), p2.getNode( "D" ) ) != 14 ) { + return false; + } + if ( pm.calculateDistance( p2.getNode( "A" ), p2.getNode( "ghi" ) ) != 8 ) { + return false; + } + if ( pm.calculateDistance( p2.getNode( "A" ), p2.getNode( "I" ) ) != 20 ) { + return false; + } + if ( pm.calculateDistance( p2.getNode( "G" ), p2.getNode( "ghi" ) ) != 10 ) { + return false; + } + if ( pm.calculateDistance( p2.getNode( "r" ), p2.getNode( "r" ) ) != 0 ) { + return false; + } + if ( pm.calculateDistance( p2.getNode( "r" ), p2.getNode( "G" ) ) != 13 ) { + return false; + } + if ( pm.calculateDistance( p2.getNode( "G" ), p2.getNode( "r" ) ) != 13 ) { + return false; + } + if ( pm.calculateDistance( p2.getNode( "G" ), p2.getNode( "H" ) ) != 21 ) { + return false; + } + if ( pm.calculateDistance( p2.getNode( "G" ), p2.getNode( "I" ) ) != 22 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testGetLCA() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny p1 = factory.create( "((((((A,B)ab,C)abc,D)abcd,E)abcde,F)abcdef,(G,H)gh)abcdefgh", + new NHXParser() )[ 0 ]; + final PhylogenyMethods pm = PhylogenyMethods.getInstance(); + final PhylogenyNode A = pm.obtainLCA( p1.getNode( "A" ), p1.getNode( "A" ) ); + if ( !A.getName().equals( "A" ) ) { + return false; + } + final PhylogenyNode gh = pm.obtainLCA( p1.getNode( "gh" ), p1.getNode( "gh" ) ); + if ( !gh.getName().equals( "gh" ) ) { + return false; + } + final PhylogenyNode ab = pm.obtainLCA( p1.getNode( "A" ), p1.getNode( "B" ) ); + if ( !ab.getName().equals( "ab" ) ) { + return false; + } + final PhylogenyNode ab2 = pm.obtainLCA( p1.getNode( "B" ), p1.getNode( "A" ) ); + if ( !ab2.getName().equals( "ab" ) ) { + return false; + } + final PhylogenyNode gh2 = pm.obtainLCA( p1.getNode( "H" ), p1.getNode( "G" ) ); + if ( !gh2.getName().equals( "gh" ) ) { + return false; + } + final PhylogenyNode gh3 = pm.obtainLCA( p1.getNode( "G" ), p1.getNode( "H" ) ); + if ( 
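A minimal sketch of the pairwise distance calculation exercised by testGetDistance above, reusing its first test tree; the helper name is hypothetical and imports are omitted.

    // Sketch only: distance = sum of branch lengths on the path between two nodes.
    static void distanceSketch() throws Exception {
        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
        final Phylogeny p = factory.create( "(((A:1,B:2,X:100)ab:3,C:4)abc:5,(D:7,(E:9,F:10)ef:8)def:6)r",
                                            new NHXParser() )[ 0 ];
        final PhylogenyMethods pm = PhylogenyMethods.getInstance();
        System.out.println( pm.calculateDistance( p.getNode( "A" ), p.getNode( "B" ) ) );    // 3.0
        System.out.println( pm.calculateDistance( p.getNode( "A" ), p.getNode( "F" ) ) );    // 33.0
        System.out.println( pm.calculateDistance( p.getNode( "ef" ), p.getNode( "abc" ) ) ); // 19.0
    }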
!gh3.getName().equals( "gh" ) ) { + return false; + } + final PhylogenyNode abc = pm.obtainLCA( p1.getNode( "C" ), p1.getNode( "A" ) ); + if ( !abc.getName().equals( "abc" ) ) { + return false; + } + final PhylogenyNode abc2 = pm.obtainLCA( p1.getNode( "A" ), p1.getNode( "C" ) ); + if ( !abc2.getName().equals( "abc" ) ) { + return false; + } + final PhylogenyNode abcd = pm.obtainLCA( p1.getNode( "A" ), p1.getNode( "D" ) ); + if ( !abcd.getName().equals( "abcd" ) ) { + return false; + } + final PhylogenyNode abcd2 = pm.obtainLCA( p1.getNode( "D" ), p1.getNode( "A" ) ); + if ( !abcd2.getName().equals( "abcd" ) ) { + return false; + } + final PhylogenyNode abcdef = pm.obtainLCA( p1.getNode( "A" ), p1.getNode( "F" ) ); + if ( !abcdef.getName().equals( "abcdef" ) ) { + return false; + } + final PhylogenyNode abcdef2 = pm.obtainLCA( p1.getNode( "F" ), p1.getNode( "A" ) ); + if ( !abcdef2.getName().equals( "abcdef" ) ) { + return false; + } + final PhylogenyNode abcdef3 = pm.obtainLCA( p1.getNode( "ab" ), p1.getNode( "F" ) ); + if ( !abcdef3.getName().equals( "abcdef" ) ) { + return false; + } + final PhylogenyNode abcdef4 = pm.obtainLCA( p1.getNode( "F" ), p1.getNode( "ab" ) ); + if ( !abcdef4.getName().equals( "abcdef" ) ) { + return false; + } + final PhylogenyNode abcde = pm.obtainLCA( p1.getNode( "A" ), p1.getNode( "E" ) ); + if ( !abcde.getName().equals( "abcde" ) ) { + return false; + } + final PhylogenyNode abcde2 = pm.obtainLCA( p1.getNode( "E" ), p1.getNode( "A" ) ); + if ( !abcde2.getName().equals( "abcde" ) ) { + return false; + } + final PhylogenyNode r = pm.obtainLCA( p1.getNode( "abcdefgh" ), p1.getNode( "abcdefgh" ) ); + if ( !r.getName().equals( "abcdefgh" ) ) { + return false; + } + final PhylogenyNode r2 = pm.obtainLCA( p1.getNode( "A" ), p1.getNode( "H" ) ); + if ( !r2.getName().equals( "abcdefgh" ) ) { + return false; + } + final PhylogenyNode r3 = pm.obtainLCA( p1.getNode( "H" ), p1.getNode( "A" ) ); + if ( !r3.getName().equals( "abcdefgh" ) ) { + return false; + } + final PhylogenyNode abcde3 = pm.obtainLCA( p1.getNode( "E" ), p1.getNode( "abcde" ) ); + if ( !abcde3.getName().equals( "abcde" ) ) { + return false; + } + final PhylogenyNode abcde4 = pm.obtainLCA( p1.getNode( "abcde" ), p1.getNode( "E" ) ); + if ( !abcde4.getName().equals( "abcde" ) ) { + return false; + } + final PhylogenyNode ab3 = pm.obtainLCA( p1.getNode( "ab" ), p1.getNode( "B" ) ); + if ( !ab3.getName().equals( "ab" ) ) { + return false; + } + final PhylogenyNode ab4 = pm.obtainLCA( p1.getNode( "B" ), p1.getNode( "ab" ) ); + if ( !ab4.getName().equals( "ab" ) ) { + return false; + } + final Phylogeny p2 = factory.create( "(a,b,(((c,d)cd,e)cde,f)cdef)r", new NHXParser() )[ 0 ]; + final PhylogenyNode cd = pm.obtainLCA( p2.getNode( "c" ), p2.getNode( "d" ) ); + if ( !cd.getName().equals( "cd" ) ) { + return false; + } + final PhylogenyNode cd2 = pm.obtainLCA( p2.getNode( "d" ), p2.getNode( "c" ) ); + if ( !cd2.getName().equals( "cd" ) ) { + return false; + } + final PhylogenyNode cde = pm.obtainLCA( p2.getNode( "c" ), p2.getNode( "e" ) ); + if ( !cde.getName().equals( "cde" ) ) { + return false; + } + final PhylogenyNode cde2 = pm.obtainLCA( p2.getNode( "e" ), p2.getNode( "c" ) ); + if ( !cde2.getName().equals( "cde" ) ) { + return false; + } + final PhylogenyNode cdef = pm.obtainLCA( p2.getNode( "c" ), p2.getNode( "f" ) ); + if ( !cdef.getName().equals( "cdef" ) ) { + return false; + } + final PhylogenyNode cdef2 = pm.obtainLCA( p2.getNode( "d" ), p2.getNode( "f" ) ); + if ( !cdef2.getName().equals( "cdef" 
) ) { + return false; + } + final PhylogenyNode cdef3 = pm.obtainLCA( p2.getNode( "f" ), p2.getNode( "d" ) ); + if ( !cdef3.getName().equals( "cdef" ) ) { + return false; + } + final PhylogenyNode rt = pm.obtainLCA( p2.getNode( "c" ), p2.getNode( "a" ) ); + if ( !rt.getName().equals( "r" ) ) { + return false; + } + final Phylogeny p3 = factory + .create( "((((a,(b,c)bc)abc,(d,e)de)abcde,f)abcdef,(((g,h)gh,(i,j)ij)ghij,k)ghijk,l)", + new NHXParser() )[ 0 ]; + final PhylogenyNode bc_3 = pm.obtainLCA( p3.getNode( "b" ), p3.getNode( "c" ) ); + if ( !bc_3.getName().equals( "bc" ) ) { + return false; + } + final PhylogenyNode ac_3 = pm.obtainLCA( p3.getNode( "a" ), p3.getNode( "c" ) ); + if ( !ac_3.getName().equals( "abc" ) ) { + return false; + } + final PhylogenyNode ad_3 = pm.obtainLCA( p3.getNode( "a" ), p3.getNode( "d" ) ); + if ( !ad_3.getName().equals( "abcde" ) ) { + return false; + } + final PhylogenyNode af_3 = pm.obtainLCA( p3.getNode( "a" ), p3.getNode( "f" ) ); + if ( !af_3.getName().equals( "abcdef" ) ) { + return false; + } + final PhylogenyNode ag_3 = pm.obtainLCA( p3.getNode( "a" ), p3.getNode( "g" ) ); + if ( !ag_3.getName().equals( "" ) ) { + return false; + } + if ( !ag_3.isRoot() ) { + return false; + } + final PhylogenyNode al_3 = pm.obtainLCA( p3.getNode( "a" ), p3.getNode( "l" ) ); + if ( !al_3.getName().equals( "" ) ) { + return false; + } + if ( !al_3.isRoot() ) { + return false; + } + final PhylogenyNode kl_3 = pm.obtainLCA( p3.getNode( "k" ), p3.getNode( "l" ) ); + if ( !kl_3.getName().equals( "" ) ) { + return false; + } + if ( !kl_3.isRoot() ) { + return false; + } + final PhylogenyNode fl_3 = pm.obtainLCA( p3.getNode( "f" ), p3.getNode( "l" ) ); + if ( !fl_3.getName().equals( "" ) ) { + return false; + } + if ( !fl_3.isRoot() ) { + return false; + } + final PhylogenyNode gk_3 = pm.obtainLCA( p3.getNode( "g" ), p3.getNode( "k" ) ); + if ( !gk_3.getName().equals( "ghijk" ) ) { + return false; + } + final Phylogeny p4 = factory.create( "(a,b,c)r", new NHXParser() )[ 0 ]; + final PhylogenyNode r_4 = pm.obtainLCA( p4.getNode( "b" ), p4.getNode( "c" ) ); + if ( !r_4.getName().equals( "r" ) ) { + return false; + } + final Phylogeny p5 = factory.create( "((a,b),c,d)root", new NHXParser() )[ 0 ]; + final PhylogenyNode r_5 = pm.obtainLCA( p5.getNode( "a" ), p5.getNode( "c" ) ); + if ( !r_5.getName().equals( "root" ) ) { + return false; + } + final Phylogeny p6 = factory.create( "((a,b),c,d)rot", new NHXParser() )[ 0 ]; + final PhylogenyNode r_6 = pm.obtainLCA( p6.getNode( "c" ), p6.getNode( "a" ) ); + if ( !r_6.getName().equals( "rot" ) ) { + return false; + } + final Phylogeny p7 = factory.create( "(((a,b)x,c)x,d,e)rott", new NHXParser() )[ 0 ]; + final PhylogenyNode r_7 = pm.obtainLCA( p7.getNode( "a" ), p7.getNode( "e" ) ); + if ( !r_7.getName().equals( "rott" ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testHmmscanOutputParser() { + final String test_dir = Test.PATH_TO_TEST_DATA; + try { + final HmmscanPerDomainTableParser parser1 = new HmmscanPerDomainTableParser( new File( test_dir + + ForesterUtil.getFileSeparator() + "hmmscan30b3_output_1" ), "MONBR", INDIVIDUAL_SCORE_CUTOFF.NONE ); + parser1.parse(); + final HmmscanPerDomainTableParser parser2 = new HmmscanPerDomainTableParser( new File( test_dir + + ForesterUtil.getFileSeparator() + "hmmscan30b3_output_2" ), "MONBR", INDIVIDUAL_SCORE_CUTOFF.NONE ); + final List domain_collections = parser2.parse(); 
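A minimal sketch of the lowest-common-ancestor lookup exercised by testGetLCA above, reusing its first test tree; the helper name is hypothetical and imports are omitted.

    // Sketch only: obtainLCA() returns the deepest node shared by both arguments.
    static void lcaSketch() throws Exception {
        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
        final Phylogeny p = factory.create( "((((((A,B)ab,C)abc,D)abcd,E)abcde,F)abcdef,(G,H)gh)abcdefgh",
                                            new NHXParser() )[ 0 ];
        final PhylogenyMethods pm = PhylogenyMethods.getInstance();
        System.out.println( pm.obtainLCA( p.getNode( "A" ), p.getNode( "B" ) ).getName() ); // ab
        System.out.println( pm.obtainLCA( p.getNode( "A" ), p.getNode( "D" ) ).getName() ); // abcd
        System.out.println( pm.obtainLCA( p.getNode( "A" ), p.getNode( "H" ) ).getName() ); // abcdefgh
    }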
+ if ( parser2.getProteinsEncountered() != 4 ) { + return false; + } + if ( domain_collections.size() != 4 ) { + return false; + } + if ( parser2.getDomainsEncountered() != 69 ) { + return false; + } + if ( parser2.getDomainsIgnoredDueToDuf() != 0 ) { + return false; + } + if ( parser2.getDomainsIgnoredDueToEval() != 0 ) { + return false; + } + final Protein p1 = domain_collections.get( 0 ); + if ( p1.getNumberOfProteinDomains() != 15 ) { + return false; + } + final Protein p4 = domain_collections.get( 3 ); + if ( p4.getNumberOfProteinDomains() != 1 ) { + return false; + } + if ( !p4.getProteinDomain( 0 ).getDomainId().toString().equals( "DNA_pol_B_new" ) ) { + return false; + } + if ( p4.getProteinDomain( 0 ).getFrom() != 51 ) { + return false; + } + if ( p4.getProteinDomain( 0 ).getTo() != 395 ) { + return false; + } + if ( !Test.isEqual( p4.getProteinDomain( 0 ).getPerDomainEvalue(), 1.2e-39 ) ) { + return false; + } + if ( !Test.isEqual( p4.getProteinDomain( 0 ).getPerDomainScore(), 135.7 ) ) { + return false; + } + if ( !Test.isEqual( p4.getProteinDomain( 0 ).getPerSequenceEvalue(), 8.3e-40 ) ) { + return false; + } + if ( !Test.isEqual( p4.getProteinDomain( 0 ).getPerSequenceScore(), 136.3 ) ) { + return false; + } + if ( !Test.isEqual( p4.getProteinDomain( 0 ).getNumber(), 1 ) ) { + return false; + } + if ( !Test.isEqual( p4.getProteinDomain( 0 ).getTotalCount(), 1 ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testLastExternalNodeMethods() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final char[] a0 = { '(', '(', 'A', ',', 'B', ')', ',', '(', 'C', ',', 'D', ')', ')', }; + final Phylogeny t0 = factory.create( a0, new NHXParser() )[ 0 ]; + final PhylogenyNode n1 = t0.getNode( "A" ); + if ( n1.isLastExternalNode() ) { + return false; + } + final PhylogenyNode n2 = t0.getNode( "B" ); + if ( n2.isLastExternalNode() ) { + return false; + } + final PhylogenyNode n3 = t0.getNode( "C" ); + if ( n3.isLastExternalNode() ) { + return false; + } + final PhylogenyNode n4 = t0.getNode( "D" ); + if ( !n4.isLastExternalNode() ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testLevelOrderIterator() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny t0 = factory.create( "((A,B)ab,(C,D)cd)r", new NHXParser() )[ 0 ]; + PhylogenyNodeIterator it0; + for( it0 = t0.iteratorLevelOrder(); it0.hasNext(); ) { + it0.next(); + } + for( it0.reset(); it0.hasNext(); ) { + it0.next(); + } + final PhylogenyNodeIterator it = t0.iteratorLevelOrder(); + if ( !it.next().getName().equals( "r" ) ) { + return false; + } + if ( !it.next().getName().equals( "ab" ) ) { + return false; + } + if ( !it.next().getName().equals( "cd" ) ) { + return false; + } + if ( !it.next().getName().equals( "A" ) ) { + return false; + } + if ( !it.next().getName().equals( "B" ) ) { + return false; + } + if ( !it.next().getName().equals( "C" ) ) { + return false; + } + if ( !it.next().getName().equals( "D" ) ) { + return false; + } + if ( it.hasNext() ) { + return false; + } + final Phylogeny t2 = factory.create( "(((1,2,(a,(X,Y,Z)b)3,4,5,6)A,B,C)abc,(D,E,(f1,(f21)f2,f3)F,G)defg)r", + new NHXParser() )[ 0 ]; + PhylogenyNodeIterator it2; + for( it2 = t2.iteratorLevelOrder(); it2.hasNext(); ) { + it2.next(); + } + for( 
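A rough sketch of the hmmscan per-domain table parser used by testHmmscanOutputParser above; the file name is a placeholder, the parser is assumed to return a list of Protein objects as the indexing in the test suggests, the helper name is hypothetical, and imports are omitted.

    // Sketch only: parse an hmmscan per-domain table and inspect the first protein.
    static void hmmscanSketch() throws Exception {
        final HmmscanPerDomainTableParser parser =
                new HmmscanPerDomainTableParser( new File( "hmmscan30b3_output_2" ),
                                                 "MONBR",
                                                 INDIVIDUAL_SCORE_CUTOFF.NONE );
        final List<Protein> proteins = parser.parse(); // one entry per query protein
        System.out.println( parser.getProteinsEncountered() );
        System.out.println( parser.getDomainsEncountered() );
        final Protein first = proteins.get( 0 );
        System.out.println( first.getNumberOfProteinDomains() );
        System.out.println( first.getProteinDomain( 0 ).getDomainId() );
    }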
it2.reset(); it2.hasNext(); ) { + it2.next(); + } + final PhylogenyNodeIterator it3 = t2.iteratorLevelOrder(); + if ( !it3.next().getName().equals( "r" ) ) { + return false; + } + if ( !it3.next().getName().equals( "abc" ) ) { + return false; + } + if ( !it3.next().getName().equals( "defg" ) ) { + return false; + } + if ( !it3.next().getName().equals( "A" ) ) { + return false; + } + if ( !it3.next().getName().equals( "B" ) ) { + return false; + } + if ( !it3.next().getName().equals( "C" ) ) { + return false; + } + if ( !it3.next().getName().equals( "D" ) ) { + return false; + } + if ( !it3.next().getName().equals( "E" ) ) { + return false; + } + if ( !it3.next().getName().equals( "F" ) ) { + return false; + } + if ( !it3.next().getName().equals( "G" ) ) { + return false; + } + if ( !it3.next().getName().equals( "1" ) ) { + return false; + } + if ( !it3.next().getName().equals( "2" ) ) { + return false; + } + if ( !it3.next().getName().equals( "3" ) ) { + return false; + } + if ( !it3.next().getName().equals( "4" ) ) { + return false; + } + if ( !it3.next().getName().equals( "5" ) ) { + return false; + } + if ( !it3.next().getName().equals( "6" ) ) { + return false; + } + if ( !it3.next().getName().equals( "f1" ) ) { + return false; + } + if ( !it3.next().getName().equals( "f2" ) ) { + return false; + } + if ( !it3.next().getName().equals( "f3" ) ) { + return false; + } + if ( !it3.next().getName().equals( "a" ) ) { + return false; + } + if ( !it3.next().getName().equals( "b" ) ) { + return false; + } + if ( !it3.next().getName().equals( "f21" ) ) { + return false; + } + if ( !it3.next().getName().equals( "X" ) ) { + return false; + } + if ( !it3.next().getName().equals( "Y" ) ) { + return false; + } + if ( !it3.next().getName().equals( "Z" ) ) { + return false; + } + if ( it3.hasNext() ) { + return false; + } + final Phylogeny t4 = factory.create( "((((D)C)B)A)r", new NHXParser() )[ 0 ]; + PhylogenyNodeIterator it4; + for( it4 = t4.iteratorLevelOrder(); it4.hasNext(); ) { + it4.next(); + } + for( it4.reset(); it4.hasNext(); ) { + it4.next(); + } + final PhylogenyNodeIterator it5 = t4.iteratorLevelOrder(); + if ( !it5.next().getName().equals( "r" ) ) { + return false; + } + if ( !it5.next().getName().equals( "A" ) ) { + return false; + } + if ( !it5.next().getName().equals( "B" ) ) { + return false; + } + if ( !it5.next().getName().equals( "C" ) ) { + return false; + } + if ( !it5.next().getName().equals( "D" ) ) { + return false; + } + final Phylogeny t5 = factory.create( "A", new NHXParser() )[ 0 ]; + PhylogenyNodeIterator it6; + for( it6 = t5.iteratorLevelOrder(); it6.hasNext(); ) { + it6.next(); + } + for( it6.reset(); it6.hasNext(); ) { + it6.next(); + } + final PhylogenyNodeIterator it7 = t5.iteratorLevelOrder(); + if ( !it7.next().getName().equals( "A" ) ) { + return false; + } + if ( it.hasNext() ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testMidpointrooting() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny t1 = factory.create( "((A:1,B:2)AB:1[&&NHX:B=55],(C:3,D:4)CD:3[&&NHX:B=10])ABCD:0.5", + new NHXParser() )[ 0 ]; + if ( !t1.isRooted() ) { + return false; + } + PhylogenyMethods.midpointRoot( t1 ); + if ( !isEqual( t1.getNode( "A" ).getDistanceToParent(), 1 ) ) { + return false; + } + if ( !isEqual( t1.getNode( "B" ).getDistanceToParent(), 2 ) ) { + return false; + } + if ( !isEqual( t1.getNode( "C" 
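A minimal sketch of the breadth-first traversal exercised by testLevelOrderIterator above; the helper name is hypothetical and imports are omitted.

    // Sketch only: level order visits the root, then its children, then the grandchildren.
    static void levelOrderSketch() throws Exception {
        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
        final Phylogeny t = factory.create( "((A,B)ab,(C,D)cd)r", new NHXParser() )[ 0 ];
        final PhylogenyNodeIterator it = t.iteratorLevelOrder();
        while ( it.hasNext() ) {
            System.out.print( it.next().getName() + " " ); // r ab cd A B C D
        }
        it.reset(); // the iterator can be rewound and reused, as the test does
    }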
).getDistanceToParent(), 3 ) ) { + return false; + } + if ( !isEqual( t1.getNode( "D" ).getDistanceToParent(), 4 ) ) { + return false; + } + if ( !isEqual( t1.getNode( "CD" ).getDistanceToParent(), 1 ) ) { + return false; + } + if ( !isEqual( t1.getNode( "AB" ).getDistanceToParent(), 3 ) ) { + return false; + } + t1.reRoot( t1.getNode( "A" ) ); + PhylogenyMethods.midpointRoot( t1 ); + if ( !isEqual( t1.getNode( "A" ).getDistanceToParent(), 1 ) ) { + return false; + } + if ( !isEqual( t1.getNode( "B" ).getDistanceToParent(), 2 ) ) { + return false; + } + if ( !isEqual( t1.getNode( "C" ).getDistanceToParent(), 3 ) ) { + return false; + } + if ( !isEqual( t1.getNode( "D" ).getDistanceToParent(), 4 ) ) { + return false; + } + if ( !isEqual( t1.getNode( "CD" ).getDistanceToParent(), 1 ) ) { + return false; + } + if ( !isEqual( t1.getNode( "AB" ).getDistanceToParent(), 3 ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testNexusCharactersParsing() { + try { + final NexusCharactersParser parser = new NexusCharactersParser(); + parser.setSource( new File( Test.PATH_TO_TEST_DATA + "nexus_test_7.nex" ) ); + parser.parse(); + String[] labels = parser.getCharStateLabels(); + if ( labels.length != 7 ) { + return false; + } + if ( !labels[ 0 ].equals( "14-3-3" ) ) { + return false; + } + if ( !labels[ 1 ].equals( "2-Hacid_dh" ) ) { + return false; + } + if ( !labels[ 2 ].equals( "2-Hacid_dh_C" ) ) { + return false; + } + if ( !labels[ 3 ].equals( "2-oxoacid_dh" ) ) { + return false; + } + if ( !labels[ 4 ].equals( "2OG-FeII_Oxy" ) ) { + return false; + } + if ( !labels[ 5 ].equals( "3-HAO" ) ) { + return false; + } + if ( !labels[ 6 ].equals( "3_5_exonuc" ) ) { + return false; + } + parser.setSource( new File( Test.PATH_TO_TEST_DATA + "nexus_test_8.nex" ) ); + parser.parse(); + labels = parser.getCharStateLabels(); + if ( labels.length != 7 ) { + return false; + } + if ( !labels[ 0 ].equals( "14-3-3" ) ) { + return false; + } + if ( !labels[ 1 ].equals( "2-Hacid_dh" ) ) { + return false; + } + if ( !labels[ 2 ].equals( "2-Hacid_dh_C" ) ) { + return false; + } + if ( !labels[ 3 ].equals( "2-oxoacid_dh" ) ) { + return false; + } + if ( !labels[ 4 ].equals( "2OG-FeII_Oxy" ) ) { + return false; + } + if ( !labels[ 5 ].equals( "3-HAO" ) ) { + return false; + } + if ( !labels[ 6 ].equals( "3_5_exonuc" ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testNexusMatrixParsing() { + try { + final NexusBinaryStatesMatrixParser parser = new NexusBinaryStatesMatrixParser(); + parser.setSource( new File( Test.PATH_TO_TEST_DATA + "nexus_test_9.nex" ) ); + parser.parse(); + final CharacterStateMatrix m = parser.getMatrix(); + if ( m.getNumberOfCharacters() != 9 ) { + return false; + } + if ( m.getNumberOfIdentifiers() != 5 ) { + return false; + } + if ( m.getState( 0, 0 ) != BinaryStates.PRESENT ) { + return false; + } + if ( m.getState( 0, 1 ) != BinaryStates.ABSENT ) { + return false; + } + if ( m.getState( 1, 0 ) != BinaryStates.PRESENT ) { + return false; + } + if ( m.getState( 2, 0 ) != BinaryStates.ABSENT ) { + return false; + } + if ( m.getState( 4, 8 ) != BinaryStates.PRESENT ) { + return false; + } + if ( !m.getIdentifier( 0 ).equals( "MOUSE" ) ) { + return false; + } + if ( !m.getIdentifier( 4 ).equals( "ARATH" ) ) { + return false; + } + // if ( labels.length != 7 ) { + // return 
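A minimal sketch of midpoint rooting as exercised by testMidpointrooting above (the confidence annotations from the test tree are left out here); the helper name is hypothetical and imports are omitted.

    // Sketch only: after midpoint rooting, the two deepest tips are equidistant from the root.
    static void midpointSketch() throws Exception {
        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
        final Phylogeny t = factory.create( "((A:1,B:2)AB:1,(C:3,D:4)CD:3)ABCD:0.5",
                                            new NHXParser() )[ 0 ];
        PhylogenyMethods.midpointRoot( t );
        System.out.println( t.getNode( "AB" ).getDistanceToParent() ); // 3.0 (B is now 5 from the root)
        System.out.println( t.getNode( "CD" ).getDistanceToParent() ); // 1.0 (D is now 5 from the root)
    }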
false; + // } + // if ( !labels[ 0 ].equals( "14-3-3" ) ) { + // return false; + // } + // if ( !labels[ 1 ].equals( "2-Hacid_dh" ) ) { + // return false; + // } + // if ( !labels[ 2 ].equals( "2-Hacid_dh_C" ) ) { + // return false; + // } + // if ( !labels[ 3 ].equals( "2-oxoacid_dh" ) ) { + // return false; + // } + // if ( !labels[ 4 ].equals( "2OG-FeII_Oxy" ) ) { + // return false; + // } + // if ( !labels[ 5 ].equals( "3-HAO" ) ) { + // return false; + // } + // if ( !labels[ 6 ].equals( "3_5_exonuc" ) ) { + // return false; + // } + // parser.setSource( new File( Test.PATH_TO_TEST_DATA + "nexus_test_8.nex" ) ); + // parser.parse(); + // labels = parser.getCharStateLabels(); + // if ( labels.length != 7 ) { + // return false; + // } + // if ( !labels[ 0 ].equals( "14-3-3" ) ) { + // return false; + // } + // if ( !labels[ 1 ].equals( "2-Hacid_dh" ) ) { + // return false; + // } + // if ( !labels[ 2 ].equals( "2-Hacid_dh_C" ) ) { + // return false; + // } + // if ( !labels[ 3 ].equals( "2-oxoacid_dh" ) ) { + // return false; + // } + // if ( !labels[ 4 ].equals( "2OG-FeII_Oxy" ) ) { + // return false; + // } + // if ( !labels[ 5 ].equals( "3-HAO" ) ) { + // return false; + // } + // if ( !labels[ 6 ].equals( "3_5_exonuc" ) ) { + // return false; + // } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testNexusTreeParsing() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final NexusPhylogeniesParser parser = new NexusPhylogeniesParser(); + Phylogeny[] phylogenies = factory.create( Test.PATH_TO_TEST_DATA + "nexus_test_1.nex", parser ); + if ( phylogenies.length != 1 ) { + return false; + } + if ( phylogenies[ 0 ].getNumberOfExternalNodes() != 25 ) { + return false; + } + if ( !phylogenies[ 0 ].getName().equals( "" ) ) { + return false; + } + phylogenies = null; + phylogenies = factory.create( Test.PATH_TO_TEST_DATA + "nexus_test_2.nex", parser ); + if ( phylogenies.length != 1 ) { + return false; + } + if ( phylogenies[ 0 ].getNumberOfExternalNodes() != 10 ) { + return false; + } + if ( !phylogenies[ 0 ].getName().equals( "name" ) ) { + return false; + } + phylogenies = null; + phylogenies = factory.create( Test.PATH_TO_TEST_DATA + "nexus_test_3.nex", parser ); + if ( phylogenies.length != 1 ) { + return false; + } + if ( phylogenies[ 0 ].getNumberOfExternalNodes() != 3 ) { + return false; + } + if ( !phylogenies[ 0 ].getName().equals( "" ) ) { + return false; + } + if ( phylogenies[ 0 ].isRooted() ) { + return false; + } + phylogenies = null; + phylogenies = factory.create( Test.PATH_TO_TEST_DATA + "nexus_test_4.nex", parser ); + if ( phylogenies.length != 18 ) { + return false; + } + if ( phylogenies[ 0 ].getNumberOfExternalNodes() != 10 ) { + return false; + } + if ( !phylogenies[ 0 ].getName().equals( "tree 0" ) ) { + return false; + } + if ( !phylogenies[ 1 ].getName().equals( "tree 1" ) ) { + return false; + } + if ( phylogenies[ 1 ].getNumberOfExternalNodes() != 10 ) { + return false; + } + if ( phylogenies[ 2 ].getNumberOfExternalNodes() != 3 ) { + return false; + } + if ( phylogenies[ 3 ].getNumberOfExternalNodes() != 3 ) { + return false; + } + if ( phylogenies[ 4 ].getNumberOfExternalNodes() != 3 ) { + return false; + } + if ( phylogenies[ 5 ].getNumberOfExternalNodes() != 3 ) { + return false; + } + if ( phylogenies[ 6 ].getNumberOfExternalNodes() != 3 ) { + return false; + } + if ( phylogenies[ 7 ].getNumberOfExternalNodes() != 3 ) { + return 
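A small sketch of the Nexus binary-states matrix parser used by testNexusMatrixParsing above; the file name is a placeholder, the raw matrix type mirrors the test, the helper name is hypothetical, and imports are omitted.

    // Sketch only: read a binary characters matrix from a Nexus file.
    static void nexusMatrixSketch() throws Exception {
        final NexusBinaryStatesMatrixParser parser = new NexusBinaryStatesMatrixParser();
        parser.setSource( new File( "nexus_test_9.nex" ) );
        parser.parse();
        final CharacterStateMatrix m = parser.getMatrix(); // states indexed by [identifier][character]
        System.out.println( m.getNumberOfIdentifiers() );
        System.out.println( m.getNumberOfCharacters() );
        System.out.println( m.getIdentifier( 0 ) );
        System.out.println( m.getState( 0, 0 ) );          // PRESENT or ABSENT
    }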
false; + } + if ( !phylogenies[ 8 ].getName().equals( "tree 8" ) ) { + return false; + } + if ( phylogenies[ 8 ].isRooted() ) { + return false; + } + if ( phylogenies[ 8 ].getNumberOfExternalNodes() != 3 ) { + return false; + } + if ( !phylogenies[ 9 ].getName().equals( "tree 9" ) ) { + return false; + } + if ( !phylogenies[ 9 ].isRooted() ) { + return false; + } + if ( phylogenies[ 9 ].getNumberOfExternalNodes() != 3 ) { + return false; + } + if ( !phylogenies[ 10 ].getName().equals( "tree 10" ) ) { + return false; + } + if ( !phylogenies[ 10 ].isRooted() ) { + return false; + } + if ( phylogenies[ 10 ].getNumberOfExternalNodes() != 3 ) { + return false; + } + if ( !phylogenies[ 11 ].getName().equals( "tree 11" ) ) { + return false; + } + if ( phylogenies[ 11 ].isRooted() ) { + return false; + } + if ( phylogenies[ 11 ].getNumberOfExternalNodes() != 3 ) { + return false; + } + if ( !phylogenies[ 12 ].getName().equals( "tree 12" ) ) { + return false; + } + if ( !phylogenies[ 12 ].isRooted() ) { + return false; + } + if ( phylogenies[ 12 ].getNumberOfExternalNodes() != 3 ) { + return false; + } + if ( !phylogenies[ 13 ].getName().equals( "tree 13" ) ) { + return false; + } + if ( !phylogenies[ 13 ].isRooted() ) { + return false; + } + if ( phylogenies[ 13 ].getNumberOfExternalNodes() != 3 ) { + return false; + } + if ( !phylogenies[ 14 ].getName().equals( "tree 14" ) ) { + return false; + } + if ( !phylogenies[ 14 ].isRooted() ) { + return false; + } + if ( phylogenies[ 14 ].getNumberOfExternalNodes() != 10 ) { + return false; + } + if ( !phylogenies[ 15 ].getName().equals( "tree 15" ) ) { + return false; + } + if ( phylogenies[ 15 ].isRooted() ) { + return false; + } + if ( phylogenies[ 15 ].getNumberOfExternalNodes() != 10 ) { + return false; + } + if ( !phylogenies[ 16 ].getName().equals( "tree 16" ) ) { + return false; + } + if ( !phylogenies[ 16 ].isRooted() ) { + return false; + } + if ( phylogenies[ 16 ].getNumberOfExternalNodes() != 10 ) { + return false; + } + if ( !phylogenies[ 17 ].getName().equals( "tree 17" ) ) { + return false; + } + if ( phylogenies[ 17 ].isRooted() ) { + return false; + } + if ( phylogenies[ 17 ].getNumberOfExternalNodes() != 10 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testNexusTreeParsingTranslating() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final NexusPhylogeniesParser parser = new NexusPhylogeniesParser(); + Phylogeny[] phylogenies = factory.create( Test.PATH_TO_TEST_DATA + "nexus_test_5.nex", parser ); + if ( phylogenies.length != 1 ) { + return false; + } + if ( phylogenies[ 0 ].getNumberOfExternalNodes() != 3 ) { + return false; + } + if ( !phylogenies[ 0 ].getName().equals( "Tree0" ) ) { + return false; + } + if ( !phylogenies[ 0 ].getFirstExternalNode().getName().equals( "Scarabaeus" ) ) { + return false; + } + if ( !phylogenies[ 0 ].getFirstExternalNode().getNextExternalNode().getName().equals( "Drosophila" ) ) { + return false; + } + if ( !phylogenies[ 0 ].getFirstExternalNode().getNextExternalNode().getNextExternalNode().getName() + .equals( "Aranaeus" ) ) { + return false; + } + phylogenies = null; + phylogenies = factory.create( Test.PATH_TO_TEST_DATA + "nexus_test_6.nex", parser ); + if ( phylogenies.length != 3 ) { + return false; + } + if ( phylogenies[ 0 ].getNumberOfExternalNodes() != 3 ) { + return false; + } + if ( !phylogenies[ 0 ].getName().equals( "Tree0" ) ) { + return 
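A minimal sketch of reading whole trees from a Nexus file, as exercised by testNexusTreeParsing above; the file name is a placeholder, the helper name is hypothetical, and imports are omitted.

    // Sketch only: the factory returns one Phylogeny per tree in the TREES block.
    static void nexusTreesSketch() throws Exception {
        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
        final NexusPhylogeniesParser parser = new NexusPhylogeniesParser();
        final Phylogeny[] phylogenies = factory.create( "nexus_test_4.nex", parser );
        for( final Phylogeny p : phylogenies ) {
            System.out.println( p.getName() + ": " + p.getNumberOfExternalNodes()
                    + " external nodes, rooted = " + p.isRooted() );
        }
    }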
false; + } + if ( phylogenies[ 0 ].isRooted() ) { + return false; + } + if ( !phylogenies[ 0 ].getFirstExternalNode().getName().equals( "Scarabaeus" ) ) { + return false; + } + if ( !phylogenies[ 0 ].getFirstExternalNode().getNextExternalNode().getName().equals( "Drosophila" ) ) { + return false; + } + if ( !phylogenies[ 0 ].getFirstExternalNode().getNextExternalNode().getNextExternalNode().getName() + .equals( "Aranaeus" ) ) { + return false; + } + if ( phylogenies[ 1 ].getNumberOfExternalNodes() != 3 ) { + return false; + } + if ( !phylogenies[ 1 ].getName().equals( "Tree1" ) ) { + return false; + } + if ( phylogenies[ 1 ].isRooted() ) { + return false; + } + if ( !phylogenies[ 1 ].getFirstExternalNode().getName().equals( "Scarabaeus" ) ) { + return false; + } + if ( !phylogenies[ 1 ].getFirstExternalNode().getNextExternalNode().getName().equals( "Drosophila" ) ) { + return false; + } + if ( !phylogenies[ 1 ].getFirstExternalNode().getNextExternalNode().getNextExternalNode().getName() + .equals( "Aranaeus" ) ) { + return false; + } + if ( phylogenies[ 2 ].getNumberOfExternalNodes() != 3 ) { + return false; + } + if ( !phylogenies[ 2 ].getName().equals( "Tree2" ) ) { + return false; + } + if ( !phylogenies[ 2 ].isRooted() ) { + return false; + } + if ( !phylogenies[ 2 ].getFirstExternalNode().getName().equals( "Scarabaeus" ) ) { + return false; + } + if ( !phylogenies[ 2 ].getFirstExternalNode().getNextExternalNode().getName().equals( "Drosophila" ) ) { + return false; + } + if ( !phylogenies[ 2 ].getFirstExternalNode().getNextExternalNode().getNextExternalNode().getName() + .equals( "Aranaeus" ) ) { + return false; + } + phylogenies = null; + phylogenies = factory.create( Test.PATH_TO_TEST_DATA + "nexus_test_7.nex", parser ); + if ( phylogenies.length != 3 ) { + return false; + } + if ( phylogenies[ 0 ].getNumberOfExternalNodes() != 3 ) { + return false; + } + if ( !phylogenies[ 0 ].getName().equals( "Tree0" ) ) { + return false; + } + if ( phylogenies[ 0 ].isRooted() ) { + return false; + } + if ( !phylogenies[ 0 ].getFirstExternalNode().getName().equals( "Scarabaeus" ) ) { + return false; + } + if ( !phylogenies[ 0 ].getFirstExternalNode().getNextExternalNode().getName().equals( "Drosophila" ) ) { + return false; + } + if ( !phylogenies[ 0 ].getFirstExternalNode().getNextExternalNode().getNextExternalNode().getName() + .equals( "Aranaeus" ) ) { + return false; + } + if ( phylogenies[ 1 ].getNumberOfExternalNodes() != 3 ) { + return false; + } + if ( !phylogenies[ 1 ].getName().equals( "Tree1" ) ) { + return false; + } + if ( phylogenies[ 1 ].isRooted() ) { + return false; + } + if ( !phylogenies[ 1 ].getFirstExternalNode().getName().equals( "Scarabaeus" ) ) { + return false; + } + if ( !phylogenies[ 1 ].getFirstExternalNode().getNextExternalNode().getName().equals( "Drosophila" ) ) { + return false; + } + if ( !phylogenies[ 1 ].getFirstExternalNode().getNextExternalNode().getNextExternalNode().getName() + .equals( "Aranaeus" ) ) { + return false; + } + if ( phylogenies[ 2 ].getNumberOfExternalNodes() != 3 ) { + return false; + } + if ( !phylogenies[ 2 ].getName().equals( "Tree2" ) ) { + return false; + } + if ( !phylogenies[ 2 ].isRooted() ) { + return false; + } + if ( !phylogenies[ 2 ].getFirstExternalNode().getName().equals( "Scarabaeus" ) ) { + return false; + } + if ( !phylogenies[ 2 ].getFirstExternalNode().getNextExternalNode().getName().equals( "Drosophila" ) ) { + return false; + } + if ( !phylogenies[ 2 
].getFirstExternalNode().getNextExternalNode().getNextExternalNode().getName() + .equals( "Aranaeus" ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testNHParsing() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny p1 = factory.create( "(A,B1)", new NHXParser() )[ 0 ]; + if ( !p1.toNewHampshireX().equals( "(A,B1)" ) ) { + return false; + } + final NHXParser nhxp = new NHXParser(); + nhxp.setTaxonomyExtraction( ForesterUtil.TAXONOMY_EXTRACTION.NO ); + nhxp.setReplaceUnderscores( true ); + final Phylogeny uc0 = factory.create( "(A__A_,_B_B)", nhxp )[ 0 ]; + if ( !uc0.getRoot().getChildNode( 0 ).getName().equals( "A A " ) ) { + return false; + } + if ( !uc0.getRoot().getChildNode( 1 ).getName().equals( " B B" ) ) { + return false; + } + final Phylogeny p1b = factory + .create( " \n \t \b \r \f ; ( \n \t \b \r \f; A ; \n \t \b \r \f, \n \t \b \r \f; B ; \n \t \b \r \f 1 \n \t \b \r \f ; \n \t \b \r \f );;;;; \n \t \b \r \f;;; \n \t \b \r \f ", + new NHXParser() )[ 0 ]; + if ( !p1b.toNewHampshireX().equals( "(';A;',';B;1;')" ) ) { + return false; + } + if ( !p1b.toNewHampshire().equals( "(';A;',';B;1;');" ) ) { + return false; + } + final Phylogeny p2 = factory.create( new StringBuffer( "(A,B2)" ), new NHXParser() )[ 0 ]; + final Phylogeny p3 = factory.create( new char[] { '(', 'A', ',', 'B', '3', ')' }, new NHXParser() )[ 0 ]; + final Phylogeny p4 = factory.create( "(A,B4);", new NHXParser() )[ 0 ]; + final Phylogeny p5 = factory.create( new StringBuffer( "(A,B5);" ), new NHXParser() )[ 0 ]; + final Phylogeny[] p7 = factory.create( "(A,B7);(C,D7)", new NHXParser() ); + final Phylogeny[] p8 = factory.create( "(A,B8) (C,D8)", new NHXParser() ); + final Phylogeny[] p9 = factory.create( "(A,B9)\n(C,D9)", new NHXParser() ); + final Phylogeny[] p10 = factory.create( "(A,B10);(C,D10);", new NHXParser() ); + final Phylogeny[] p11 = factory.create( "(A,B11);(C,D11) (E,F11)\t(G,H11)", new NHXParser() ); + final Phylogeny[] p12 = factory.create( "(A,B12) (C,D12) (E,F12) (G,H12)", new NHXParser() ); + final Phylogeny[] p13 = factory.create( " ; (;A; , ; B ; 1 3 ; \n)\t ( \n ;" + + " C ; ,; D;13;);;;;;;(;E;,;F;13 ;) ; " + + "; ; ( \t\n\r\b; G ;, ;H ;1 3; ) ; ; ;", + new NHXParser() ); + if ( !p13[ 0 ].toNewHampshireX().equals( "(';A;',';B;13;')" ) ) { + return false; + } + if ( !p13[ 1 ].toNewHampshireX().equals( "(';C;',';D;13;')" ) ) { + return false; + } + if ( !p13[ 2 ].toNewHampshireX().equals( "(';E;',';F;13;')" ) ) { + return false; + } + if ( !p13[ 3 ].toNewHampshireX().equals( "(';G;',';H;13;')" ) ) { + return false; + } + final Phylogeny[] p14 = factory.create( "(A,B14)ab", new NHXParser() ); + final Phylogeny[] p15 = factory.create( "(A,B15)ab;", new NHXParser() ); + final String p16_S = "((A,B),C)"; + final Phylogeny[] p16 = factory.create( p16_S, new NHXParser() ); + if ( !p16[ 0 ].toNewHampshireX().equals( p16_S ) ) { + return false; + } + final String p17_S = "(C,(A,B))"; + final Phylogeny[] p17 = factory.create( p17_S, new NHXParser() ); + if ( !p17[ 0 ].toNewHampshireX().equals( p17_S ) ) { + return false; + } + final String p18_S = "((A,B),(C,D))"; + final Phylogeny[] p18 = factory.create( p18_S, new NHXParser() ); + if ( !p18[ 0 ].toNewHampshireX().equals( p18_S ) ) { + return false; + } + final String p19_S = "(((A,B),C),D)"; + final Phylogeny[] p19 = factory.create( p19_S, new NHXParser() ); + if ( !p19[ 0 
].toNewHampshireX().equals( p19_S ) ) { + return false; + } + final String p20_S = "(A,(B,(C,D)))"; + final Phylogeny[] p20 = factory.create( p20_S, new NHXParser() ); + if ( !p20[ 0 ].toNewHampshireX().equals( p20_S ) ) { + return false; + } + final String p21_S = "(A,(B,(C,(D,E))))"; + final Phylogeny[] p21 = factory.create( p21_S, new NHXParser() ); + if ( !p21[ 0 ].toNewHampshireX().equals( p21_S ) ) { + return false; + } + final String p22_S = "((((A,B),C),D),E)"; + final Phylogeny[] p22 = factory.create( p22_S, new NHXParser() ); + if ( !p22[ 0 ].toNewHampshireX().equals( p22_S ) ) { + return false; + } + final String p23_S = "(A,(B,(C,(D,E)de)cde)bcde)abcde"; + final Phylogeny[] p23 = factory.create( p23_S, new NHXParser() ); + if ( !p23[ 0 ].toNewHampshireX().equals( p23_S ) ) { + return false; + } + final String p24_S = "((((A,B)ab,C)abc,D)abcd,E)abcde"; + final Phylogeny[] p24 = factory.create( p24_S, new NHXParser() ); + if ( !p24[ 0 ].toNewHampshireX().equals( p24_S ) ) { + return false; + } + final String p241_S1 = "(A,(B,(C,(D,E)de)cde)bcde)abcde"; + final String p241_S2 = "((((A,B)ab,C)abc,D)abcd,E)abcde"; + final Phylogeny[] p241 = factory.create( p241_S1 + p241_S2, new NHXParser() ); + if ( !p241[ 0 ].toNewHampshireX().equals( p241_S1 ) ) { + return false; + } + if ( !p241[ 1 ].toNewHampshireX().equals( p241_S2 ) ) { + return false; + } + final String p25_S = "((((((((((((((A,B)ab,C)abc,D)abcd,E)" + + "abcde,(B,(C,(D,E)de)cde)bcde)abcde,(B,((A,(B,(C,(D," + + "E)de)cde)bcde)abcde,(D,E)de)cde)bcde)abcde,B)ab,C)" + + "abc,((((A,B)ab,C)abc,D)abcd,E)abcde)abcd,E)abcde," + + "((((A,((((((((A,B)ab,C)abc,((((A,B)ab,C)abc,D)abcd," + + "E)abcde)abcd,E)abcde,((((A,B)ab,C)abc,D)abcd,E)abcde)" + + "ab,C)abc,((((A,B)ab,C)abc,D)abcd,E)abcde)abcd,E)abcde" + + ")ab,C)abc,D)abcd,E)abcde)ab,C)abc,((((A,B)ab,C)abc,D)" + "abcd,E)abcde)abcd,E)abcde"; + final Phylogeny[] p25 = factory.create( p25_S, new NHXParser() ); + if ( !p25[ 0 ].toNewHampshireX().equals( p25_S ) ) { + return false; + } + final String p26_S = "(A,B)ab"; + final Phylogeny[] p26 = factory.create( p26_S, new NHXParser() ); + if ( !p26[ 0 ].toNewHampshireX().equals( p26_S ) ) { + return false; + } + final String p27_S = "((((A,B)ab,C)abc,D)abcd,E)abcde"; + final Phylogeny[] p27 = factory.create( new File( Test.PATH_TO_TEST_DATA + "phylogeny27.nhx" ), + new NHXParser() ); + if ( !p27[ 0 ].toNewHampshireX().equals( p27_S ) ) { + return false; + } + final String p28_S1 = "((((A,B)ab,C)abc,D)abcd,E)abcde"; + final String p28_S2 = "(A,(B,(C,(D,E)de)cde)bcde)abcde"; + final String p28_S3 = "(A,B)ab"; + final String p28_S4 = "((((A,B),C),D),;E;)"; + final Phylogeny[] p28 = factory.create( new File( Test.PATH_TO_TEST_DATA + "phylogeny28.nhx" ), + new NHXParser() ); + if ( !p28[ 0 ].toNewHampshireX().equals( p28_S1 ) ) { + return false; + } + if ( !p28[ 1 ].toNewHampshireX().equals( p28_S2 ) ) { + return false; + } + if ( !p28[ 2 ].toNewHampshireX().equals( p28_S3 ) ) { + return false; + } + if ( !p28[ 3 ].toNewHampshireX().equals( "((((A,B),C),D),';E;')" ) ) { + return false; + } + final String p29_S = "((((A:0.01,B:0.684)ab:0.345,C:0.3451)abc:0.3451,D:1.5)abcd:0.134,E:0.32)abcde:0.1345"; + final Phylogeny[] p29 = factory.create( p29_S, new NHXParser() ); + if ( !p29[ 0 ].toNewHampshireX().equals( p29_S ) ) { + return false; + } + final String p30_S = "((((A:0.01,B:0.02):0.93,C:0.04):0.05,D:1.4):0.06,E):0.72"; + final Phylogeny[] p30 = factory.create( p30_S, new NHXParser() ); + if ( !p30[ 0 ].toNewHampshireX().equals( p30_S ) ) { + 
return false; + } + final String p32_S = " ; ; \n \t \b \f \r ;;;;;; "; + final Phylogeny[] p32 = factory.create( p32_S, new NHXParser() ); + if ( ( p32.length != 1 ) || !p32[ 0 ].isEmpty() ) { + return false; + } + final String p33_S = "A"; + final Phylogeny[] p33 = factory.create( p33_S, new NHXParser() ); + if ( !p33[ 0 ].toNewHampshireX().equals( p33_S ) ) { + return false; + } + final String p34_S = "B;"; + final Phylogeny[] p34 = factory.create( p34_S, new NHXParser() ); + if ( !p34[ 0 ].toNewHampshireX().equals( "B" ) ) { + return false; + } + final String p35_S = "B:0.2"; + final Phylogeny[] p35 = factory.create( p35_S, new NHXParser() ); + if ( !p35[ 0 ].toNewHampshireX().equals( p35_S ) ) { + return false; + } + final String p36_S = "(A)"; + final Phylogeny[] p36 = factory.create( p36_S, new NHXParser() ); + if ( !p36[ 0 ].toNewHampshireX().equals( p36_S ) ) { + return false; + } + final String p37_S = "((A))"; + final Phylogeny[] p37 = factory.create( p37_S, new NHXParser() ); + if ( !p37[ 0 ].toNewHampshireX().equals( p37_S ) ) { + return false; + } + final String p38_S = "(((((((A:0.2):0.2):0.3):0.4):0.5):0.6):0.7):0.8"; + final Phylogeny[] p38 = factory.create( p38_S, new NHXParser() ); + if ( !p38[ 0 ].toNewHampshireX().equals( p38_S ) ) { + return false; + } + final String p39_S = "(((B,((((A:0.2):0.2):0.3):0.4):0.5):0.6):0.7):0.8"; + final Phylogeny[] p39 = factory.create( p39_S, new NHXParser() ); + if ( !p39[ 0 ].toNewHampshireX().equals( p39_S ) ) { + return false; + } + final String p40_S = "(A,B,C)"; + final Phylogeny[] p40 = factory.create( p40_S, new NHXParser() ); + if ( !p40[ 0 ].toNewHampshireX().equals( p40_S ) ) { + return false; + } + final String p41_S = "(A,B,C,D,E,F,G,H,I,J,K)"; + final Phylogeny[] p41 = factory.create( p41_S, new NHXParser() ); + if ( !p41[ 0 ].toNewHampshireX().equals( p41_S ) ) { + return false; + } + final String p42_S = "(A,B,(X,Y,Z),D,E,F,G,H,I,J,K)"; + final Phylogeny[] p42 = factory.create( p42_S, new NHXParser() ); + if ( !p42[ 0 ].toNewHampshireX().equals( p42_S ) ) { + return false; + } + final String p43_S = "(A,B,C,(AA,BB,CC,(CCC,DDD,EEE,(FFFF,GGGG)x)y,DD,EE,FF,GG,HH),D,E,(EE,FF),F,G,H,(((((5)4)3)2)1),I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,(XX,(YY)),Y,Z)"; + final Phylogeny[] p43 = factory.create( p43_S, new NHXParser() ); + if ( !p43[ 0 ].toNewHampshireX().equals( p43_S ) ) { + return false; + } + final String p44_S = "(((A,B,C,D),(A,B,C,D),(A,B,C,D),(A,B,C,D)),((A,B,C,D),(A,B,C,D),(A,B,C,D),(A,B,C,D)),((A,B,C,D),(A,B,C,D),(A,B,C,D),(A,B,C,D)),((A,B,C,D),(A,B,C,D),(A,B,C,D),(A,B,C,D)))"; + final Phylogeny[] p44 = factory.create( p44_S, new NHXParser() ); + if ( !p44[ 0 ].toNewHampshireX().equals( p44_S ) ) { + return false; + } + final String p45_S = "((((((((((A))))))))),(((((((((B))))))))),(((((((((C))))))))))"; + final Phylogeny[] p45 = factory.create( p45_S, new NHXParser() ); + if ( !p45[ 0 ].toNewHampshireX().equals( p45_S ) ) { + return false; + } + final String p46_S = ""; + final Phylogeny[] p46 = factory.create( p46_S, new NHXParser() ); + if ( ( p46.length != 1 ) || !p46[ 0 ].isEmpty() ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testNHXconversion() { + try { + final PhylogenyNode n1 = new PhylogenyNode(); + final PhylogenyNode n2 = new PhylogenyNode( "" ); + final PhylogenyNode n3 = new PhylogenyNode( "n3" ); + final PhylogenyNode n4 = new PhylogenyNode( "n4:0.01" ); + final PhylogenyNode n5 = new 
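A minimal sketch of plain New Hampshire parsing as exercised by testNHParsing above; the helper name is hypothetical and imports are omitted.

    // Sketch only: several trees in one string, plus the underscore-replacement option.
    static void nhParsingSketch() throws Exception {
        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
        final Phylogeny[] trees = factory.create( "(A,B1);(C,D1)", new NHXParser() );
        System.out.println( trees.length );                 // 2
        System.out.println( trees[ 0 ].toNewHampshireX() ); // (A,B1)
        final NHXParser parser = new NHXParser();
        parser.setReplaceUnderscores( true );
        final Phylogeny t = factory.create( "(A_A,B_B)", parser )[ 0 ];
        System.out.println( t.getRoot().getChildNode( 0 ).getName() ); // A A
    }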
PhylogenyNode( "n5:0.1[&&NHX:S=Ecoli:E=1.1.1.1:D=Y:Co=Y:B=56:T=1:W=2:C=10.20.30:XN=S=tag1=value1=unit1]" ); + final PhylogenyNode n6 = new PhylogenyNode( "n6:0.000001[&&NHX:S=Ecoli:E=1.1.1.1:D=N:Co=N:B=100:T=1:W=2:C=0.0.0:XN=B=bool_tag=T]" ); + if ( !n1.toNewHampshireX().equals( "" ) ) { + return false; + } + if ( !n2.toNewHampshireX().equals( "" ) ) { + return false; + } + if ( !n3.toNewHampshireX().equals( "n3" ) ) { + return false; + } + if ( !n4.toNewHampshireX().equals( "n4:0.01" ) ) { + return false; + } + if ( !n5.toNewHampshireX() + .equals( "n5:0.1[&&NHX:T=1:S=Ecoli:D=Y:XN=S=tag1=value1=unit1:B=56.0:W=2.0:C=10.20.30]" ) ) { + return false; + } + if ( !n6.toNewHampshireX() + .equals( "n6:1.0E-6[&&NHX:T=1:S=Ecoli:D=N:XN=B=bool_tag=T:B=100.0:W=2.0:C=0.0.0]" ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testNHXNodeParsing() { + try { + final PhylogenyNode n1 = new PhylogenyNode(); + final PhylogenyNode n2 = new PhylogenyNode( "" ); + final PhylogenyNode n3 = new PhylogenyNode( "n3" ); + final PhylogenyNode n4 = new PhylogenyNode( "n4:0.01" ); + final PhylogenyNode n5 = new PhylogenyNode( "n5:0.1[&&NHX:S=Ecoli:E=1.1.1.1:D=Y:B=56:T=1:On=22:SOn=33:SNn=44:W=2:C=10.20.30:XN=S=tag1=value1=unit1:XN=S=tag3=value3=unit3]" ); + if ( !n3.getName().equals( "n3" ) ) { + return false; + } + if ( n3.getDistanceToParent() != PhylogenyNode.DISTANCE_DEFAULT ) { + return false; + } + if ( n3.isDuplication() ) { + return false; + } + if ( n3.isHasAssignedEvent() ) { + return false; + } + if ( PhylogenyMethods.getBranchWidthValue( n3 ) != BranchWidth.BRANCH_WIDTH_DEFAULT_VALUE ) { + return false; + } + if ( !n4.getName().equals( "n4" ) ) { + return false; + } + if ( n4.getDistanceToParent() != 0.01 ) { + return false; + } + if ( !n5.getName().equals( "n5" ) ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( n5 ) != 56 ) { + return false; + } + if ( n5.getDistanceToParent() != 0.1 ) { + return false; + } + if ( !PhylogenyMethods.getSpecies( n5 ).equals( "Ecoli" ) ) { + return false; + } + if ( !n5.isDuplication() ) { + return false; + } + if ( !n5.isHasAssignedEvent() ) { + return false; + } + if ( PhylogenyMethods.getBranchWidthValue( n5 ) != 2 ) { + return false; + } + if ( n5.getNodeData().getProperties().getPropertyRefs().length != 2 ) { + return false; + } + final PhylogenyNode n8 = new PhylogenyNode( "n8_ECOLI/12:0.01", + ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !n8.getName().equals( "n8_ECOLI/12" ) ) { + return false; + } + if ( !PhylogenyMethods.getSpecies( n8 ).equals( "ECOLI" ) ) { + return false; + } + final PhylogenyNode n9 = new PhylogenyNode( "n9_ECOLI/12=12:0.01", + ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !n9.getName().equals( "n9_ECOLI/12=12" ) ) { + return false; + } + if ( !PhylogenyMethods.getSpecies( n9 ).equals( "ECOLI" ) ) { + return false; + } + final PhylogenyNode n10 = new PhylogenyNode( "n10.ECOLI", ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !n10.getName().equals( "n10.ECOLI" ) ) { + return false; + } + final PhylogenyNode n20 = new PhylogenyNode( "n20_ECOLI/1-2", + ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !n20.getName().equals( "n20_ECOLI/1-2" ) ) { + return false; + } + if ( !PhylogenyMethods.getSpecies( n20 ).equals( "ECOLI" ) ) { + return false; + } + final PhylogenyNode n20x = new PhylogenyNode( "n20_ECOL1/1-2", ForesterUtil.TAXONOMY_EXTRACTION.YES ); + if ( 
!n20x.getName().equals( "n20_ECOL1/1-2" ) ) { + return false; + } + if ( !PhylogenyMethods.getSpecies( n20x ).equals( "ECOL1" ) ) { + return false; + } + final PhylogenyNode n20xx = new PhylogenyNode( "n20_eCOL1/1-2", + ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !n20xx.getName().equals( "n20_eCOL1/1-2" ) ) { + return false; + } + if ( PhylogenyMethods.getSpecies( n20xx ).length() > 0 ) { + return false; + } + final PhylogenyNode n20xxx = new PhylogenyNode( "n20_ecoli/1-2", + ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !n20xxx.getName().equals( "n20_ecoli/1-2" ) ) { + return false; + } + if ( PhylogenyMethods.getSpecies( n20xxx ).length() > 0 ) { + return false; + } + final PhylogenyNode n20xxxx = new PhylogenyNode( "n20_Ecoli/1-2", + ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !n20xxxx.getName().equals( "n20_Ecoli/1-2" ) ) { + return false; + } + if ( PhylogenyMethods.getSpecies( n20xxxx ).length() > 0 ) { + return false; + } + final PhylogenyNode n21 = new PhylogenyNode( "n21_PIG", ForesterUtil.TAXONOMY_EXTRACTION.YES ); + if ( !n21.getName().equals( "n21_PIG" ) ) { + return false; + } + if ( !PhylogenyMethods.getSpecies( n21 ).equals( "PIG" ) ) { + return false; + } + final PhylogenyNode n21x = new PhylogenyNode( "n21_PIG", ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !n21x.getName().equals( "n21_PIG" ) ) { + return false; + } + if ( PhylogenyMethods.getSpecies( n21x ).length() > 0 ) { + return false; + } + final PhylogenyNode n22 = new PhylogenyNode( "n22/PIG", ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !n22.getName().equals( "n22/PIG" ) ) { + return false; + } + if ( PhylogenyMethods.getSpecies( n22 ).length() > 0 ) { + return false; + } + final PhylogenyNode n23 = new PhylogenyNode( "n23/PIG_1", ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !n23.getName().equals( "n23/PIG_1" ) ) { + return false; + } + if ( PhylogenyMethods.getSpecies( n23 ).length() > 0 ) { + return false; + } + if ( NHXParser.LIMIT_SPECIES_NAMES_TO_FIVE_CHARS ) { + final PhylogenyNode a = new PhylogenyNode( "n10_ECOLI/1-2", + ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !a.getName().equals( "n10_ECOLI/1-2" ) ) { + return false; + } + if ( !PhylogenyMethods.getSpecies( a ).equals( "ECOLI" ) ) { + return false; + } + final PhylogenyNode b = new PhylogenyNode( "n10_ECOLI1/1-2", + ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !b.getName().equals( "n10_ECOLI1/1-2" ) ) { + return false; + } + if ( !PhylogenyMethods.getSpecies( b ).equals( "ECOLI" ) ) { + return false; + } + final PhylogenyNode c = new PhylogenyNode( "n10_RATAF12/1000-2000", + ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !c.getName().equals( "n10_RATAF12/1000-2000" ) ) { + return false; + } + if ( !PhylogenyMethods.getSpecies( c ).equals( "RATAF" ) ) { + return false; + } + final PhylogenyNode d = new PhylogenyNode( "n10_RAT1/1-2", + ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !d.getName().equals( "n10_RAT1/1-2" ) ) { + return false; + } + if ( !PhylogenyMethods.getSpecies( d ).equals( "RAT" ) ) { + return false; + } + final PhylogenyNode e = new PhylogenyNode( "n10_RAT1", ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !e.getName().equals( "n10_RAT1" ) ) { + return false; + } + if ( !ForesterUtil.isEmpty( PhylogenyMethods.getSpecies( e ) ) ) { + return false; + } + } + final PhylogenyNode n11 = new PhylogenyNode( "n111111_ECOLI/jdj:0.4", + ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( 
!n11.getName().equals( "n111111_ECOLI/jdj" ) ) { + return false; + } + if ( n11.getDistanceToParent() != 0.4 ) { + return false; + } + if ( !PhylogenyMethods.getSpecies( n11 ).equals( "ECOLI" ) ) { + return false; + } + final PhylogenyNode n12 = new PhylogenyNode( "n111111-ECOLI---/jdj:0.4", + ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !n12.getName().equals( "n111111-ECOLI---/jdj" ) ) { + return false; + } + if ( n12.getDistanceToParent() != 0.4 ) { + return false; + } + if ( PhylogenyMethods.getSpecies( n12 ).length() > 0 ) { + return false; + } + final Property tvu1 = n5.getNodeData().getProperties().getProperty( "tag1" ); + final Property tvu3 = n5.getNodeData().getProperties().getProperty( "tag3" ); + if ( !tvu1.getRef().equals( "tag1" ) ) { + return false; + } + if ( !tvu1.getDataType().equals( "xsd:string" ) ) { + return false; + } + if ( !tvu1.getUnit().equals( "unit1" ) ) { + return false; + } + if ( !tvu1.getValue().equals( "value1" ) ) { + return false; + } + if ( !tvu3.getRef().equals( "tag3" ) ) { + return false; + } + if ( !tvu3.getDataType().equals( "xsd:string" ) ) { + return false; + } + if ( !tvu3.getUnit().equals( "unit3" ) ) { + return false; + } + if ( !tvu3.getValue().equals( "value3" ) ) { + return false; + } + if ( n1.getName().compareTo( "" ) != 0 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( n1 ) != Confidence.CONFIDENCE_DEFAULT_VALUE ) { + return false; + } + if ( n1.getDistanceToParent() != PhylogenyNode.DISTANCE_DEFAULT ) { + return false; + } + if ( n2.getName().compareTo( "" ) != 0 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( n2 ) != Confidence.CONFIDENCE_DEFAULT_VALUE ) { + return false; + } + if ( n2.getDistanceToParent() != PhylogenyNode.DISTANCE_DEFAULT ) { + return false; + } + final PhylogenyNode n00 = new PhylogenyNode( "n7:0.000001[&&NHX:GN=gene_name:AC=accession123:ID=node_identifier:S=Ecoli:D=N:Co=N:B=100:T=1:On=100:SOn=100:SNn=100:W=2:C=0.0.0:XN=U=url_tag=www.yahoo.com]" ); + if ( !n00.getNodeData().getNodeIdentifier().getValue().equals( "node_identifier" ) ) { + return false; + } + if ( !n00.getNodeData().getSequence().getName().equals( "gene_name" ) ) { + return false; + } + if ( !n00.getNodeData().getSequence().getAccession().getValue().equals( "accession123" ) ) { + return false; + } + if ( !n00.getNodeData().getProperties().getProperty( "url_tag" ).getRef().equals( "url_tag" ) ) { + return false; + } + if ( n00.getNodeData().getProperties().getProperty( "url_tag" ).getAppliesTo() != Property.AppliesTo.NODE ) { + return false; + } + if ( !n00.getNodeData().getProperties().getProperty( "url_tag" ).getDataType().equals( "xsd:anyURI" ) ) { + return false; + } + if ( !n00.getNodeData().getProperties().getProperty( "url_tag" ).getValue().equals( "www.yahoo.com" ) ) { + return false; + } + if ( !n00.getNodeData().getProperties().getProperty( "url_tag" ).getUnit().equals( "" ) ) { + return false; + } + final PhylogenyNode nx = new PhylogenyNode( "n5:0.1[&&NHX:S=Ecoli:GN=gene_1]" ); + if ( !nx.getNodeData().getSequence().getName().equals( "gene_1" ) ) { + return false; + } + final PhylogenyNode nx2 = new PhylogenyNode( "n5:0.1[&&NHX:S=Ecoli:G=gene_2]" ); + if ( !nx2.getNodeData().getSequence().getName().equals( "gene_2" ) ) { + return false; + } + final PhylogenyNode n13 = new PhylogenyNode( "blah_12345/1-2", + ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !n13.getName().equals( "blah_12345/1-2" ) ) { + return false; + } + if ( !PhylogenyMethods.getSpecies( n13 ).equals( "" ) ) { 
+ return false; + } + final PhylogenyNode n14 = new PhylogenyNode( "blah_12X45/1-2", + ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !n14.getName().equals( "blah_12X45/1-2" ) ) { + return false; + } + if ( !PhylogenyMethods.getSpecies( n14 ).equals( "12X45" ) ) { + return false; + } + final PhylogenyNode n15 = new PhylogenyNode( "something_wicked[123]", + ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !n15.getName().equals( "something_wicked" ) ) { + return false; + } + if ( n15.getBranchData().getNumberOfConfidences() != 1 ) { + return false; + } + if ( !isEqual( n15.getBranchData().getConfidence( 0 ).getValue(), 123 ) ) { + return false; + } + final PhylogenyNode n16 = new PhylogenyNode( "something_wicked2[9]", + ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !n16.getName().equals( "something_wicked2" ) ) { + return false; + } + if ( n16.getBranchData().getNumberOfConfidences() != 1 ) { + return false; + } + if ( !isEqual( n16.getBranchData().getConfidence( 0 ).getValue(), 9 ) ) { + return false; + } + final PhylogenyNode n17 = new PhylogenyNode( "something_wicked3[a]", + ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !n17.getName().equals( "something_wicked3" ) ) { + return false; + } + if ( n17.getBranchData().getNumberOfConfidences() != 0 ) { + return false; + } + final PhylogenyNode n18 = new PhylogenyNode( ":0.5[91]", ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ); + if ( !isEqual( n18.getDistanceToParent(), 0.5 ) ) { + return false; + } + if ( n18.getBranchData().getNumberOfConfidences() != 1 ) { + return false; + } + if ( !isEqual( n18.getBranchData().getConfidence( 0 ).getValue(), 91 ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testNHXParsing() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny p1 = factory.create( "(A [&&NHX:S=a_species],B1[&&NHX:S=b_species])", new NHXParser() )[ 0 ]; + if ( !p1.toNewHampshireX().equals( "(A[&&NHX:S=a_species],B1[&&NHX:S=b_species])" ) ) { + return false; + } + final String p2_S = "(((((((A:0.2[&&NHX:S=qwerty]):0.2[&&NHX:S=uiop]):0.3[&&NHX:S=asdf]):0.4[&&NHX:S=zxc]):0.5[&&NHX:S=a]):0.6[&&NHX:S=asd]):0.7[&&NHX:S=za]):0.8[&&NHX:S=zaq]"; + final Phylogeny[] p2 = factory.create( p2_S, new NHXParser() ); + if ( !p2[ 0 ].toNewHampshireX().equals( p2_S ) ) { + return false; + } + final String p2b_S = "(((((((A:0.2[&NHX:S=qwerty]):0.2[&:S=uiop]):0.3[&NHX:S=asdf]):0.4[S=zxc]):0.5[]):0.6[&&NH:S=asd]):0.7[&&HX:S=za]):0.8[&&:S=zaq]"; + final Phylogeny[] p2b = factory.create( p2b_S, new NHXParser() ); + if ( !p2b[ 0 ].toNewHampshireX().equals( "(((((((A:0.2):0.2):0.3):0.4):0.5):0.6):0.7):0.8" ) ) { + return false; + } + final Phylogeny[] p3 = factory + .create( "[ comment&&NHX,())))](((((((A:0.2[&&NHX:S=qwerty]):0.2[&&NHX:S=uiop]):0.3[&&NHX:S=asdf]):0.4[&&NHX:S=zxc]):0.5[&&NHX:S=a]):0.6[&&NHX:S=asd]):0.7[&&NHX:S=za]):0.8[&&NHX:S=zaq]", + new NHXParser() ); + if ( !p3[ 0 ].toNewHampshireX().equals( p2_S ) ) { + return false; + } + final Phylogeny[] p4 = factory + .create( "(((((((A:0.2[&&NHX:S=qwerty]):0.2[&&NHX:S=uiop]):0.3[&&NHX:S=asdf]):0.4[&&NHX:S=zxc]):0.5[&&NHX:S=a]):0.6[&&NHX:S=asd]):0.7[&&NHX:S=za]):0.8[&&NHX:S=zaq][comment(]", + new NHXParser() ); + if ( !p4[ 0 ].toNewHampshireX().equals( p2_S ) ) { + return false; + } + final Phylogeny[] p5 = factory + .create( "[] ( [][ ][ ] ([((( &&NHXcomment 
only![[[[[[]([]((((A:0.2[&&NHX:S=q[comment )))]werty][,,,,))]):0.2[&&NHX:S=uiop]):0.3[&&NHX:S=a[comment,,))]sdf])[comment(((]:0.4[&&NHX:S=zxc][comment(((][comment(((]):0.5[&&NHX:S=a]):0.6[&&NHX:S=a[comment(((]sd]):0.7[&&NHX:S=za]):0.8[&&NHX:S=zaq][comment(((]", + new NHXParser() ); + if ( !p5[ 0 ].toNewHampshireX().equals( p2_S ) ) { + return false; + } + final String p6_S_C = "(A[][][][1][22][333][4444][55555][666666][&&NHX:S=Aspecies],B[))],C,(AA,BB,CC,(CCC,DDD,EEE,[comment](FFFF,GGGG)x)y,D[comment]D,EE,FF,GG,HH),D,E,(EE,FF),F,G,H,(((((5)4)3)2)1),I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,(XX,(YY)),Y,Z)"; + final String p6_S_WO_C = "(A[&&NHX:S=Aspecies],B,C,(AA,BB,CC,(CCC,DDD,EEE,(FFFF,GGGG)x)y,DD,EE,FF,GG,HH),D,E,(EE,FF),F,G,H,(((((5)4)3)2)1),I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,(XX,(YY)),Y,Z)"; + final Phylogeny[] p6 = factory.create( p6_S_C, new NHXParser() ); + if ( !p6[ 0 ].toNewHampshireX().equals( p6_S_WO_C ) ) { + return false; + } + final String p7_S_C = "(((A [&&NHX:S=species_a], B [&&NHX:S=Vstorri] , C , D),(A,B,C,D[comment])[],[c][]([xxx]A[comment],[comment]B[comment][comment],[comment][comment]C[comment][comment],[comment][comment]D[comment][comment])[comment][comment],[comment] [comment](A,B,C,D)),((A,B,C,D),(A,B,C,D),(A,B,C,D),(A,B,C,D)),((A,B,C[comment][comment][comment][comment][comment] [comment],D),(A,B,C,D),(A,B,C,D),(A,B,C,D)),[comment][comment]((A,B,C,D),(A,B,C,D),(A,B,C,D),(A,B,C,D)))"; + final String p7_S_WO_C = "(((A[&&NHX:S=species_a],B[&&NHX:S=Vstorri],C,D),(A,B,C,D),(A,B,C,D),(A,B,C,D)),((A,B,C,D),(A,B,C,D),(A,B,C,D),(A,B,C,D)),((A,B,C,D),(A,B,C,D),(A,B,C,D),(A,B,C,D)),((A,B,C,D),(A,B,C,D),(A,B,C,D),(A,B,C,D)))"; + final Phylogeny[] p7 = factory.create( p7_S_C, new NHXParser() ); + if ( !p7[ 0 ].toNewHampshireX().equals( p7_S_WO_C ) ) { + return false; + } + final String p8_S_C = "[cmt](((([]([))))))](((((A[&&NHX:S= [a comment] a])))))))[too many comments!:)])),(((((((((B[&&NHX[ a comment in a bad place]:S =b])))))[] [] )))),(((((((((C[&&NHX:S=c]) ))[,,, ])))))))"; + final String p8_S_WO_C = "((((((((((A[&&NHX:S=a]))))))))),(((((((((B[&&NHX:S=b]))))))))),(((((((((C[&&NHX:S=c]))))))))))"; + final Phylogeny[] p8 = factory.create( p8_S_C, new NHXParser() ); + if ( !p8[ 0 ].toNewHampshireX().equals( p8_S_WO_C ) ) { + return false; + } + final Phylogeny p9 = factory.create( "((A:0.2,B:0.3):0.5[91],C:0.1)root:0.1[100]", new NHXParser() )[ 0 ]; + if ( !p9.toNewHampshireX().equals( "((A:0.2,B:0.3):0.5[&&NHX:B=91.0],C:0.1)root:0.1[&&NHX:B=100.0]" ) ) { + return false; + } + final Phylogeny p10 = factory + .create( " [79] ( (A [co mment] :0 .2[comment],B:0.3[com])[com ment]: 0. 
5 \t[ 9 1 ][ comment],C: 0.1)[comment]root:0.1[100] [comment]", + new NHXParser() )[ 0 ]; + if ( !p10.toNewHampshireX().equals( "((A:0.2,B:0.3):0.5[&&NHX:B=91.0],C:0.1)root:0.1[&&NHX:B=100.0]" ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testNHXParsingQuotes() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final NHXParser p = new NHXParser(); + final Phylogeny[] phylogenies_0 = factory.create( new File( Test.PATH_TO_TEST_DATA + "quotes.nhx" ), p ); + if ( phylogenies_0.length != 5 ) { + return false; + } + final Phylogeny phy = phylogenies_0[ 4 ]; + if ( phy.getNumberOfExternalNodes() != 7 ) { + return false; + } + if ( phy.getNodes( "a name in double quotes from tree ((a,b),c)" ).size() != 1 ) { + return false; + } + if ( phy.getNodes( "charles darwin 'origin of species'" ).size() != 1 ) { + return false; + } + if ( !phy.getNodes( "charles darwin 'origin of species'" ).get( 0 ).getNodeData().getTaxonomy() + .getScientificName().equals( "hsapiens" ) ) { + return false; + } + if ( phy.getNodes( "shouldbetogether single quotes" ).size() != 1 ) { + return false; + } + if ( phy.getNodes( "'single quotes' inside double quotes" ).size() != 1 ) { + return false; + } + if ( phy.getNodes( "double quotes inside single quotes" ).size() != 1 ) { + return false; + } + if ( phy.getNodes( "noquotes" ).size() != 1 ) { + return false; + } + if ( phy.getNodes( "A ( B C '" ).size() != 1 ) { + return false; + } + final NHXParser p1p = new NHXParser(); + p1p.setIgnoreQuotes( true ); + final Phylogeny p1 = factory.create( "(\"A\",'B1')", p1p )[ 0 ]; + if ( !p1.toNewHampshire().equals( "(A,B1);" ) ) { + return false; + } + final NHXParser p2p = new NHXParser(); + p1p.setIgnoreQuotes( false ); + final Phylogeny p2 = factory.create( "(\"A\",'B1')", p2p )[ 0 ]; + if ( !p2.toNewHampshire().equals( "(A,B1);" ) ) { + return false; + } + final NHXParser p3p = new NHXParser(); + p3p.setIgnoreQuotes( false ); + final Phylogeny p3 = factory.create( "(\"A)\",'B1')", p3p )[ 0 ]; + if ( !p3.toNewHampshire().equals( "('A)',B1);" ) ) { + return false; + } + final NHXParser p4p = new NHXParser(); + p4p.setIgnoreQuotes( false ); + final Phylogeny p4 = factory.create( "(\"A)\",'B(),; x')", p4p )[ 0 ]; + if ( !p4.toNewHampshire().equals( "('A)','B(),; x');" ) ) { + return false; + } + final Phylogeny p10 = factory + .create( " [79] ( (\"A \n\tB \" [co mment] :0 .2[comment],'B':0.3[com])[com ment]: 0. 5 \t[ 9 1 ][ comment],'C (or D?\\//;,))': 0.1)[comment]'\nroot is here (cool, was! ) ':0.1[100] [comment]", + new NHXParser() )[ 0 ]; + final String p10_clean_str = "(('A B':0.2,B:0.3):0.5[&&NHX:B=91.0],'C (or D?\\//;,))':0.1)'root is here (cool, was! )':0.1[&&NHX:B=100.0]"; + if ( !p10.toNewHampshireX().equals( p10_clean_str ) ) { + return false; + } + final Phylogeny p11 = factory.create( p10.toNewHampshireX(), new NHXParser() )[ 0 ]; + if ( !p11.toNewHampshireX().equals( p10_clean_str ) ) { + return false; + } + // + final Phylogeny p12 = factory + .create( " [79] ( (\"A \n\tB \" [[][] :0 .2[comment][\t&\t&\n N\tH\tX:S=mo\tnkey !],'\tB\t\b\t\n\f\rB B ':0.0\b3[])\t[com ment]: 0. 5 \t[ 9 1 ][ \ncomment],'C\t (or D?\\//;,))': 0.\b1)[comment]'\nroot \tis here (cool, \b\t\n\f\r was! ) ':0.1[100] [comment]", + new NHXParser() )[ 0 ]; + final String p12_clean_str = "(('A B':0.2[&&NHX:S=monkey!],'BB B':0.03):0.5[&&NHX:B=91.0],'C (or D?\\//;,))':0.1)'root is here (cool, was! 
)':0.1[&&NHX:B=100.0]"; + if ( !p12.toNewHampshireX().equals( p12_clean_str ) ) { + return false; + } + final Phylogeny p13 = factory.create( p12.toNewHampshireX(), new NHXParser() )[ 0 ]; + if ( !p13.toNewHampshireX().equals( p12_clean_str ) ) { + return false; + } + final String p12_clean_str_nh = "(('A B':0.2,'BB B':0.03):0.5,'C (or D?\\//;,))':0.1)'root is here (cool, was! )':0.1;"; + if ( !p13.toNewHampshire().equals( p12_clean_str_nh ) ) { + return false; + } + final Phylogeny p14 = factory.create( p13.toNewHampshire(), new NHXParser() )[ 0 ]; + if ( !p14.toNewHampshire().equals( p12_clean_str_nh ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testPhylogenyBranch() { + try { + final PhylogenyNode a1 = new PhylogenyNode( "a" ); + final PhylogenyNode b1 = new PhylogenyNode( "b" ); + final PhylogenyBranch a1b1 = new PhylogenyBranch( a1, b1 ); + final PhylogenyBranch b1a1 = new PhylogenyBranch( b1, a1 ); + if ( !a1b1.equals( a1b1 ) ) { + return false; + } + if ( !a1b1.equals( b1a1 ) ) { + return false; + } + if ( !b1a1.equals( a1b1 ) ) { + return false; + } + final PhylogenyBranch a1_b1 = new PhylogenyBranch( a1, b1, true ); + final PhylogenyBranch b1_a1 = new PhylogenyBranch( b1, a1, true ); + final PhylogenyBranch a1_b1_ = new PhylogenyBranch( a1, b1, false ); + if ( a1_b1.equals( b1_a1 ) ) { + return false; + } + if ( a1_b1.equals( a1_b1_ ) ) { + return false; + } + final PhylogenyBranch b1_a1_ = new PhylogenyBranch( b1, a1, false ); + if ( !a1_b1.equals( b1_a1_ ) ) { + return false; + } + if ( a1_b1_.equals( b1_a1_ ) ) { + return false; + } + if ( !a1_b1_.equals( b1_a1 ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testPhyloXMLparsingOfDistributionElement() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + PhyloXmlParser xml_parser = null; + try { + xml_parser = PhyloXmlParser.createPhyloXmlParserXsdValidating(); + } + catch ( final Exception e ) { + // Do nothing -- means were not running from jar. 
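+ // Fall back below: construct a PhyloXmlParser directly and set explicit schema validation (local or remote XSD).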
+ } + if ( xml_parser == null ) { + xml_parser = new PhyloXmlParser(); + if ( USE_LOCAL_PHYLOXML_SCHEMA ) { + xml_parser.setValidateAgainstSchema( PHYLOXML_LOCAL_XSD ); + } + else { + xml_parser.setValidateAgainstSchema( PHYLOXML_REMOTE_XSD ); + } + } + final Phylogeny[] phylogenies_0 = factory.create( Test.PATH_TO_TEST_DATA + "phyloxml_distribution.xml", + xml_parser ); + if ( xml_parser.getErrorCount() > 0 ) { + System.out.println( xml_parser.getErrorMessages().toString() ); + return false; + } + if ( phylogenies_0.length != 1 ) { + return false; + } + final Phylogeny t1 = phylogenies_0[ 0 ]; + PhylogenyNode n = null; + Distribution d = null; + n = t1.getNode( "root node" ); + if ( !n.getNodeData().isHasDistribution() ) { + return false; + } + if ( n.getNodeData().getDistributions().size() != 1 ) { + return false; + } + d = n.getNodeData().getDistribution(); + if ( !d.getDesc().equals( "Hirschweg 38" ) ) { + return false; + } + if ( d.getPoints().size() != 1 ) { + return false; + } + if ( d.getPolygons() != null ) { + return false; + } + if ( !d.getPoints().get( 0 ).getAltitude().toString().equals( "472" ) ) { + return false; + } + if ( !d.getPoints().get( 0 ).getAltiudeUnit().equals( "m" ) ) { + return false; + } + if ( !d.getPoints().get( 0 ).getGeodeticDatum().equals( "WGS84" ) ) { + return false; + } + if ( !d.getPoints().get( 0 ).getLatitude().toString().equals( "47.48148427110029" ) ) { + return false; + } + if ( !d.getPoints().get( 0 ).getLongitude().toString().equals( "8.768951296806335" ) ) { + return false; + } + n = t1.getNode( "node a" ); + if ( !n.getNodeData().isHasDistribution() ) { + return false; + } + if ( n.getNodeData().getDistributions().size() != 2 ) { + return false; + } + d = n.getNodeData().getDistribution( 1 ); + if ( !d.getDesc().equals( "San Diego" ) ) { + return false; + } + if ( d.getPoints().size() != 1 ) { + return false; + } + if ( d.getPolygons() != null ) { + return false; + } + if ( !d.getPoints().get( 0 ).getAltitude().toString().equals( "104" ) ) { + return false; + } + if ( !d.getPoints().get( 0 ).getAltiudeUnit().equals( "m" ) ) { + return false; + } + if ( !d.getPoints().get( 0 ).getGeodeticDatum().equals( "WGS84" ) ) { + return false; + } + if ( !d.getPoints().get( 0 ).getLatitude().toString().equals( "32.880933" ) ) { + return false; + } + if ( !d.getPoints().get( 0 ).getLongitude().toString().equals( "-117.217543" ) ) { + return false; + } + n = t1.getNode( "node bb" ); + if ( !n.getNodeData().isHasDistribution() ) { + return false; + } + if ( n.getNodeData().getDistributions().size() != 1 ) { + return false; + } + d = n.getNodeData().getDistribution( 0 ); + if ( d.getPoints().size() != 3 ) { + return false; + } + if ( d.getPolygons().size() != 2 ) { + return false; + } + if ( !d.getPoints().get( 0 ).getLatitude().toString().equals( "1" ) ) { + return false; + } + if ( !d.getPoints().get( 0 ).getLongitude().toString().equals( "2" ) ) { + return false; + } + if ( !d.getPoints().get( 1 ).getLatitude().toString().equals( "3" ) ) { + return false; + } + if ( !d.getPoints().get( 1 ).getLongitude().toString().equals( "4" ) ) { + return false; + } + if ( !d.getPoints().get( 2 ).getLatitude().toString().equals( "5" ) ) { + return false; + } + if ( !d.getPoints().get( 2 ).getLongitude().toString().equals( "6" ) ) { + return false; + } + Polygon p = d.getPolygons().get( 0 ); + if ( p.getPoints().size() != 3 ) { + return false; + } + if ( !p.getPoints().get( 0 ).getLatitude().toString().equals( "0.1" ) ) { + return false; + } + if ( !p.getPoints().get( 0 
).getLongitude().toString().equals( "0.2" ) ) { + return false; + } + if ( !p.getPoints().get( 0 ).getAltitude().toString().equals( "10" ) ) { + return false; + } + if ( !p.getPoints().get( 2 ).getLatitude().toString().equals( "0.5" ) ) { + return false; + } + if ( !p.getPoints().get( 2 ).getLongitude().toString().equals( "0.6" ) ) { + return false; + } + if ( !p.getPoints().get( 2 ).getAltitude().toString().equals( "30" ) ) { + return false; + } + p = d.getPolygons().get( 1 ); + if ( p.getPoints().size() != 3 ) { + return false; + } + if ( !p.getPoints().get( 0 ).getLatitude().toString().equals( "1.49348902489947473" ) ) { + return false; + } + if ( !p.getPoints().get( 0 ).getLongitude().toString().equals( "2.567489393947847492" ) ) { + return false; + } + if ( !p.getPoints().get( 0 ).getAltitude().toString().equals( "10" ) ) { + return false; + } + // Roundtrip: + final StringBuffer t1_sb = new StringBuffer( t1.toPhyloXML( 0 ) ); + final Phylogeny[] rt = factory.create( t1_sb, xml_parser ); + if ( rt.length != 1 ) { + return false; + } + final Phylogeny t1_rt = rt[ 0 ]; + n = t1_rt.getNode( "root node" ); + if ( !n.getNodeData().isHasDistribution() ) { + return false; + } + if ( n.getNodeData().getDistributions().size() != 1 ) { + return false; + } + d = n.getNodeData().getDistribution(); + if ( !d.getDesc().equals( "Hirschweg 38" ) ) { + return false; + } + if ( d.getPoints().size() != 1 ) { + return false; + } + if ( d.getPolygons() != null ) { + return false; + } + if ( !d.getPoints().get( 0 ).getAltitude().toString().equals( "472" ) ) { + return false; + } + if ( !d.getPoints().get( 0 ).getAltiudeUnit().equals( "m" ) ) { + return false; + } + if ( !d.getPoints().get( 0 ).getGeodeticDatum().equals( "WGS84" ) ) { + return false; + } + if ( !d.getPoints().get( 0 ).getLatitude().toString().equals( "47.48148427110029" ) ) { + return false; + } + if ( !d.getPoints().get( 0 ).getLongitude().toString().equals( "8.768951296806335" ) ) { + return false; + } + n = t1_rt.getNode( "node a" ); + if ( !n.getNodeData().isHasDistribution() ) { + return false; + } + if ( n.getNodeData().getDistributions().size() != 2 ) { + return false; + } + d = n.getNodeData().getDistribution( 1 ); + if ( !d.getDesc().equals( "San Diego" ) ) { + return false; + } + if ( d.getPoints().size() != 1 ) { + return false; + } + if ( d.getPolygons() != null ) { + return false; + } + if ( !d.getPoints().get( 0 ).getAltitude().toString().equals( "104" ) ) { + return false; + } + if ( !d.getPoints().get( 0 ).getAltiudeUnit().equals( "m" ) ) { + return false; + } + if ( !d.getPoints().get( 0 ).getGeodeticDatum().equals( "WGS84" ) ) { + return false; + } + if ( !d.getPoints().get( 0 ).getLatitude().toString().equals( "32.880933" ) ) { + return false; + } + if ( !d.getPoints().get( 0 ).getLongitude().toString().equals( "-117.217543" ) ) { + return false; + } + n = t1_rt.getNode( "node bb" ); + if ( !n.getNodeData().isHasDistribution() ) { + return false; + } + if ( n.getNodeData().getDistributions().size() != 1 ) { + return false; + } + d = n.getNodeData().getDistribution( 0 ); + if ( d.getPoints().size() != 3 ) { + return false; + } + if ( d.getPolygons().size() != 2 ) { + return false; + } + if ( !d.getPoints().get( 0 ).getLatitude().toString().equals( "1" ) ) { + return false; + } + if ( !d.getPoints().get( 0 ).getLongitude().toString().equals( "2" ) ) { + return false; + } + if ( !d.getPoints().get( 1 ).getLatitude().toString().equals( "3" ) ) { + return false; + } + if ( !d.getPoints().get( 1 
).getLongitude().toString().equals( "4" ) ) { + return false; + } + if ( !d.getPoints().get( 2 ).getLatitude().toString().equals( "5" ) ) { + return false; + } + if ( !d.getPoints().get( 2 ).getLongitude().toString().equals( "6" ) ) { + return false; + } + p = d.getPolygons().get( 0 ); + if ( p.getPoints().size() != 3 ) { + return false; + } + if ( !p.getPoints().get( 0 ).getLatitude().toString().equals( "0.1" ) ) { + return false; + } + if ( !p.getPoints().get( 0 ).getLongitude().toString().equals( "0.2" ) ) { + return false; + } + if ( !p.getPoints().get( 0 ).getAltitude().toString().equals( "10" ) ) { + return false; + } + if ( !p.getPoints().get( 2 ).getLatitude().toString().equals( "0.5" ) ) { + return false; + } + if ( !p.getPoints().get( 2 ).getLongitude().toString().equals( "0.6" ) ) { + return false; + } + if ( !p.getPoints().get( 2 ).getAltitude().toString().equals( "30" ) ) { + return false; + } + p = d.getPolygons().get( 1 ); + if ( p.getPoints().size() != 3 ) { + return false; + } + if ( !p.getPoints().get( 0 ).getLatitude().toString().equals( "1.49348902489947473" ) ) { + return false; + } + if ( !p.getPoints().get( 0 ).getLongitude().toString().equals( "2.567489393947847492" ) ) { + return false; + } + if ( !p.getPoints().get( 0 ).getAltitude().toString().equals( "10" ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testPostOrderIterator() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny t0 = factory.create( "((A,B)ab,(C,D)cd)r", new NHXParser() )[ 0 ]; + PhylogenyNodeIterator it0; + for( it0 = t0.iteratorPostorder(); it0.hasNext(); ) { + it0.next(); + } + for( it0.reset(); it0.hasNext(); ) { + it0.next(); + } + final Phylogeny t1 = factory.create( "(((A,B)ab,(C,D)cd)abcd,((E,F)ef,(G,H)gh)efgh)r", new NHXParser() )[ 0 ]; + final PhylogenyNodeIterator it = t1.iteratorPostorder(); + if ( !it.next().getName().equals( "A" ) ) { + return false; + } + if ( !it.next().getName().equals( "B" ) ) { + return false; + } + if ( !it.next().getName().equals( "ab" ) ) { + return false; + } + if ( !it.next().getName().equals( "C" ) ) { + return false; + } + if ( !it.next().getName().equals( "D" ) ) { + return false; + } + if ( !it.next().getName().equals( "cd" ) ) { + return false; + } + if ( !it.next().getName().equals( "abcd" ) ) { + return false; + } + if ( !it.next().getName().equals( "E" ) ) { + return false; + } + if ( !it.next().getName().equals( "F" ) ) { + return false; + } + if ( !it.next().getName().equals( "ef" ) ) { + return false; + } + if ( !it.next().getName().equals( "G" ) ) { + return false; + } + if ( !it.next().getName().equals( "H" ) ) { + return false; + } + if ( !it.next().getName().equals( "gh" ) ) { + return false; + } + if ( !it.next().getName().equals( "efgh" ) ) { + return false; + } + if ( !it.next().getName().equals( "r" ) ) { + return false; + } + if ( it.hasNext() ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testPreOrderIterator() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny t0 = factory.create( "((A,B)ab,(C,D)cd)r", new NHXParser() )[ 0 ]; + PhylogenyNodeIterator it0; + for( it0 = t0.iteratorPreorder(); it0.hasNext(); ) { + it0.next(); + } + for( it0.reset(); it0.hasNext(); ) { + it0.next(); + } + 
PhylogenyNodeIterator it = t0.iteratorPreorder(); + if ( !it.next().getName().equals( "r" ) ) { + return false; + } + if ( !it.next().getName().equals( "ab" ) ) { + return false; + } + if ( !it.next().getName().equals( "A" ) ) { + return false; + } + if ( !it.next().getName().equals( "B" ) ) { + return false; + } + if ( !it.next().getName().equals( "cd" ) ) { + return false; + } + if ( !it.next().getName().equals( "C" ) ) { + return false; + } + if ( !it.next().getName().equals( "D" ) ) { + return false; + } + if ( it.hasNext() ) { + return false; + } + final Phylogeny t1 = factory.create( "(((A,B)ab,(C,D)cd)abcd,((E,F)ef,(G,H)gh)efgh)r", new NHXParser() )[ 0 ]; + it = t1.iteratorPreorder(); + if ( !it.next().getName().equals( "r" ) ) { + return false; + } + if ( !it.next().getName().equals( "abcd" ) ) { + return false; + } + if ( !it.next().getName().equals( "ab" ) ) { + return false; + } + if ( !it.next().getName().equals( "A" ) ) { + return false; + } + if ( !it.next().getName().equals( "B" ) ) { + return false; + } + if ( !it.next().getName().equals( "cd" ) ) { + return false; + } + if ( !it.next().getName().equals( "C" ) ) { + return false; + } + if ( !it.next().getName().equals( "D" ) ) { + return false; + } + if ( !it.next().getName().equals( "efgh" ) ) { + return false; + } + if ( !it.next().getName().equals( "ef" ) ) { + return false; + } + if ( !it.next().getName().equals( "E" ) ) { + return false; + } + if ( !it.next().getName().equals( "F" ) ) { + return false; + } + if ( !it.next().getName().equals( "gh" ) ) { + return false; + } + if ( !it.next().getName().equals( "G" ) ) { + return false; + } + if ( !it.next().getName().equals( "H" ) ) { + return false; + } + if ( it.hasNext() ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testPropertiesMap() { + try { + final PropertiesMap pm = new PropertiesMap(); + final Property p0 = new Property( "dimensions:diameter", "1", "metric:mm", "xsd:decimal", AppliesTo.NODE ); + final Property p1 = new Property( "dimensions:length", "2", "metric:mm", "xsd:decimal", AppliesTo.NODE ); + final Property p2 = new Property( "something:else", + "?", + "improbable:research", + "xsd:decimal", + AppliesTo.NODE ); + pm.addProperty( p0 ); + pm.addProperty( p1 ); + pm.addProperty( p2 ); + if ( !pm.getProperty( "dimensions:diameter" ).getValue().equals( "1" ) ) { + return false; + } + if ( !pm.getProperty( "dimensions:length" ).getValue().equals( "2" ) ) { + return false; + } + if ( pm.getProperties().size() != 3 ) { + return false; + } + if ( pm.getPropertiesWithGivenReferencePrefix( "dimensions" ).size() != 2 ) { + return false; + } + if ( pm.getPropertiesWithGivenReferencePrefix( "something" ).size() != 1 ) { + return false; + } + if ( pm.getProperties().size() != 3 ) { + return false; + } + pm.removeProperty( "dimensions:diameter" ); + if ( pm.getProperties().size() != 2 ) { + return false; + } + if ( pm.getPropertiesWithGivenReferencePrefix( "dimensions" ).size() != 1 ) { + return false; + } + if ( pm.getPropertiesWithGivenReferencePrefix( "something" ).size() != 1 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testReIdMethods() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny p = factory.create( "((1,2)A,(((X,Y,Z)a,b)3)B,(4,5,6)C)r", new NHXParser() )[ 0 ]; + final int 
count = PhylogenyNode.getNodeCount(); + p.levelOrderReID(); + if ( p.getNode( "r" ).getId() != count ) { + return false; + } + if ( p.getNode( "A" ).getId() != count + 1 ) { + return false; + } + if ( p.getNode( "B" ).getId() != count + 1 ) { + return false; + } + if ( p.getNode( "C" ).getId() != count + 1 ) { + return false; + } + if ( p.getNode( "1" ).getId() != count + 2 ) { + return false; + } + if ( p.getNode( "2" ).getId() != count + 2 ) { + return false; + } + if ( p.getNode( "3" ).getId() != count + 2 ) { + return false; + } + if ( p.getNode( "4" ).getId() != count + 2 ) { + return false; + } + if ( p.getNode( "5" ).getId() != count + 2 ) { + return false; + } + if ( p.getNode( "6" ).getId() != count + 2 ) { + return false; + } + if ( p.getNode( "a" ).getId() != count + 3 ) { + return false; + } + if ( p.getNode( "b" ).getId() != count + 3 ) { + return false; + } + if ( p.getNode( "X" ).getId() != count + 4 ) { + return false; + } + if ( p.getNode( "Y" ).getId() != count + 4 ) { + return false; + } + if ( p.getNode( "Z" ).getId() != count + 4 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testRerooting() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny t1 = factory.create( "((A:1,B:2)AB:1[&&NHX:B=55],(C:3,D:5)CD:3[&&NHX:B=10])ABCD:0.5", + new NHXParser() )[ 0 ]; + if ( !t1.isRooted() ) { + return false; + } + t1.reRoot( t1.getNode( "D" ) ); + t1.reRoot( t1.getNode( "CD" ) ); + t1.reRoot( t1.getNode( "A" ) ); + t1.reRoot( t1.getNode( "B" ) ); + t1.reRoot( t1.getNode( "AB" ) ); + t1.reRoot( t1.getNode( "D" ) ); + t1.reRoot( t1.getNode( "C" ) ); + t1.reRoot( t1.getNode( "CD" ) ); + t1.reRoot( t1.getNode( "A" ) ); + t1.reRoot( t1.getNode( "B" ) ); + t1.reRoot( t1.getNode( "AB" ) ); + t1.reRoot( t1.getNode( "D" ) ); + t1.reRoot( t1.getNode( "D" ) ); + t1.reRoot( t1.getNode( "C" ) ); + t1.reRoot( t1.getNode( "A" ) ); + t1.reRoot( t1.getNode( "B" ) ); + t1.reRoot( t1.getNode( "AB" ) ); + t1.reRoot( t1.getNode( "C" ) ); + t1.reRoot( t1.getNode( "D" ) ); + t1.reRoot( t1.getNode( "CD" ) ); + t1.reRoot( t1.getNode( "D" ) ); + t1.reRoot( t1.getNode( "A" ) ); + t1.reRoot( t1.getNode( "B" ) ); + t1.reRoot( t1.getNode( "AB" ) ); + t1.reRoot( t1.getNode( "C" ) ); + t1.reRoot( t1.getNode( "D" ) ); + t1.reRoot( t1.getNode( "CD" ) ); + t1.reRoot( t1.getNode( "D" ) ); + if ( !isEqual( t1.getNode( "A" ).getDistanceToParent(), 1 ) ) { + return false; + } + if ( !isEqual( t1.getNode( "B" ).getDistanceToParent(), 2 ) ) { + return false; + } + if ( !isEqual( t1.getNode( "C" ).getDistanceToParent(), 3 ) ) { + return false; + } + if ( !isEqual( t1.getNode( "D" ).getDistanceToParent(), 2.5 ) ) { + return false; + } + if ( !isEqual( t1.getNode( "CD" ).getDistanceToParent(), 2.5 ) ) { + return false; + } + if ( !isEqual( t1.getNode( "AB" ).getDistanceToParent(), 4 ) ) { + return false; + } + final Phylogeny t2 = factory.create( "(((A:1,B:2)AB:10[&&NHX:B=55],C)ABC:3[&&NHX:B=33],D:5)ABCD:0.5", + new NHXParser() )[ 0 ]; + t2.reRoot( t2.getNode( "A" ) ); + t2.reRoot( t2.getNode( "D" ) ); + t2.reRoot( t2.getNode( "ABC" ) ); + t2.reRoot( t2.getNode( "A" ) ); + t2.reRoot( t2.getNode( "B" ) ); + t2.reRoot( t2.getNode( "D" ) ); + t2.reRoot( t2.getNode( "C" ) ); + t2.reRoot( t2.getNode( "ABC" ) ); + t2.reRoot( t2.getNode( "A" ) ); + t2.reRoot( t2.getNode( "B" ) ); + t2.reRoot( t2.getNode( "AB" ) ); + t2.reRoot( t2.getNode( "AB" ) ); + t2.reRoot( 
t2.getNode( "D" ) ); + t2.reRoot( t2.getNode( "C" ) ); + t2.reRoot( t2.getNode( "B" ) ); + t2.reRoot( t2.getNode( "AB" ) ); + t2.reRoot( t2.getNode( "D" ) ); + t2.reRoot( t2.getNode( "D" ) ); + t2.reRoot( t2.getNode( "ABC" ) ); + t2.reRoot( t2.getNode( "A" ) ); + t2.reRoot( t2.getNode( "B" ) ); + t2.reRoot( t2.getNode( "AB" ) ); + t2.reRoot( t2.getNode( "D" ) ); + t2.reRoot( t2.getNode( "C" ) ); + t2.reRoot( t2.getNode( "ABC" ) ); + t2.reRoot( t2.getNode( "A" ) ); + t2.reRoot( t2.getNode( "B" ) ); + t2.reRoot( t2.getNode( "AB" ) ); + t2.reRoot( t2.getNode( "D" ) ); + t2.reRoot( t2.getNode( "D" ) ); + t2.reRoot( t2.getNode( "C" ) ); + t2.reRoot( t2.getNode( "A" ) ); + t2.reRoot( t2.getNode( "B" ) ); + t2.reRoot( t2.getNode( "AB" ) ); + t2.reRoot( t2.getNode( "C" ) ); + t2.reRoot( t2.getNode( "D" ) ); + t2.reRoot( t2.getNode( "ABC" ) ); + t2.reRoot( t2.getNode( "D" ) ); + t2.reRoot( t2.getNode( "A" ) ); + t2.reRoot( t2.getNode( "B" ) ); + t2.reRoot( t2.getNode( "AB" ) ); + t2.reRoot( t2.getNode( "C" ) ); + t2.reRoot( t2.getNode( "D" ) ); + t2.reRoot( t2.getNode( "ABC" ) ); + t2.reRoot( t2.getNode( "D" ) ); + if ( !isEqual( t2.getNode( "AB" ).getBranchData().getConfidence( 0 ).getValue(), 55 ) ) { + return false; + } + if ( !isEqual( t2.getNode( "ABC" ).getBranchData().getConfidence( 0 ).getValue(), 33 ) ) { + return false; + } + t2.reRoot( t2.getNode( "ABC" ) ); + if ( !isEqual( t2.getNode( "AB" ).getBranchData().getConfidence( 0 ).getValue(), 55 ) ) { + return false; + } + if ( !isEqual( t2.getNode( "ABC" ).getBranchData().getConfidence( 0 ).getValue(), 33 ) ) { + return false; + } + t2.reRoot( t2.getNode( "AB" ) ); + if ( !isEqual( t2.getNode( "AB" ).getBranchData().getConfidence( 0 ).getValue(), 55 ) ) { + return false; + } + if ( !isEqual( t2.getNode( "ABC" ).getBranchData().getConfidence( 0 ).getValue(), 55 ) ) { + return false; + } + if ( !isEqual( t2.getNode( "D" ).getBranchData().getConfidence( 0 ).getValue(), 33 ) ) { + return false; + } + t2.reRoot( t2.getNode( "AB" ) ); + if ( !isEqual( t2.getNode( "AB" ).getBranchData().getConfidence( 0 ).getValue(), 55 ) ) { + return false; + } + if ( !isEqual( t2.getNode( "ABC" ).getBranchData().getConfidence( 0 ).getValue(), 55 ) ) { + return false; + } + if ( !isEqual( t2.getNode( "D" ).getBranchData().getConfidence( 0 ).getValue(), 33 ) ) { + return false; + } + t2.reRoot( t2.getNode( "D" ) ); + if ( !isEqual( t2.getNode( "AB" ).getBranchData().getConfidence( 0 ).getValue(), 55 ) ) { + return false; + } + if ( !isEqual( t2.getNode( "ABC" ).getBranchData().getConfidence( 0 ).getValue(), 33 ) ) { + return false; + } + t2.reRoot( t2.getNode( "ABC" ) ); + if ( !isEqual( t2.getNode( "AB" ).getBranchData().getConfidence( 0 ).getValue(), 55 ) ) { + return false; + } + if ( !isEqual( t2.getNode( "ABC" ).getBranchData().getConfidence( 0 ).getValue(), 33 ) ) { + return false; + } + final Phylogeny t3 = factory.create( "(A[&&NHX:B=10],B[&&NHX:B=20],C[&&NHX:B=30],D[&&NHX:B=40])", + new NHXParser() )[ 0 ]; + t3.reRoot( t3.getNode( "B" ) ); + if ( t3.getNode( "B" ).getBranchData().getConfidence( 0 ).getValue() != 20 ) { + return false; + } + if ( t3.getNode( "A" ).getParent().getBranchData().getConfidence( 0 ).getValue() != 20 ) { + return false; + } + if ( t3.getNode( "A" ).getParent().getNumberOfDescendants() != 3 ) { + return false; + } + t3.reRoot( t3.getNode( "B" ) ); + if ( t3.getNode( "B" ).getBranchData().getConfidence( 0 ).getValue() != 20 ) { + return false; + } + if ( t3.getNode( "A" ).getParent().getBranchData().getConfidence( 0 ).getValue() 
!= 20 ) { + return false; + } + if ( t3.getNode( "A" ).getParent().getNumberOfDescendants() != 3 ) { + return false; + } + t3.reRoot( t3.getRoot() ); + if ( t3.getNode( "B" ).getBranchData().getConfidence( 0 ).getValue() != 20 ) { + return false; + } + if ( t3.getNode( "A" ).getParent().getBranchData().getConfidence( 0 ).getValue() != 20 ) { + return false; + } + if ( t3.getNode( "A" ).getParent().getNumberOfDescendants() != 3 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testSDIse() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny species1 = factory.create( "[&&NHX:S=yeast]", new NHXParser() )[ 0 ]; + final Phylogeny gene1 = factory.create( "(A1[&&NHX:S=yeast],A2[&&NHX:S=yeast])", new NHXParser() )[ 0 ]; + gene1.setRooted( true ); + species1.setRooted( true ); + final SDI sdi = new SDIse( gene1, species1 ); + if ( !gene1.getRoot().isDuplication() ) { + return false; + } + final Phylogeny species2 = factory + .create( "(((([&&NHX:S=A],[&&NHX:S=B]),[&&NHX:S=C]),[&&NHX:S=D]),([&&NHX:S=E],[&&NHX:S=F]))", + new NHXParser() )[ 0 ]; + final Phylogeny gene2 = factory + .create( "(((([&&NHX:S=A],[&&NHX:S=B])ab,[&&NHX:S=C])abc,[&&NHX:S=D])abcd,([&&NHX:S=E],[&&NHX:S=F])ef)r", + new NHXParser() )[ 0 ]; + species2.setRooted( true ); + gene2.setRooted( true ); + final SDI sdi2 = new SDIse( gene2, species2 ); + if ( sdi2.getDuplicationsSum() != 0 ) { + return false; + } + if ( !gene2.getNode( "ab" ).isSpeciation() ) { + return false; + } + if ( !gene2.getNode( "ab" ).isHasAssignedEvent() ) { + return false; + } + if ( !gene2.getNode( "abc" ).isSpeciation() ) { + return false; + } + if ( !gene2.getNode( "abc" ).isHasAssignedEvent() ) { + return false; + } + if ( !gene2.getNode( "r" ).isSpeciation() ) { + return false; + } + if ( !gene2.getNode( "r" ).isHasAssignedEvent() ) { + return false; + } + final Phylogeny species3 = factory + .create( "(((([&&NHX:S=A],[&&NHX:S=B]),[&&NHX:S=C]),[&&NHX:S=D]),([&&NHX:S=E],[&&NHX:S=F]))", + new NHXParser() )[ 0 ]; + final Phylogeny gene3 = factory + .create( "(((([&&NHX:S=A],[&&NHX:S=A])aa,[&&NHX:S=C])abc,[&&NHX:S=D])abcd,([&&NHX:S=E],[&&NHX:S=F])ef)r", + new NHXParser() )[ 0 ]; + species3.setRooted( true ); + gene3.setRooted( true ); + final SDI sdi3 = new SDIse( gene3, species3 ); + if ( sdi3.getDuplicationsSum() != 1 ) { + return false; + } + if ( !gene3.getNode( "aa" ).isDuplication() ) { + return false; + } + if ( !gene3.getNode( "aa" ).isHasAssignedEvent() ) { + return false; + } + final Phylogeny species4 = factory + .create( "(((([&&NHX:S=A],[&&NHX:S=B]),[&&NHX:S=C]),[&&NHX:S=D]),([&&NHX:S=E],[&&NHX:S=F]))", + new NHXParser() )[ 0 ]; + final Phylogeny gene4 = factory + .create( "(((([&&NHX:S=A],[&&NHX:S=C])ac,[&&NHX:S=B])abc,[&&NHX:S=D])abcd,([&&NHX:S=E],[&&NHX:S=F])ef)r", + new NHXParser() )[ 0 ]; + species4.setRooted( true ); + gene4.setRooted( true ); + final SDI sdi4 = new SDIse( gene4, species4 ); + if ( sdi4.getDuplicationsSum() != 1 ) { + return false; + } + if ( !gene4.getNode( "ac" ).isSpeciation() ) { + return false; + } + if ( !gene4.getNode( "abc" ).isDuplication() ) { + return false; + } + if ( gene4.getNode( "abcd" ).isDuplication() ) { + return false; + } + if ( species4.getNumberOfExternalNodes() != 6 ) { + return false; + } + if ( gene4.getNumberOfExternalNodes() != 6 ) { + return false; + } + final Phylogeny species5 = factory + .create( 
"(((([&&NHX:S=A],[&&NHX:S=B]),[&&NHX:S=C]),[&&NHX:S=D]),([&&NHX:S=E],[&&NHX:S=F]))", + new NHXParser() )[ 0 ]; + final Phylogeny gene5 = factory + .create( "(((([&&NHX:S=A],[&&NHX:S=D])ad,[&&NHX:S=C])adc,[&&NHX:S=B])abcd,([&&NHX:S=E],[&&NHX:S=F])ef)r", + new NHXParser() )[ 0 ]; + species5.setRooted( true ); + gene5.setRooted( true ); + final SDI sdi5 = new SDIse( gene5, species5 ); + if ( sdi5.getDuplicationsSum() != 2 ) { + return false; + } + if ( !gene5.getNode( "ad" ).isSpeciation() ) { + return false; + } + if ( !gene5.getNode( "adc" ).isDuplication() ) { + return false; + } + if ( !gene5.getNode( "abcd" ).isDuplication() ) { + return false; + } + if ( species5.getNumberOfExternalNodes() != 6 ) { + return false; + } + if ( gene5.getNumberOfExternalNodes() != 6 ) { + return false; + } + // Trees from Louxin Zhang 1997 "On a Mirkin-Muchnik-Smith + // Conjecture for Comparing Molecular Phylogenies" + // J. of Comput Bio. Vol. 4, No 2, pp.177-187 + final Phylogeny species6 = factory + .create( "(((1:[&&NHX:S=1],5:[&&NHX:S=5])1-5,((4:[&&NHX:S=4],6:[&&NHX:S=6])4-6,2:[&&NHX:S=2])4-6-2)1-5-4-6-2," + + "((9:[&&NHX:S=9],3:[&&NHX:S=3])9-3,(8:[&&NHX:S=8],7:[&&NHX:S=7])8-7)9-3-8-7)", + new NHXParser() )[ 0 ]; + final Phylogeny gene6 = factory + .create( "(((1:0.1[&&NHX:S=1],2:0.1[&&NHX:S=2])1-2:0.1,3:0.1[&&NHX:S=3])1-2-3:0.1," + + "((4:0.1[&&NHX:S=4],(5:0.1[&&NHX:S=5],6:0.1[&&NHX:S=6])5-6:0.1)4-5-6:0.1," + + "(7:0.1[&&NHX:S=7],(8:0.1[&&NHX:S=8],9:0.1[&&NHX:S=9])8-9:0.1)7-8-9:0.1)4-5-6-7-8-9:0.1)r;", + new NHXParser() )[ 0 ]; + species6.setRooted( true ); + gene6.setRooted( true ); + final SDI sdi6 = new SDIse( gene6, species6 ); + if ( sdi6.getDuplicationsSum() != 3 ) { + return false; + } + if ( !gene6.getNode( "r" ).isDuplication() ) { + return false; + } + if ( !gene6.getNode( "4-5-6" ).isDuplication() ) { + return false; + } + if ( !gene6.getNode( "7-8-9" ).isDuplication() ) { + return false; + } + if ( !gene6.getNode( "1-2" ).isSpeciation() ) { + return false; + } + if ( !gene6.getNode( "1-2-3" ).isSpeciation() ) { + return false; + } + if ( !gene6.getNode( "5-6" ).isSpeciation() ) { + return false; + } + if ( !gene6.getNode( "8-9" ).isSpeciation() ) { + return false; + } + if ( !gene6.getNode( "4-5-6-7-8-9" ).isSpeciation() ) { + return false; + } + sdi6.computeMappingCostL(); + if ( sdi6.computeMappingCostL() != 17 ) { + return false; + } + if ( species6.getNumberOfExternalNodes() != 9 ) { + return false; + } + if ( gene6.getNumberOfExternalNodes() != 9 ) { + return false; + } + final Phylogeny species7 = Test.createPhylogeny( "(((((((" + "([&&NHX:S=a1],[&&NHX:S=a2])," + + "([&&NHX:S=b1],[&&NHX:S=b2])" + "),[&&NHX:S=x]),(" + "([&&NHX:S=m1],[&&NHX:S=m2])," + + "([&&NHX:S=n1],[&&NHX:S=n2])" + ")),(" + "([&&NHX:S=i1],[&&NHX:S=i2])," + + "([&&NHX:S=j1],[&&NHX:S=j2])" + ")),(" + "([&&NHX:S=e1],[&&NHX:S=e2])," + + "([&&NHX:S=f1],[&&NHX:S=f2])" + ")),[&&NHX:S=y]),[&&NHX:S=z])" ); + species7.setRooted( true ); + final Phylogeny gene7_1 = Test + .createPhylogeny( "((((((((a1[&&NHX:S=a1],a2[&&NHX:S=a2]),b1[&&NHX:S=b1]),x[&&NHX:S=x]),m1[&&NHX:S=m1]),i1[&&NHX:S=i1]),e1[&&NHX:S=e1]),y[&&NHX:S=y]),z[&&NHX:S=z])" ); + gene7_1.setRooted( true ); + final SDI sdi7 = new SDIse( gene7_1, species7 ); + if ( sdi7.getDuplicationsSum() != 0 ) { + return false; + } + if ( !Test.getEvent( gene7_1, "a1", "a2" ).isSpeciation() ) { + return false; + } + if ( !Test.getEvent( gene7_1, "a1", "b1" ).isSpeciation() ) { + return false; + } + if ( !Test.getEvent( gene7_1, "a1", "x" ).isSpeciation() ) { + return false; + } + 
if ( !Test.getEvent( gene7_1, "a1", "m1" ).isSpeciation() ) { + return false; + } + if ( !Test.getEvent( gene7_1, "a1", "i1" ).isSpeciation() ) { + return false; + } + if ( !Test.getEvent( gene7_1, "a1", "e1" ).isSpeciation() ) { + return false; + } + if ( !Test.getEvent( gene7_1, "a1", "y" ).isSpeciation() ) { + return false; + } + if ( !Test.getEvent( gene7_1, "a1", "z" ).isSpeciation() ) { + return false; + } + final Phylogeny gene7_2 = Test + .createPhylogeny( "(((((((((a1[&&NHX:S=a1],a2[&&NHX:S=a2]),b1[&&NHX:S=b1]),x[&&NHX:S=x]),m1[&&NHX:S=m1]),i1[&&NHX:S=i1]),j2[&&NHX:S=j2]),e1[&&NHX:S=e1]),y[&&NHX:S=y]),z[&&NHX:S=z])" ); + gene7_2.setRooted( true ); + final SDI sdi7_2 = new SDIse( gene7_2, species7 ); + if ( sdi7_2.getDuplicationsSum() != 1 ) { + return false; + } + if ( !Test.getEvent( gene7_2, "a1", "a2" ).isSpeciation() ) { + return false; + } + if ( !Test.getEvent( gene7_2, "a1", "b1" ).isSpeciation() ) { + return false; + } + if ( !Test.getEvent( gene7_2, "a1", "x" ).isSpeciation() ) { + return false; + } + if ( !Test.getEvent( gene7_2, "a1", "m1" ).isSpeciation() ) { + return false; + } + if ( !Test.getEvent( gene7_2, "a1", "i1" ).isSpeciation() ) { + return false; + } + if ( !Test.getEvent( gene7_2, "a1", "j2" ).isDuplication() ) { + return false; + } + if ( !Test.getEvent( gene7_2, "a1", "e1" ).isSpeciation() ) { + return false; + } + if ( !Test.getEvent( gene7_2, "a1", "y" ).isSpeciation() ) { + return false; + } + if ( !Test.getEvent( gene7_2, "a1", "z" ).isSpeciation() ) { + return false; + } + } + catch ( final Exception e ) { + return false; + } + return true; + } + + private static boolean testSDIunrooted() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny p0 = factory.create( "((((A,B)ab,(C1,C2)cc)abc,D)abcd,(E,F)ef)abcdef", new NHXParser() )[ 0 ]; + final List l = SDIR.getBranchesInPreorder( p0 ); + final Iterator iter = l.iterator(); + PhylogenyBranch br = iter.next(); + if ( !br.getFirstNode().getName().equals( "abcd" ) && !br.getFirstNode().getName().equals( "ef" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "abcd" ) && !br.getSecondNode().getName().equals( "ef" ) ) { + return false; + } + br = iter.next(); + if ( !br.getFirstNode().getName().equals( "abcd" ) && !br.getFirstNode().getName().equals( "abc" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "abcd" ) && !br.getSecondNode().getName().equals( "abc" ) ) { + return false; + } + br = iter.next(); + if ( !br.getFirstNode().getName().equals( "abc" ) && !br.getFirstNode().getName().equals( "ab" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "abc" ) && !br.getSecondNode().getName().equals( "ab" ) ) { + return false; + } + br = iter.next(); + if ( !br.getFirstNode().getName().equals( "ab" ) && !br.getFirstNode().getName().equals( "A" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "ab" ) && !br.getSecondNode().getName().equals( "A" ) ) { + return false; + } + br = iter.next(); + if ( !br.getFirstNode().getName().equals( "ab" ) && !br.getFirstNode().getName().equals( "B" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "ab" ) && !br.getSecondNode().getName().equals( "B" ) ) { + return false; + } + br = iter.next(); + if ( !br.getFirstNode().getName().equals( "ab" ) && !br.getFirstNode().getName().equals( "abc" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "ab" ) && !br.getSecondNode().getName().equals( "abc" ) ) { + 
return false; + } + br = iter.next(); + if ( !br.getFirstNode().getName().equals( "abc" ) && !br.getFirstNode().getName().equals( "cc" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "abc" ) && !br.getSecondNode().getName().equals( "cc" ) ) { + return false; + } + br = iter.next(); + if ( !br.getFirstNode().getName().equals( "C1" ) && !br.getFirstNode().getName().equals( "cc" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "C1" ) && !br.getSecondNode().getName().equals( "cc" ) ) { + return false; + } + br = iter.next(); + if ( !br.getFirstNode().getName().equals( "C2" ) && !br.getFirstNode().getName().equals( "cc" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "C2" ) && !br.getSecondNode().getName().equals( "cc" ) ) { + return false; + } + br = iter.next(); + if ( !br.getFirstNode().getName().equals( "abc" ) && !br.getFirstNode().getName().equals( "cc" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "abc" ) && !br.getSecondNode().getName().equals( "cc" ) ) { + return false; + } + br = iter.next(); + if ( !br.getFirstNode().getName().equals( "abc" ) && !br.getFirstNode().getName().equals( "abcd" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "abc" ) && !br.getSecondNode().getName().equals( "abcd" ) ) { + return false; + } + br = iter.next(); + if ( !br.getFirstNode().getName().equals( "abcd" ) && !br.getFirstNode().getName().equals( "D" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "abcd" ) && !br.getSecondNode().getName().equals( "D" ) ) { + return false; + } + br = iter.next(); + if ( !br.getFirstNode().getName().equals( "ef" ) && !br.getFirstNode().getName().equals( "abcd" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "ef" ) && !br.getSecondNode().getName().equals( "abcd" ) ) { + return false; + } + br = iter.next(); + if ( !br.getFirstNode().getName().equals( "ef" ) && !br.getFirstNode().getName().equals( "E" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "ef" ) && !br.getSecondNode().getName().equals( "E" ) ) { + return false; + } + br = iter.next(); + if ( !br.getFirstNode().getName().equals( "ef" ) && !br.getFirstNode().getName().equals( "F" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "ef" ) && !br.getSecondNode().getName().equals( "F" ) ) { + return false; + } + if ( iter.hasNext() ) { + return false; + } + final Phylogeny p1 = factory.create( "(C,(A,B)ab)abc", new NHXParser() )[ 0 ]; + final List l1 = SDIR.getBranchesInPreorder( p1 ); + final Iterator iter1 = l1.iterator(); + br = iter1.next(); + if ( !br.getFirstNode().getName().equals( "ab" ) && !br.getFirstNode().getName().equals( "C" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "ab" ) && !br.getSecondNode().getName().equals( "C" ) ) { + return false; + } + br = iter1.next(); + if ( !br.getFirstNode().getName().equals( "ab" ) && !br.getFirstNode().getName().equals( "A" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "ab" ) && !br.getSecondNode().getName().equals( "A" ) ) { + return false; + } + br = iter1.next(); + if ( !br.getFirstNode().getName().equals( "ab" ) && !br.getFirstNode().getName().equals( "B" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "ab" ) && !br.getSecondNode().getName().equals( "B" ) ) { + return false; + } + if ( iter1.hasNext() ) { + return false; + } + final Phylogeny p2 = factory.create( "((A,B)ab,C)abc", new 
NHXParser() )[ 0 ]; + final List l2 = SDIR.getBranchesInPreorder( p2 ); + final Iterator iter2 = l2.iterator(); + br = iter2.next(); + if ( !br.getFirstNode().getName().equals( "ab" ) && !br.getFirstNode().getName().equals( "C" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "ab" ) && !br.getSecondNode().getName().equals( "C" ) ) { + return false; + } + br = iter2.next(); + if ( !br.getFirstNode().getName().equals( "ab" ) && !br.getFirstNode().getName().equals( "A" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "ab" ) && !br.getSecondNode().getName().equals( "A" ) ) { + return false; + } + br = iter2.next(); + if ( !br.getFirstNode().getName().equals( "ab" ) && !br.getFirstNode().getName().equals( "B" ) ) { + return false; + } + if ( !br.getSecondNode().getName().equals( "ab" ) && !br.getSecondNode().getName().equals( "B" ) ) { + return false; + } + if ( iter2.hasNext() ) { + return false; + } + final Phylogeny species0 = factory + .create( "(((([&&NHX:S=A],[&&NHX:S=B]),[&&NHX:S=C]),[&&NHX:S=D]),([&&NHX:S=E],[&&NHX:S=F]))", + new NHXParser() )[ 0 ]; + final Phylogeny gene1 = factory + .create( "(((((A:0.6[&&NHX:S=A],B:0.1[&&NHX:S=B])ab:0.1,C:0.1[&&NHX:S=C])abc:0.3,D:1.0[&&NHX:S=D])abcd:0.2,E:0.1[&&NHX:S=E])abcde:0.2,F:0.2[&&NHX:S=F])", + new NHXParser() )[ 0 ]; + species0.setRooted( true ); + gene1.setRooted( true ); + final SDIR sdi_unrooted = new SDIR(); + sdi_unrooted.infer( gene1, species0, false, true, true, true, 10 ); + if ( sdi_unrooted.getCount() != 1 ) { + return false; + } + if ( sdi_unrooted.getMinimalDuplications() != 0 ) { + return false; + } + if ( !Test.isEqual( sdi_unrooted.getMinimalDiffInSubTreeHeights(), 0.4 ) ) { + return false; + } + if ( !Test.isEqual( sdi_unrooted.getMinimalTreeHeight(), 1.0 ) ) { + return false; + } + if ( sdi_unrooted.getMinimalMappingCost() != Integer.MAX_VALUE ) { + return false; + } + final Phylogeny gene2 = factory + .create( "(((((A:2.6[&&NHX:S=A],B:0.1[&&NHX:S=B])ab:0.1,C:0.1[&&NHX:S=C])abc:0.3,D:1.0[&&NHX:S=D])abcd:0.2,E:0.1[&&NHX:S=E])abcde:0.2,F:0.2[&&NHX:S=F])", + new NHXParser() )[ 0 ]; + gene2.setRooted( true ); + sdi_unrooted.infer( gene2, species0, false, false, true, true, 10 ); + if ( sdi_unrooted.getCount() != 1 ) { + return false; + } + if ( sdi_unrooted.getMinimalDuplications() != 3 ) { + return false; + } + if ( !Test.isEqual( sdi_unrooted.getMinimalDiffInSubTreeHeights(), 0.0 ) ) { + return false; + } + if ( !Test.isEqual( sdi_unrooted.getMinimalTreeHeight(), 2.0 ) ) { + return false; + } + if ( sdi_unrooted.getMinimalMappingCost() != Integer.MAX_VALUE ) { + return false; + } + final Phylogeny species6 = factory + .create( "(((1:[&&NHX:S=1],5:[&&NHX:S=5])1-5,((4:[&&NHX:S=4],6:[&&NHX:S=6])4-6,2:[&&NHX:S=2])4-6-2)1-5-4-6-2," + + "((9:[&&NHX:S=9],3:[&&NHX:S=3])9-3,(8:[&&NHX:S=8],7:[&&NHX:S=7])8-7)9-3-8-7)", + new NHXParser() )[ 0 ]; + final Phylogeny gene6 = factory + .create( "((5:0.1[&&NHX:S=5],6:0.1[&&NHX:S=6])5-6:0.05[&&NHX:S=6],(4:0.1[&&NHX:S=4]," + + "(((1:0.1[&&NHX:S=1],2:0.1[&&NHX:S=2])1-2:0.1[&&NHX:S=2],3:0.25[&&NHX:S=3])1-2-3:0.2[&&NHX:S=2]," + + "(7:0.1[&&NHX:S=7],(8:0.1[&&NHX:S=8]," + + "9:0.1[&&NHX:S=9])8-9:0.1[&&NHX:S=9])7-8-9:0.1[&&NHX:S=8])" + + "4-5-6-7-8-9:0.1[&&NHX:S=5])4-5-6:0.05[&&NHX:S=5])", + new NHXParser() )[ 0 ]; + species6.setRooted( true ); + gene6.setRooted( true ); + Phylogeny[] p6 = sdi_unrooted.infer( gene6, species6, false, true, true, true, 10 ); + if ( sdi_unrooted.getCount() != 1 ) { + return false; + } + if ( !Test.isEqual( 
sdi_unrooted.getMinimalDiffInSubTreeHeights(), 0.0 ) ) { + return false; + } + if ( !Test.isEqual( sdi_unrooted.getMinimalTreeHeight(), 0.375 ) ) { + return false; + } + if ( sdi_unrooted.getMinimalDuplications() != 3 ) { + return false; + } + if ( sdi_unrooted.getMinimalMappingCost() != Integer.MAX_VALUE ) { + return false; + } + if ( !p6[ 0 ].getRoot().isDuplication() ) { + return false; + } + if ( !p6[ 0 ].getNode( "4-5-6" ).isDuplication() ) { + return false; + } + if ( !p6[ 0 ].getNode( "7-8-9" ).isDuplication() ) { + return false; + } + if ( p6[ 0 ].getNode( "1-2" ).isDuplication() ) { + return false; + } + if ( p6[ 0 ].getNode( "1-2-3" ).isDuplication() ) { + return false; + } + if ( p6[ 0 ].getNode( "5-6" ).isDuplication() ) { + return false; + } + if ( p6[ 0 ].getNode( "8-9" ).isDuplication() ) { + return false; + } + if ( p6[ 0 ].getNode( "4-5-6-7-8-9" ).isDuplication() ) { + return false; + } + p6 = null; + final Phylogeny species7 = factory + .create( "(((1:[&&NHX:S=1],5:[&&NHX:S=5])1-5,((4:[&&NHX:S=4],6:[&&NHX:S=6])4-6,2:[&&NHX:S=2])4-6-2)1-5-4-6-2," + + "((9:[&&NHX:S=9],3:[&&NHX:S=3])9-3,(8:[&&NHX:S=8],7:[&&NHX:S=7])8-7)9-3-8-7)", + new NHXParser() )[ 0 ]; + final Phylogeny gene7 = factory + .create( "((5:0.1[&&NHX:S=5],6:0.1[&&NHX:S=6])5-6:0.05[&&NHX:S=6],(4:0.1[&&NHX:S=4]," + + "(((1:0.1[&&NHX:S=1],2:0.1[&&NHX:S=2])1-2:0.1[&&NHX:S=2],3:0.25[&&NHX:S=3])1-2-3:0.2[&&NHX:S=2]," + + "(7:0.1[&&NHX:S=7],(8:0.1[&&NHX:S=8]," + + "9:0.1[&&NHX:S=9])8-9:0.1[&&NHX:S=9])7-8-9:0.1[&&NHX:S=8])" + + "4-5-6-7-8-9:0.1[&&NHX:S=5])4-5-6:0.05[&&NHX:S=5])", + new NHXParser() )[ 0 ]; + species7.setRooted( true ); + gene7.setRooted( true ); + Phylogeny[] p7 = sdi_unrooted.infer( gene7, species7, true, true, true, true, 10 ); + if ( sdi_unrooted.getCount() != 1 ) { + return false; + } + if ( !Test.isEqual( sdi_unrooted.getMinimalDiffInSubTreeHeights(), 0.0 ) ) { + return false; + } + if ( !Test.isEqual( sdi_unrooted.getMinimalTreeHeight(), 0.375 ) ) { + return false; + } + if ( sdi_unrooted.getMinimalDuplications() != 3 ) { + return false; + } + if ( sdi_unrooted.getMinimalMappingCost() != 17 ) { + return false; + } + if ( !p7[ 0 ].getRoot().isDuplication() ) { + return false; + } + if ( !p7[ 0 ].getNode( "4-5-6" ).isDuplication() ) { + return false; + } + if ( !p7[ 0 ].getNode( "7-8-9" ).isDuplication() ) { + return false; + } + if ( p7[ 0 ].getNode( "1-2" ).isDuplication() ) { + return false; + } + if ( p7[ 0 ].getNode( "1-2-3" ).isDuplication() ) { + return false; + } + if ( p7[ 0 ].getNode( "5-6" ).isDuplication() ) { + return false; + } + if ( p7[ 0 ].getNode( "8-9" ).isDuplication() ) { + return false; + } + if ( p7[ 0 ].getNode( "4-5-6-7-8-9" ).isDuplication() ) { + return false; + } + p7 = null; + final Phylogeny species8 = factory + .create( "(((1:[&&NHX:S=1],5:[&&NHX:S=5])1-5,((4:[&&NHX:S=4],6:[&&NHX:S=6])4-6,2:[&&NHX:S=2])4-6-2)1-5-4-6-2," + + "((9:[&&NHX:S=9],3:[&&NHX:S=3])9-3,(8:[&&NHX:S=8],7:[&&NHX:S=7])8-7)9-3-8-7)", + new NHXParser() )[ 0 ]; + final Phylogeny gene8 = factory + .create( "((5:0.1[&&NHX:S=5],6:0.1[&&NHX:S=6])5-6:0.05[&&NHX:S=6],(4:0.1[&&NHX:S=4]," + + "(((1:0.1[&&NHX:S=1],2:0.1[&&NHX:S=2])1-2:0.1[&&NHX:S=2],3:0.25[&&NHX:S=3])1-2-3:0.2[&&NHX:S=2]," + + "(7:0.1[&&NHX:S=7],(8:0.1[&&NHX:S=8]," + + "9:0.1[&&NHX:S=9])8-9:0.1[&&NHX:S=9])7-8-9:0.1[&&NHX:S=8])" + + "4-5-6-7-8-9:0.1[&&NHX:S=5])4-5-6:0.05[&&NHX:S=5])", + new NHXParser() )[ 0 ]; + species8.setRooted( true ); + gene8.setRooted( true ); + Phylogeny[] p8 = sdi_unrooted.infer( gene8, species8, false, false, true, true, 
10 ); + if ( sdi_unrooted.getCount() != 1 ) { + return false; + } + if ( !Test.isEqual( sdi_unrooted.getMinimalDiffInSubTreeHeights(), 0.0 ) ) { + return false; + } + if ( !Test.isEqual( sdi_unrooted.getMinimalTreeHeight(), 0.375 ) ) { + return false; + } + if ( sdi_unrooted.getMinimalDuplications() != 3 ) { + return false; + } + if ( sdi_unrooted.getMinimalMappingCost() != Integer.MAX_VALUE ) { + return false; + } + if ( !p8[ 0 ].getRoot().isDuplication() ) { + return false; + } + if ( !p8[ 0 ].getNode( "4-5-6" ).isDuplication() ) { + return false; + } + if ( !p8[ 0 ].getNode( "7-8-9" ).isDuplication() ) { + return false; + } + if ( p8[ 0 ].getNode( "1-2" ).isDuplication() ) { + return false; + } + if ( p8[ 0 ].getNode( "1-2-3" ).isDuplication() ) { + return false; + } + if ( p8[ 0 ].getNode( "5-6" ).isDuplication() ) { + return false; + } + if ( p8[ 0 ].getNode( "8-9" ).isDuplication() ) { + return false; + } + if ( p8[ 0 ].getNode( "4-5-6-7-8-9" ).isDuplication() ) { + return false; + } + p8 = null; + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testSplit() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny p0 = factory.create( "(((A,B,C),D),(E,(F,G)))R", new NHXParser() )[ 0 ]; + //Archaeopteryx.createApplication( p0 ); + final Set ex = new HashSet(); + ex.add( new PhylogenyNode( "A" ) ); + ex.add( new PhylogenyNode( "B" ) ); + ex.add( new PhylogenyNode( "C" ) ); + ex.add( new PhylogenyNode( "D" ) ); + ex.add( new PhylogenyNode( "E" ) ); + ex.add( new PhylogenyNode( "F" ) ); + ex.add( new PhylogenyNode( "G" ) ); + ex.add( new PhylogenyNode( "X" ) ); + ex.add( new PhylogenyNode( "Y" ) ); + final TreeSplitMatrix s0 = new TreeSplitMatrix( p0, false, ex ); + // System.out.println( s0.toString() ); + // + Set query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + query_nodes.add( new PhylogenyNode( "C" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "F" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + if ( !s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + query_nodes.add( new PhylogenyNode( "C" ) ); + if ( !s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "D" ) ); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "F" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + if ( !s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + query_nodes.add( new PhylogenyNode( "C" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + if ( !s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "F" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + if ( !s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + 
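+            // (F,G) is a clade in (((A,B,C),D),(E,(F,G)))R, so this query set is expected to match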
query_nodes.add( new PhylogenyNode( "F" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + if ( !s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + query_nodes.add( new PhylogenyNode( "C" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + query_nodes.add( new PhylogenyNode( "A" ) ); + if ( !s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "F" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + query_nodes.add( new PhylogenyNode( "E" ) ); + if ( !s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "F" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + if ( !s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "F" ) ); + query_nodes.add( new PhylogenyNode( "A" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + query_nodes.add( new PhylogenyNode( "C" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "F" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + query_nodes.add( new PhylogenyNode( "C" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "C" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "E" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "F" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "F" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + if ( s0.match( 
query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + query_nodes.add( new PhylogenyNode( "A" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + ///////// + // query_nodes = new HashSet(); + // query_nodes.add( new PhylogenyNode( "X" ) ); + // query_nodes.add( new PhylogenyNode( "Y" ) ); + // query_nodes.add( new PhylogenyNode( "A" ) ); + // query_nodes.add( new PhylogenyNode( "B" ) ); + // query_nodes.add( new PhylogenyNode( "C" ) ); + // query_nodes.add( new PhylogenyNode( "D" ) ); + // query_nodes.add( new PhylogenyNode( "E" ) ); + // query_nodes.add( new PhylogenyNode( "F" ) ); + // query_nodes.add( new PhylogenyNode( "G" ) ); + // if ( !s0.match( query_nodes ) ) { + // return false; + // } + // query_nodes = new HashSet(); + // query_nodes.add( new PhylogenyNode( "X" ) ); + // query_nodes.add( new PhylogenyNode( "Y" ) ); + // query_nodes.add( new PhylogenyNode( "A" ) ); + // query_nodes.add( new PhylogenyNode( "B" ) ); + // query_nodes.add( new PhylogenyNode( "C" ) ); + // if ( !s0.match( query_nodes ) ) { + // return false; + // } + // // + // query_nodes = new HashSet(); + // query_nodes.add( new PhylogenyNode( "X" ) ); + // query_nodes.add( new PhylogenyNode( "Y" ) ); + // query_nodes.add( new PhylogenyNode( "D" ) ); + // query_nodes.add( new PhylogenyNode( "E" ) ); + // query_nodes.add( new PhylogenyNode( "F" ) ); + // query_nodes.add( new PhylogenyNode( "G" ) ); + // if ( !s0.match( query_nodes ) ) { + // return false; + // } + // // + // query_nodes = new HashSet(); + // query_nodes.add( new PhylogenyNode( "X" ) ); + // query_nodes.add( new PhylogenyNode( "Y" ) ); + // query_nodes.add( new PhylogenyNode( "A" ) ); + // query_nodes.add( new PhylogenyNode( "B" ) ); + // query_nodes.add( new PhylogenyNode( "C" ) ); + // query_nodes.add( new PhylogenyNode( "D" ) ); + // if ( !s0.match( query_nodes ) ) { + // return false; + // } + // // + // query_nodes = new HashSet(); + // query_nodes.add( new PhylogenyNode( "X" ) ); + // query_nodes.add( new PhylogenyNode( "Y" ) ); + // query_nodes.add( new PhylogenyNode( "E" ) ); + // query_nodes.add( new PhylogenyNode( "F" ) ); + // query_nodes.add( new PhylogenyNode( "G" ) ); + // if ( !s0.match( query_nodes ) ) { + // return false; + // } + // // + // query_nodes = new HashSet(); + // query_nodes.add( new PhylogenyNode( "X" ) ); + // query_nodes.add( new PhylogenyNode( "Y" ) ); + // query_nodes.add( new PhylogenyNode( "F" ) ); + // query_nodes.add( new PhylogenyNode( "G" ) ); + // if ( !s0.match( query_nodes ) ) { + // return false; + // } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "X" ) ); + query_nodes.add( new PhylogenyNode( "Y" ) ); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "X" ) ); + query_nodes.add( new PhylogenyNode( "Y" ) ); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + /////////////////////////// + // + query_nodes = new 
HashSet(); + query_nodes.add( new PhylogenyNode( "X" ) ); + query_nodes.add( new PhylogenyNode( "Y" ) ); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "X" ) ); + query_nodes.add( new PhylogenyNode( "Y" ) ); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "X" ) ); + query_nodes.add( new PhylogenyNode( "Y" ) ); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "C" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "X" ) ); + query_nodes.add( new PhylogenyNode( "Y" ) ); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "E" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "X" ) ); + query_nodes.add( new PhylogenyNode( "Y" ) ); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "F" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "Y" ) ); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "X" ) ); + query_nodes.add( new PhylogenyNode( "Y" ) ); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "F" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "X" ) ); + query_nodes.add( new PhylogenyNode( "Y" ) ); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "X" ) ); + query_nodes.add( new PhylogenyNode( "Y" ) ); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + query_nodes.add( new PhylogenyNode( "A" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "X" ) ); + query_nodes.add( new PhylogenyNode( "Y" ) ); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace(); + return false; + } + return true; + } + + private static boolean testSplitStrict() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny p0 = factory.create( "(((A,B,C),D),(E,(F,G)))R", new NHXParser() )[ 0 ]; + final Set ex = new HashSet(); + ex.add( new PhylogenyNode( "A" ) ); + ex.add( new PhylogenyNode( "B" ) ); + ex.add( new PhylogenyNode( "C" ) ); + ex.add( new PhylogenyNode( "D" ) ); + ex.add( new PhylogenyNode( "E" ) ); + ex.add( new PhylogenyNode( "F" ) ); + ex.add( new 
PhylogenyNode( "G" ) ); + final TreeSplitMatrix s0 = new TreeSplitMatrix( p0, true, ex ); + Set query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + query_nodes.add( new PhylogenyNode( "C" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "F" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + if ( !s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + query_nodes.add( new PhylogenyNode( "C" ) ); + if ( !s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "D" ) ); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "F" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + if ( !s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + query_nodes.add( new PhylogenyNode( "C" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + if ( !s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "F" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + if ( !s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "F" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + if ( !s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + query_nodes.add( new PhylogenyNode( "C" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + query_nodes.add( new PhylogenyNode( "A" ) ); + if ( !s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "F" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + query_nodes.add( new PhylogenyNode( "E" ) ); + if ( !s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "F" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + if ( !s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "F" ) ); + query_nodes.add( new PhylogenyNode( "A" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + query_nodes.add( new PhylogenyNode( "C" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "F" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + query_nodes.add( new PhylogenyNode( "C" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = 
new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "C" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "E" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "F" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "F" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "B" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + query_nodes.add( new PhylogenyNode( "A" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + // + query_nodes = new HashSet(); + query_nodes.add( new PhylogenyNode( "E" ) ); + query_nodes.add( new PhylogenyNode( "D" ) ); + query_nodes.add( new PhylogenyNode( "A" ) ); + query_nodes.add( new PhylogenyNode( "G" ) ); + if ( s0.match( query_nodes ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace(); + return false; + } + return true; + } + + private static boolean testSubtreeDeletion() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny t1 = factory.create( "((A,B,C)abc,(D,E,F)def)r", new NHXParser() )[ 0 ]; + t1.deleteSubtree( t1.getNode( "A" ), false ); + if ( t1.getNumberOfExternalNodes() != 5 ) { + return false; + } + t1.toNewHampshireX(); + t1.deleteSubtree( t1.getNode( "E" ), false ); + if ( t1.getNumberOfExternalNodes() != 4 ) { + return false; + } + t1.toNewHampshireX(); + t1.deleteSubtree( t1.getNode( "F" ), false ); + if ( t1.getNumberOfExternalNodes() != 3 ) { + return false; + } + t1.toNewHampshireX(); + t1.deleteSubtree( t1.getNode( "D" ), false ); + t1.toNewHampshireX(); + if ( t1.getNumberOfExternalNodes() != 3 ) { + return false; + } + t1.deleteSubtree( t1.getNode( "def" ), false ); + t1.toNewHampshireX(); + if ( t1.getNumberOfExternalNodes() != 2 ) { + return false; + } + t1.deleteSubtree( t1.getNode( "B" ), false ); + t1.toNewHampshireX(); + if ( t1.getNumberOfExternalNodes() != 1 ) { + return false; + } + t1.deleteSubtree( t1.getNode( "C" ), false ); + t1.toNewHampshireX(); + if ( 
t1.getNumberOfExternalNodes() != 1 ) { + return false; + } + t1.deleteSubtree( t1.getNode( "abc" ), false ); + t1.toNewHampshireX(); + if ( t1.getNumberOfExternalNodes() != 1 ) { + return false; + } + t1.deleteSubtree( t1.getNode( "r" ), false ); + if ( t1.getNumberOfExternalNodes() != 0 ) { + return false; + } + if ( !t1.isEmpty() ) { + return false; + } + final Phylogeny t2 = factory.create( "(((1,2,3)A,B,C)abc,(D,E,F)def)r", new NHXParser() )[ 0 ]; + t2.deleteSubtree( t2.getNode( "A" ), false ); + t2.toNewHampshireX(); + if ( t2.getNumberOfExternalNodes() != 5 ) { + return false; + } + t2.deleteSubtree( t2.getNode( "abc" ), false ); + t2.toNewHampshireX(); + if ( t2.getNumberOfExternalNodes() != 3 ) { + return false; + } + t2.deleteSubtree( t2.getNode( "def" ), false ); + t2.toNewHampshireX(); + if ( t2.getNumberOfExternalNodes() != 1 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testSupportCount() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny t0_1 = factory.create( "(((A,B),C),(D,E))", new NHXParser() )[ 0 ]; + final Phylogeny[] phylogenies_1 = factory.create( "(((A,B),C),(D,E)) " + "(((C,B),A),(D,E))" + + "(((A,B),C),(D,E)) " + "(((A,B),C),(D,E))" + + "(((A,B),C),(D,E))" + "(((C,B),A),(D,E))" + + "(((E,B),D),(C,A))" + "(((C,B),A),(D,E))" + + "(((A,B),C),(D,E))" + "(((A,B),C),(D,E))", + new NHXParser() ); + SupportCount.count( t0_1, phylogenies_1, true, false ); + final Phylogeny t0_2 = factory.create( "(((((A,B),C),D),E),(F,G))", new NHXParser() )[ 0 ]; + final Phylogeny[] phylogenies_2 = factory.create( "(((((A,B),C),D),E),(F,G))" + + "(((((A,B),C),D),E),((F,G),X))" + + "(((((A,Y),B),C),D),((F,G),E))" + + "(((((A,B),C),D),E),(F,G))" + + "(((((A,B),C),D),E),(F,G))" + + "(((((A,B),C),D),E),(F,G))" + + "(((((A,B),C),D),E),(F,G),Z)" + + "(((((A,B),C),D),E),(F,G))" + + "((((((A,B),C),D),E),F),G)" + + "(((((X,Y),F,G),E),((A,B),C)),D)", + new NHXParser() ); + SupportCount.count( t0_2, phylogenies_2, true, false ); + final PhylogenyNodeIterator it = t0_2.iteratorPostorder(); + while ( it.hasNext() ) { + final PhylogenyNode n = it.next(); + if ( !n.isExternal() && ( PhylogenyMethods.getConfidenceValue( n ) != 10 ) ) { + return false; + } + } + final Phylogeny t0_3 = factory.create( "(((A,B)ab,C)abc,((D,E)de,F)def)", new NHXParser() )[ 0 ]; + final Phylogeny[] phylogenies_3 = factory.create( "(((A,B),C),((D,E),F))" + "(((A,C),B),((D,F),E))" + + "(((C,A),B),((F,D),E))" + "(((A,B),F),((D,E),C))" + "(((((A,B),C),D),E),F)", new NHXParser() ); + SupportCount.count( t0_3, phylogenies_3, true, false ); + t0_3.reRoot( t0_3.getNode( "def" ).getId() ); + if ( PhylogenyMethods.getConfidenceValue( t0_3.getNode( "ab" ) ) != 3 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( t0_3.getNode( "abc" ) ) != 4 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( t0_3.getNode( "def" ) ) != 4 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( t0_3.getNode( "de" ) ) != 2 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( t0_3.getNode( "A" ) ) != 5 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( t0_3.getNode( "B" ) ) != 5 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( t0_3.getNode( "C" ) ) != 5 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( t0_3.getNode( "D" ) ) != 5 ) { + return false; + } + if ( 
PhylogenyMethods.getConfidenceValue( t0_3.getNode( "E" ) ) != 5 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( t0_3.getNode( "F" ) ) != 5 ) { + return false; + } + final Phylogeny t0_4 = factory.create( "(((((A,B)1,C)2,D)3,E)4,F)", new NHXParser() )[ 0 ]; + final Phylogeny[] phylogenies_4 = factory.create( "((((((A,X),C),B),D),E),F) " + + "(((A,B,Z),C,Q),(((D,Y),E),F))", new NHXParser() ); + SupportCount.count( t0_4, phylogenies_4, true, false ); + t0_4.reRoot( t0_4.getNode( "F" ).getId() ); + if ( PhylogenyMethods.getConfidenceValue( t0_4.getNode( "1" ) ) != 1 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( t0_4.getNode( "2" ) ) != 2 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( t0_4.getNode( "3" ) ) != 1 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( t0_4.getNode( "4" ) ) != 2 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( t0_4.getNode( "A" ) ) != 2 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( t0_4.getNode( "B" ) ) != 2 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( t0_4.getNode( "C" ) ) != 2 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( t0_4.getNode( "D" ) ) != 2 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( t0_4.getNode( "E" ) ) != 2 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( t0_4.getNode( "F" ) ) != 2 ) { + return false; + } + Phylogeny a = factory.create( "(((((A,B)1,C)2,D)3,E)4,F)", new NHXParser() )[ 0 ]; + final Phylogeny b1 = factory.create( "(((((B,A)1,C)2,D)3,E)4,F)", new NHXParser() )[ 0 ]; + double d = SupportCount.compare( b1, a, true, true, true ); + if ( !Test.isEqual( d, 5.0 / 5.0 ) ) { + return false; + } + a = factory.create( "(((((A,B)1,C)2,D)3,E)4,F)", new NHXParser() )[ 0 ]; + final Phylogeny b2 = factory.create( "(((((C,B)1,A)2,D)3,E)4,F)", new NHXParser() )[ 0 ]; + d = SupportCount.compare( b2, a, true, true, true ); + if ( !Test.isEqual( d, 4.0 / 5.0 ) ) { + return false; + } + a = factory.create( "(((((A,B)1,C)2,D)3,E)4,F)", new NHXParser() )[ 0 ]; + final Phylogeny b3 = factory.create( "(((((F,C)1,A)2,B)3,D)4,E)", new NHXParser() )[ 0 ]; + d = SupportCount.compare( b3, a, true, true, true ); + if ( !Test.isEqual( d, 2.0 / 5.0 ) ) { + return false; + } + a = factory.create( "(((((A,B)1,C)2,D)3,E)4,F)r", new NHXParser() )[ 0 ]; + final Phylogeny b4 = factory.create( "(((((F,C)1,A)2,B)3,D)4,E)r", new NHXParser() )[ 0 ]; + d = SupportCount.compare( b4, a, true, true, false ); + if ( !Test.isEqual( d, 1.0 / 5.0 ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testSupportTransfer() { + try { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final Phylogeny p1 = factory.create( "(((A,B)ab:97,C)abc:57,((D,E)de:10,(F,G)fg:50,(H,I)hi:64)defghi)", + new NHXParser() )[ 0 ]; + final Phylogeny p2 = factory + .create( "(((A:0.1,B:0.3)ab:0.4,C)abc:0.5,((D,E)de,(F,G)fg,(H,I)hi:0.59)defghi)", new NHXParser() )[ 0 ]; + if ( PhylogenyMethods.getConfidenceValue( p2.getNode( "ab" ) ) >= 0.0 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( p2.getNode( "abc" ) ) >= 0.0 ) { + return false; + } + support_transfer.moveBranchLengthsToBootstrap( p1 ); + support_transfer.transferSupportValues( p1, p2 ); + if ( p2.getNode( "ab" ).getDistanceToParent() != 0.4 ) { + return false; + } + if ( p2.getNode( "abc" 
).getDistanceToParent() != 0.5 ) { + return false; + } + if ( p2.getNode( "hi" ).getDistanceToParent() != 0.59 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( p2.getNode( "ab" ) ) != 97 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( p2.getNode( "abc" ) ) != 57 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( p2.getNode( "de" ) ) != 10 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( p2.getNode( "fg" ) ) != 50 ) { + return false; + } + if ( PhylogenyMethods.getConfidenceValue( p2.getNode( "hi" ) ) != 64 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testTaxonomyAssigner() { + try { + String s0_str = "(((([&&NHX:S=A],[&&NHX:S=B])[&&NHX:S=AB],[&&NHX:S=C])[&&NHX:S=ABC],[&&NHX:S=D])[&&NHX:S=ABCD],[&&NHX:S=E])[&&NHX:S=ABCDE]"; + String g0_str = "((([&&NHX:S=A],[&&NHX:S=A],[&&NHX:S=A])a,[&&NHX:S=B])b,[&&NHX:S=C])c"; + Phylogeny s0 = ParserBasedPhylogenyFactory.getInstance().create( s0_str, new NHXParser() )[ 0 ]; + Phylogeny g0 = ParserBasedPhylogenyFactory.getInstance().create( g0_str, new NHXParser() )[ 0 ]; + s0.setRooted( true ); + g0.setRooted( true ); + TaxonomyAssigner.execute( g0, s0 ); + if ( !g0.getNode( "a" ).getNodeData().getTaxonomy().getScientificName().equals( "A" ) ) { + return false; + } + if ( !g0.getNode( "b" ).getNodeData().getTaxonomy().getScientificName().equals( "AB" ) ) { + return false; + } + if ( !g0.getNode( "c" ).getNodeData().getTaxonomy().getScientificName().equals( "ABC" ) ) { + return false; + } + g0_str = "((([&&NHX:S=A],[&&NHX:S=A],[&&NHX:S=A])a,[&&NHX:S=A])b,[&&NHX:S=A])c"; + g0 = ParserBasedPhylogenyFactory.getInstance().create( g0_str, new NHXParser() )[ 0 ]; + g0.setRooted( true ); + TaxonomyAssigner.execute( g0, s0 ); + if ( !g0.getNode( "a" ).getNodeData().getTaxonomy().getScientificName().equals( "A" ) ) { + return false; + } + if ( !g0.getNode( "b" ).getNodeData().getTaxonomy().getScientificName().equals( "A" ) ) { + return false; + } + if ( !g0.getNode( "c" ).getNodeData().getTaxonomy().getScientificName().equals( "A" ) ) { + return false; + } + g0_str = "((([&&NHX:S=A],[&&NHX:S=A],[&&NHX:S=B])a,[&&NHX:S=A])b,[&&NHX:S=A])c"; + g0 = ParserBasedPhylogenyFactory.getInstance().create( g0_str, new NHXParser() )[ 0 ]; + g0.setRooted( true ); + TaxonomyAssigner.execute( g0, s0 ); + if ( !g0.getNode( "a" ).getNodeData().getTaxonomy().getScientificName().equals( "AB" ) ) { + return false; + } + if ( !g0.getNode( "b" ).getNodeData().getTaxonomy().getScientificName().equals( "AB" ) ) { + return false; + } + if ( !g0.getNode( "c" ).getNodeData().getTaxonomy().getScientificName().equals( "AB" ) ) { + return false; + } + g0_str = "((([&&NHX:S=A],[&&NHX:S=A],[&&NHX:S=B])a,[&&NHX:S=C])b,[&&NHX:S=A])c"; + g0 = ParserBasedPhylogenyFactory.getInstance().create( g0_str, new NHXParser() )[ 0 ]; + g0.setRooted( true ); + TaxonomyAssigner.execute( g0, s0 ); + if ( !g0.getNode( "a" ).getNodeData().getTaxonomy().getScientificName().equals( "AB" ) ) { + return false; + } + if ( !g0.getNode( "b" ).getNodeData().getTaxonomy().getScientificName().equals( "ABC" ) ) { + return false; + } + if ( !g0.getNode( "c" ).getNodeData().getTaxonomy().getScientificName().equals( "ABC" ) ) { + return false; + } + g0_str = "((([&&NHX:S=A],[&&NHX:S=A],[&&NHX:S=B])a,[&&NHX:S=C])b,[&&NHX:S=D])c"; + g0 = ParserBasedPhylogenyFactory.getInstance().create( g0_str, new NHXParser() )[ 0 ]; + g0.setRooted( true ); + 
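+            // gene tree node 'a' spans species {A,B}, 'b' adds C, 'c' adds D: expected assignments are AB, ABC, ABCD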
TaxonomyAssigner.execute( g0, s0 ); + if ( !g0.getNode( "a" ).getNodeData().getTaxonomy().getScientificName().equals( "AB" ) ) { + return false; + } + if ( !g0.getNode( "b" ).getNodeData().getTaxonomy().getScientificName().equals( "ABC" ) ) { + return false; + } + if ( !g0.getNode( "c" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCD" ) ) { + return false; + } + g0_str = "((([&&NHX:S=A],[&&NHX:S=A],[&&NHX:S=E])a,[&&NHX:S=C])b,[&&NHX:S=D])c"; + g0 = ParserBasedPhylogenyFactory.getInstance().create( g0_str, new NHXParser() )[ 0 ]; + g0.setRooted( true ); + TaxonomyAssigner.execute( g0, s0 ); + if ( !g0.getNode( "a" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCDE" ) ) { + return false; + } + if ( !g0.getNode( "b" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCDE" ) ) { + return false; + } + if ( !g0.getNode( "c" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCDE" ) ) { + return false; + } + g0_str = "((([&&NHX:S=A],[&&NHX:S=A],[&&NHX:S=E])a,[&&NHX:S=A])b,[&&NHX:S=A])c"; + g0 = ParserBasedPhylogenyFactory.getInstance().create( g0_str, new NHXParser() )[ 0 ]; + g0.setRooted( true ); + TaxonomyAssigner.execute( g0, s0 ); + if ( !g0.getNode( "a" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCDE" ) ) { + return false; + } + if ( !g0.getNode( "b" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCDE" ) ) { + return false; + } + if ( !g0.getNode( "c" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCDE" ) ) { + return false; + } + s0_str = "(([&&NHX:S=A],[&&NHX:S=B],[&&NHX:S=C],[&&NHX:S=D])[&&NHX:S=ABCD]," + + "([&&NHX:S=E],[&&NHX:S=F],[&&NHX:S=G],[&&NHX:S=H])[&&NHX:S=EFGH]," + + "([&&NHX:S=I],[&&NHX:S=J],[&&NHX:S=K],[&&NHX:S=L])[&&NHX:S=IJKL], " + + "([&&NHX:S=M],[&&NHX:S=N],[&&NHX:S=O],[&&NHX:S=P])[&&NHX:S=MNOP])[&&NHX:S=ROOT]"; + s0 = ParserBasedPhylogenyFactory.getInstance().create( s0_str, new NHXParser() )[ 0 ]; + s0.setRooted( true ); + g0_str = "(([&&NHX:S=A],[&&NHX:S=B],[&&NHX:S=C],[&&NHX:S=D])a," + + "([&&NHX:S=E],[&&NHX:S=F],[&&NHX:S=G],[&&NHX:S=H])b," + + "([&&NHX:S=I],[&&NHX:S=J],[&&NHX:S=K],[&&NHX:S=L])c, " + + "([&&NHX:S=M],[&&NHX:S=N],[&&NHX:S=O],[&&NHX:S=P])d)r"; + g0 = ParserBasedPhylogenyFactory.getInstance().create( g0_str, new NHXParser() )[ 0 ]; + g0.setRooted( true ); + TaxonomyAssigner.execute( g0, s0 ); + if ( !g0.getNode( "a" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCD" ) ) { + return false; + } + if ( !g0.getNode( "b" ).getNodeData().getTaxonomy().getScientificName().equals( "EFGH" ) ) { + return false; + } + if ( !g0.getNode( "c" ).getNodeData().getTaxonomy().getScientificName().equals( "IJKL" ) ) { + return false; + } + if ( !g0.getNode( "d" ).getNodeData().getTaxonomy().getScientificName().equals( "MNOP" ) ) { + return false; + } + if ( !g0.getNode( "r" ).getNodeData().getTaxonomy().getScientificName().equals( "ROOT" ) ) { + return false; + } + g0_str = "(([&&NHX:S=A],[&&NHX:S=B],[&&NHX:S=A],[&&NHX:S=B])a," + + "([&&NHX:S=E],[&&NHX:S=F],[&&NHX:S=F],[&&NHX:S=F])b," + + "([&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=I])c, " + + "([&&NHX:S=M],[&&NHX:S=N],[&&NHX:S=O],[&&NHX:S=O])d)r"; + g0 = ParserBasedPhylogenyFactory.getInstance().create( g0_str, new NHXParser() )[ 0 ]; + g0.setRooted( true ); + TaxonomyAssigner.execute( g0, s0 ); + if ( !g0.getNode( "a" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCD" ) ) { + return false; + } + if ( !g0.getNode( "b" ).getNodeData().getTaxonomy().getScientificName().equals( "EFGH" ) ) { + return false; + 
} + if ( !g0.getNode( "c" ).getNodeData().getTaxonomy().getScientificName().equals( "IJKL" ) ) { + return false; + } + if ( !g0.getNode( "d" ).getNodeData().getTaxonomy().getScientificName().equals( "MNOP" ) ) { + return false; + } + if ( !g0.getNode( "r" ).getNodeData().getTaxonomy().getScientificName().equals( "ROOT" ) ) { + return false; + } + g0_str = "(([&&NHX:S=A],[&&NHX:S=B],[&&NHX:S=A],[&&NHX:S=B])a," + + "([&&NHX:S=E],[&&NHX:S=F],[&&NHX:S=F],[&&NHX:S=F])b," + + "([&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L])c, " + + "([&&NHX:S=M],[&&NHX:S=N],[&&NHX:S=A],[&&NHX:S=O])d)r"; + g0 = ParserBasedPhylogenyFactory.getInstance().create( g0_str, new NHXParser() )[ 0 ]; + g0.setRooted( true ); + TaxonomyAssigner.execute( g0, s0 ); + if ( !g0.getNode( "a" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCD" ) ) { + return false; + } + if ( !g0.getNode( "b" ).getNodeData().getTaxonomy().getScientificName().equals( "EFGH" ) ) { + return false; + } + if ( !g0.getNode( "c" ).getNodeData().getTaxonomy().getScientificName().equals( "L" ) ) { + return false; + } + if ( !g0.getNode( "d" ).getNodeData().getTaxonomy().getScientificName().equals( "ROOT" ) ) { + return false; + } + if ( !g0.getNode( "r" ).getNodeData().getTaxonomy().getScientificName().equals( "ROOT" ) ) { + return false; + } + g0_str = "(([&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L])a," + + "([&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L])b," + + "([&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L])c, " + + "([&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L])d)r"; + g0 = ParserBasedPhylogenyFactory.getInstance().create( g0_str, new NHXParser() )[ 0 ]; + g0.setRooted( true ); + TaxonomyAssigner.execute( g0, s0 ); + if ( !g0.getNode( "a" ).getNodeData().getTaxonomy().getScientificName().equals( "L" ) ) { + return false; + } + if ( !g0.getNode( "b" ).getNodeData().getTaxonomy().getScientificName().equals( "L" ) ) { + return false; + } + if ( !g0.getNode( "c" ).getNodeData().getTaxonomy().getScientificName().equals( "L" ) ) { + return false; + } + if ( !g0.getNode( "d" ).getNodeData().getTaxonomy().getScientificName().equals( "L" ) ) { + return false; + } + if ( !g0.getNode( "r" ).getNodeData().getTaxonomy().getScientificName().equals( "L" ) ) { + return false; + } + g0_str = "((([&&NHX:S=A],[&&NHX:S=A],[&&NHX:S=A])a,[&&NHX:S=A])b,[&&NHX:S=A])c"; + g0 = ParserBasedPhylogenyFactory.getInstance().create( g0_str, new NHXParser() )[ 0 ]; + g0.setRooted( true ); + TaxonomyAssigner.execute( g0, s0 ); + if ( !g0.getNode( "a" ).getNodeData().getTaxonomy().getScientificName().equals( "A" ) ) { + return false; + } + if ( !g0.getNode( "b" ).getNodeData().getTaxonomy().getScientificName().equals( "A" ) ) { + return false; + } + if ( !g0.getNode( "c" ).getNodeData().getTaxonomy().getScientificName().equals( "A" ) ) { + return false; + } + g0_str = "((([&&NHX:S=A],[&&NHX:S=A],[&&NHX:S=B])a,[&&NHX:S=I])b,[&&NHX:S=J])c"; + g0 = ParserBasedPhylogenyFactory.getInstance().create( g0_str, new NHXParser() )[ 0 ]; + g0.setRooted( true ); + TaxonomyAssigner.execute( g0, s0 ); + if ( !g0.getNode( "a" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCD" ) ) { + return false; + } + if ( !g0.getNode( "b" ).getNodeData().getTaxonomy().getScientificName().equals( "ROOT" ) ) { + return false; + } + if ( !g0.getNode( "c" ).getNodeData().getTaxonomy().getScientificName().equals( "ROOT" ) ) { + return false; + } + g0_str = "(((([&&NHX:S=A],[&&NHX:S=B],[&&NHX:S=C],[&&NHX:S=D])a," + + "([&&NHX:S=D],[&&NHX:S=C],[&&NHX:S=B],[&&NHX:S=A])b)ab," + + 
"([&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L])c)abc, " + + "([&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L])d)r"; + g0 = ParserBasedPhylogenyFactory.getInstance().create( g0_str, new NHXParser() )[ 0 ]; + g0.setRooted( true ); + TaxonomyAssigner.execute( g0, s0 ); + if ( !g0.getNode( "a" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCD" ) ) { + return false; + } + if ( !g0.getNode( "b" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCD" ) ) { + return false; + } + if ( !g0.getNode( "ab" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCD" ) ) { + return false; + } + if ( !g0.getNode( "c" ).getNodeData().getTaxonomy().getScientificName().equals( "L" ) ) { + return false; + } + if ( !g0.getNode( "abc" ).getNodeData().getTaxonomy().getScientificName().equals( "ROOT" ) ) { + return false; + } + if ( !g0.getNode( "d" ).getNodeData().getTaxonomy().getScientificName().equals( "L" ) ) { + return false; + } + if ( !g0.getNode( "r" ).getNodeData().getTaxonomy().getScientificName().equals( "ROOT" ) ) { + return false; + } + g0_str = "(((([&&NHX:S=A],[&&NHX:S=A],[&&NHX:S=C],[&&NHX:S=D])a," + + "([&&NHX:S=D],[&&NHX:S=D],[&&NHX:S=B],[&&NHX:S=A])b)ab," + + "([&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L])c)abc, " + + "([&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L])d)r"; + g0 = ParserBasedPhylogenyFactory.getInstance().create( g0_str, new NHXParser() )[ 0 ]; + g0.setRooted( true ); + TaxonomyAssigner.execute( g0, s0 ); + if ( !g0.getNode( "a" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCD" ) ) { + return false; + } + if ( !g0.getNode( "b" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCD" ) ) { + return false; + } + if ( !g0.getNode( "ab" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCD" ) ) { + return false; + } + if ( !g0.getNode( "c" ).getNodeData().getTaxonomy().getScientificName().equals( "L" ) ) { + return false; + } + if ( !g0.getNode( "abc" ).getNodeData().getTaxonomy().getScientificName().equals( "ROOT" ) ) { + return false; + } + if ( !g0.getNode( "d" ).getNodeData().getTaxonomy().getScientificName().equals( "L" ) ) { + return false; + } + if ( !g0.getNode( "r" ).getNodeData().getTaxonomy().getScientificName().equals( "ROOT" ) ) { + return false; + } + g0_str = "(((([&&NHX:S=A],[&&NHX:S=A],[&&NHX:S=C],[&&NHX:S=D])a," + + "([&&NHX:S=D],[&&NHX:S=D],[&&NHX:S=B],[&&NHX:S=A])b)ab," + + "([&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L])c)abc, " + + "([&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=A])d)r"; + g0 = ParserBasedPhylogenyFactory.getInstance().create( g0_str, new NHXParser() )[ 0 ]; + g0.setRooted( true ); + TaxonomyAssigner.execute( g0, s0 ); + if ( !g0.getNode( "a" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCD" ) ) { + return false; + } + if ( !g0.getNode( "b" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCD" ) ) { + return false; + } + if ( !g0.getNode( "ab" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCD" ) ) { + return false; + } + if ( !g0.getNode( "c" ).getNodeData().getTaxonomy().getScientificName().equals( "L" ) ) { + return false; + } + if ( !g0.getNode( "abc" ).getNodeData().getTaxonomy().getScientificName().equals( "ROOT" ) ) { + return false; + } + if ( !g0.getNode( "d" ).getNodeData().getTaxonomy().getScientificName().equals( "ROOT" ) ) { + return false; + } + if ( !g0.getNode( "r" ).getNodeData().getTaxonomy().getScientificName().equals( "ROOT" ) ) { + return false; + } + g0_str = "(((([&&NHX:S=A],[&&NHX:S=A],[&&NHX:S=C],[&&NHX:S=D])a," + 
+ "([&&NHX:S=D],[&&NHX:S=D],[&&NHX:S=B],[&&NHX:S=A])b)ab," + + "([&&NHX:S=A],[&&NHX:S=A],[&&NHX:S=A],[&&NHX:S=A])c)abc, " + + "([&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=A])d)r"; + g0 = ParserBasedPhylogenyFactory.getInstance().create( g0_str, new NHXParser() )[ 0 ]; + g0.setRooted( true ); + TaxonomyAssigner.execute( g0, s0 ); + if ( !g0.getNode( "a" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCD" ) ) { + return false; + } + if ( !g0.getNode( "b" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCD" ) ) { + return false; + } + if ( !g0.getNode( "ab" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCD" ) ) { + return false; + } + if ( !g0.getNode( "c" ).getNodeData().getTaxonomy().getScientificName().equals( "A" ) ) { + return false; + } + if ( !g0.getNode( "abc" ).getNodeData().getTaxonomy().getScientificName().equals( "ABCD" ) ) { + return false; + } + if ( !g0.getNode( "d" ).getNodeData().getTaxonomy().getScientificName().equals( "ROOT" ) ) { + return false; + } + if ( !g0.getNode( "r" ).getNodeData().getTaxonomy().getScientificName().equals( "ROOT" ) ) { + return false; + } + s0_str = "(([&&NHX:S=A],[&&NHX:S=B],[&&NHX:S=C],[&&NHX:S=D])," + + "([&&NHX:S=E],[&&NHX:S=F],[&&NHX:S=G],[&&NHX:S=H])," + + "([&&NHX:S=I],[&&NHX:S=J],[&&NHX:S=K],[&&NHX:S=L]), " + + "([&&NHX:S=M],[&&NHX:S=N],[&&NHX:S=O],[&&NHX:S=P]))"; + s0 = ParserBasedPhylogenyFactory.getInstance().create( s0_str, new NHXParser() )[ 0 ]; + s0.setRooted( true ); + g0_str = "(((([&&NHX:S=A],[&&NHX:S=A],[&&NHX:S=C],[&&NHX:S=D])a," + + "([&&NHX:S=D],[&&NHX:S=D],[&&NHX:S=B],[&&NHX:S=A])b)ab," + + "([&&NHX:S=A],[&&NHX:S=A],[&&NHX:S=A],[&&NHX:S=A])c)abc, " + + "([&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=L],[&&NHX:S=A])d)r"; + g0 = ParserBasedPhylogenyFactory.getInstance().create( g0_str, new NHXParser() )[ 0 ]; + g0.setRooted( true ); + TaxonomyAssigner.execute( g0, s0 ); + if ( g0.getNode( "a" ).getNodeData().isHasTaxonomy() ) { + return false; + } + if ( !g0.getNode( "c" ).getNodeData().getTaxonomy().getScientificName().equals( "A" ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testUniprotTaxonomySearch() { + try { + List results = UniProtWsTools + .getTaxonomiesFromCommonNameStrict( "starlet sea anemone", 10 ); + if ( results.size() != 1 ) { + return false; + } + if ( !results.get( 0 ).getCode().equals( "NEMVE" ) ) { + return false; + } + if ( !results.get( 0 ).getCommonName().equalsIgnoreCase( "starlet sea anemone" ) ) { + return false; + } + if ( !results.get( 0 ).getId().equalsIgnoreCase( "45351" ) ) { + return false; + } + if ( !results.get( 0 ).getRank().equalsIgnoreCase( "species" ) ) { + return false; + } + if ( !results.get( 0 ).getScientificName().equals( "Nematostella vectensis" ) ) { + return false; + } + results = null; + results = UniProtWsTools.getTaxonomiesFromScientificNameStrict( "Nematostella vectensis", 10 ); + if ( results.size() != 1 ) { + return false; + } + if ( !results.get( 0 ).getCode().equals( "NEMVE" ) ) { + return false; + } + if ( !results.get( 0 ).getCommonName().equalsIgnoreCase( "starlet sea anemone" ) ) { + return false; + } + if ( !results.get( 0 ).getId().equalsIgnoreCase( "45351" ) ) { + return false; + } + if ( !results.get( 0 ).getRank().equalsIgnoreCase( "species" ) ) { + return false; + } + if ( !results.get( 0 ).getScientificName().equals( "Nematostella vectensis" ) ) { + return false; + } + results = null; + results = 
UniProtWsTools.getTaxonomiesFromId( "45351", 10 ); + if ( results.size() != 1 ) { + return false; + } + if ( !results.get( 0 ).getCode().equals( "NEMVE" ) ) { + return false; + } + if ( !results.get( 0 ).getCommonName().equalsIgnoreCase( "starlet sea anemone" ) ) { + return false; + } + if ( !results.get( 0 ).getId().equalsIgnoreCase( "45351" ) ) { + return false; + } + if ( !results.get( 0 ).getRank().equalsIgnoreCase( "species" ) ) { + return false; + } + if ( !results.get( 0 ).getScientificName().equals( "Nematostella vectensis" ) ) { + return false; + } + results = null; + results = UniProtWsTools.getTaxonomiesFromTaxonomyCode( "NEMVE", 10 ); + if ( results.size() != 1 ) { + return false; + } + if ( !results.get( 0 ).getCode().equals( "NEMVE" ) ) { + return false; + } + if ( !results.get( 0 ).getCommonName().equalsIgnoreCase( "starlet sea anemone" ) ) { + return false; + } + if ( !results.get( 0 ).getId().equalsIgnoreCase( "45351" ) ) { + return false; + } + if ( !results.get( 0 ).getRank().equalsIgnoreCase( "species" ) ) { + return false; + } + if ( !results.get( 0 ).getScientificName().equals( "Nematostella vectensis" ) ) { + return false; + } + if ( !results.get( 0 ).getLineage()[ 0 ].equals( "Eukaryota" ) ) { + return false; + } + if ( !results.get( 0 ).getLineage()[ 1 ].equals( "Metazoa" ) ) { + return false; + } + if ( !results.get( 0 ).getLineage()[ results.get( 0 ).getLineage().length - 1 ].equals( "Nematostella" ) ) { + return false; + } + } + catch ( final Exception e ) { + System.out.println(); + System.out.println( "the following might be due to absence internet connection:" ); + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testWabiTxSearch() { + try { + String result = ""; + result = TxSearch.searchSimple( "nematostella" ); + result = TxSearch.getTxId( "nematostella" ); + if ( !result.equals( "45350" ) ) { + return false; + } + result = TxSearch.getTxName( "45350" ); + if ( !result.equals( "Nematostella" ) ) { + return false; + } + result = TxSearch.getTxId( "nematostella vectensis" ); + if ( !result.equals( "45351" ) ) { + return false; + } + result = TxSearch.getTxName( "45351" ); + if ( !result.equals( "Nematostella vectensis" ) ) { + return false; + } + result = TxSearch.getTxId( "Bacillus subtilis subsp. subtilis str. N170" ); + if ( !result.equals( "536089" ) ) { + return false; + } + result = TxSearch.getTxName( "536089" ); + if ( !result.equals( "Bacillus subtilis subsp. subtilis str. N170" ) ) { + return false; + } + final List queries = new ArrayList(); + queries.add( "Campylobacter coli" ); + queries.add( "Escherichia coli" ); + queries.add( "Arabidopsis" ); + queries.add( "Trichoplax" ); + queries.add( "Samanea saman" ); + queries.add( "Kluyveromyces marxianus" ); + queries.add( "Bacillus subtilis subsp. subtilis str. 
N170" ); + queries.add( "Bornavirus parrot/PDD/2008" ); + final List ranks = new ArrayList(); + ranks.add( RANKS.SUPERKINGDOM ); + ranks.add( RANKS.KINGDOM ); + ranks.add( RANKS.FAMILY ); + ranks.add( RANKS.GENUS ); + ranks.add( RANKS.TRIBE ); + result = TxSearch.searchLineage( queries, ranks ); + result = TxSearch.searchParam( "Homo sapiens", TAX_NAME_CLASS.ALL, TAX_RANK.SPECIES, 10, true ); + result = TxSearch.searchParam( "Samanea saman", TAX_NAME_CLASS.SCIENTIFIC_NAME, TAX_RANK.ALL, 10, true ); + } + catch ( final Exception e ) { + System.out.println(); + System.out.println( "the following might be due to absence internet connection:" ); + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testAminoAcidSequence() { + try { + final Sequence aa1 = BasicSequence.createAaSequence( "aa1", "aAklm-?xX*z$#" ); + if ( aa1.getLength() != 13 ) { + return false; + } + if ( aa1.getResidueAt( 0 ) != 'A' ) { + return false; + } + if ( aa1.getResidueAt( 2 ) != 'K' ) { + return false; + } + if ( !new String( aa1.getMolecularSequence() ).equals( "AAKLM-XXX*ZXX" ) ) { + return false; + } + final Sequence aa2 = BasicSequence.createAaSequence( "aa3", "ARNDCQEGHILKMFPSTWYVX*-BZOJU" ); + if ( !new String( aa2.getMolecularSequence() ).equals( "ARNDCQEGHILKMFPSTWYVX*-BZXXU" ) ) { + return false; + } + final Sequence dna1 = BasicSequence.createDnaSequence( "dna1", "ACGTUX*-?RYMKWSN" ); + if ( !new String( dna1.getMolecularSequence() ).equals( "ACGTNN*-NRYMKWSN" ) ) { + return false; + } + final Sequence rna1 = BasicSequence.createRnaSequence( "rna1", "..ACGUTX*-?RYMKWSN" ); + if ( !new String( rna1.getMolecularSequence() ).equals( "--ACGUNN*-NRYMKWSN" ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace(); + return false; + } + return true; + } + + private static boolean testCreateBalancedPhylogeny() { + try { + final Phylogeny p0 = DevelopmentTools.createBalancedPhylogeny( 6, 5 ); + if ( p0.getRoot().getNumberOfDescendants() != 5 ) { + return false; + } + if ( p0.getNumberOfExternalNodes() != 15625 ) { + return false; + } + final Phylogeny p1 = DevelopmentTools.createBalancedPhylogeny( 2, 10 ); + if ( p1.getRoot().getNumberOfDescendants() != 10 ) { + return false; + } + if ( p1.getNumberOfExternalNodes() != 100 ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace(); + return false; + } + return true; + } + + private static boolean testFastaParser() { + try { + if ( !FastaParser.isLikelyFasta( new FileInputStream( PATH_TO_TEST_DATA + "fasta_0.fasta" ) ) ) { + return false; + } + if ( FastaParser.isLikelyFasta( new FileInputStream( PATH_TO_TEST_DATA + "msa_3.txt" ) ) ) { + return false; + } + final Msa msa_0 = FastaParser.parseMsa( new FileInputStream( PATH_TO_TEST_DATA + "fasta_0.fasta" ) ); + if ( !msa_0.getSequenceAsString( 0 ).toString().equalsIgnoreCase( "ACGTGKXFMFDMXEXXXSFMFMF" ) ) { + return false; + } + if ( !msa_0.getIdentifier( 0 ).equals( "one dumb" ) ) { + return false; + } + if ( !msa_0.getSequenceAsString( 1 ).toString().equalsIgnoreCase( "DKXASDFXSFXFKFKSXDFKSLX" ) ) { + return false; + } + if ( !msa_0.getSequenceAsString( 2 ).toString().equalsIgnoreCase( "SXDFKSXLFSFPWEXPRXWXERR" ) ) { + return false; + } + if ( !msa_0.getSequenceAsString( 3 ).toString().equalsIgnoreCase( "AAAAAAAAAAAAAAAAAAAAAAA" ) ) { + return false; + } + if ( !msa_0.getSequenceAsString( 4 ).toString().equalsIgnoreCase( "DDDDDDDDDDDDDDDDDDDDAXF" ) ) { + return false; + } + } + catch ( final Exception e ) { + 
e.printStackTrace(); + return false; + } + return true; + } + + private static boolean testGeneralMsaParser() { + try { + final String msa_str_0 = "seq1 abcd\n\nseq2 efgh\n"; + final Msa msa_0 = GeneralMsaParser.parse( new ByteArrayInputStream( msa_str_0.getBytes() ) ); + final String msa_str_1 = "seq_1 abc\nseq2 ghi\nseq_1 def\nseq2 jkm\n"; + final Msa msa_1 = GeneralMsaParser.parse( new ByteArrayInputStream( msa_str_1.getBytes() ) ); + final String msa_str_2 = "seq1 abc\nseq2 ghi\n\ndef\njkm\n"; + final Msa msa_2 = GeneralMsaParser.parse( new ByteArrayInputStream( msa_str_2.getBytes() ) ); + final String msa_str_3 = "seq1 abc\n def\nseq2 ghi\n jkm\n"; + final Msa msa_3 = GeneralMsaParser.parse( new ByteArrayInputStream( msa_str_3.getBytes() ) ); + final Msa msa_4 = GeneralMsaParser.parse( new FileInputStream( PATH_TO_TEST_DATA + "msa_1.txt" ) ); + if ( !msa_4.getSequenceAsString( 0 ).toString().equalsIgnoreCase( "abcdefeeeeeeeexx" ) ) { + return false; + } + if ( !msa_4.getSequenceAsString( 1 ).toString().equalsIgnoreCase( "efghixffffffffyy" ) ) { + return false; + } + if ( !msa_4.getSequenceAsString( 2 ).toString().equalsIgnoreCase( "klmnxphhhhhhhhzz" ) ) { + return false; + } + final Msa msa_5 = GeneralMsaParser.parse( new FileInputStream( PATH_TO_TEST_DATA + "msa_2.txt" ) ); + if ( !msa_5.getSequenceAsString( 0 ).toString().equalsIgnoreCase( "abcdefxx" ) ) { + return false; + } + if ( !msa_5.getSequenceAsString( 1 ).toString().equalsIgnoreCase( "efghixyy" ) ) { + return false; + } + if ( !msa_5.getSequenceAsString( 2 ).toString().equalsIgnoreCase( "klmnxpzz" ) ) { + return false; + } + final Msa msa_6 = GeneralMsaParser.parse( new FileInputStream( PATH_TO_TEST_DATA + "msa_3.txt" ) ); + if ( !msa_6.getSequenceAsString( 0 ).toString().equalsIgnoreCase( "abcdefeeeeeeeexx" ) ) { + return false; + } + if ( !msa_6.getSequenceAsString( 1 ).toString().equalsIgnoreCase( "efghixffffffffyy" ) ) { + return false; + } + if ( !msa_6.getSequenceAsString( 2 ).toString().equalsIgnoreCase( "klmnxphhhhhhhhzz" ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace(); + return false; + } + return true; + } + + private static boolean testMafft() { + try { + final List opts = new ArrayList(); + opts.add( "--maxiterate" ); + opts.add( "1000" ); + opts.add( "--localpair" ); + opts.add( "--quiet" ); + Msa msa = null; + final MsaInferrer mafft = Mafft.createInstance(); + msa = mafft.infer( new File( PATH_TO_TEST_DATA + "ncbi.fasta" ), opts ); + if ( ( msa == null ) || ( msa.getLength() < 10 ) || ( msa.getNumberOfSequences() != 19 ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } +} diff --git a/forester/java/src/org/forester/tools/ConfidenceAssessor.java b/forester/java/src/org/forester/tools/ConfidenceAssessor.java new file mode 100644 index 0000000..7fc7c90 --- /dev/null +++ b/forester/java/src/org/forester/tools/ConfidenceAssessor.java @@ -0,0 +1,178 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. 
+// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.tools; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Confidence; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.ForesterUtil; + +public final class ConfidenceAssessor { + + private ConfidenceAssessor() { + // Hidden constructor. + } + + private final static void addExternalNodesToMap( final Map> node_to_ext_nodes_map, + final PhylogenyNode node ) { + final Set ex_descs = new HashSet(); + for( final PhylogenyNode n : node.getAllExternalDescendants() ) { + if ( ex_descs.contains( n ) ) { + throw new IllegalArgumentException( "node [" + n.toString() + "] of target is not unique" ); + } + ex_descs.add( n ); + } + node_to_ext_nodes_map.put( node, ex_descs ); + } + + private final static void checkPreconditions( final String confidence_type, + final Phylogeny[] evaluators, + final Phylogeny target, + final double value, + final int first, + final int last ) { + if ( ( first < 0 ) || ( last < 0 ) ) { + throw new IllegalArgumentException( "attempt to set first or last evaluator topology to use to a number less than zero" ); + } + if ( evaluators.length < 1 ) { + throw new IllegalArgumentException( "need at least one evaluator topology" ); + } + if ( ForesterUtil.isEmpty( confidence_type ) ) { + throw new IllegalArgumentException( "attempt to use empty confidence type" ); + } + if ( value <= 0 ) { + throw new IllegalArgumentException( "attempt to use zero or negative \'count value\'" ); + } + if ( ( first != 0 ) || ( last != 0 ) ) { + if ( ( last >= evaluators.length ) || ( last <= first ) ) { + throw new IllegalArgumentException( "illegal value for last evaluator topology to use" ); + } + } + final Set nodes = new HashSet(); + for( final PhylogenyNodeIterator it = target.iteratorPostorder(); it.hasNext(); ) { + final PhylogenyNode node = it.next(); + if ( nodes.contains( node ) ) { + throw new IllegalArgumentException( "node [" + node + "] in target is not unique" ); + } + nodes.add( node ); + final List confidences = node.getBranchData().getConfidences(); + for( final Confidence confidence : confidences ) { + if ( confidence.getType().equals( confidence_type ) ) { + throw new IllegalArgumentException( "confidence [" + confidence_type + + "] is already present in target" ); + } + } + } + } + + public final static void evaluate( final String confidence_type, + final Phylogeny[] evaluators, + final Phylogeny target, + final boolean strict, + final double value ) { + evaluate( confidence_type, evaluators, target, strict, value, 0, 0 ); + } + + public final static void evaluate( final String confidence_type, + final Phylogeny[] evaluators, + final Phylogeny target, + final boolean strict, + final double value, + final int first, + final int last ) { + try { + checkPreconditions( 
confidence_type, evaluators, target, value, first, last ); + } + catch ( final IllegalArgumentException e ) { + throw e; + } + boolean all = true; + if ( ( first != 0 ) || ( last != 0 ) ) { + all = false; + } + int counter = 0; + final Map> node_to_ext_nodes_map = new HashMap>(); + for( final Phylogeny evaluator : evaluators ) { + if ( all || ( ( counter >= first ) && ( counter <= last ) ) ) { + if ( strict ) { + if ( evaluator.getNumberOfExternalNodes() != target.getNumberOfExternalNodes() ) { + throw new IllegalArgumentException( "evaluator #" + counter + + " does not have the same number of external nodes [" + + evaluator.getNumberOfExternalNodes() + "] than the corresponding target [" + + target.getNumberOfExternalNodes() + "]" ); + } + } + final TreeSplitMatrix s = new TreeSplitMatrix( evaluator, strict, target ); + for( final PhylogenyNodeIterator it = target.iteratorPostorder(); it.hasNext(); ) { + final PhylogenyNode node = it.next(); + if ( !node.isExternal() && !node.isRoot() ) { + if ( node.getParent().isRoot() + && ( target.getRoot().getNumberOfDescendants() == 2 ) + && ( target.getRoot().getChildNode1().isExternal() || target.getRoot().getChildNode2() + .isExternal() ) ) { + continue; + } + if ( !node_to_ext_nodes_map.containsKey( node ) ) { + addExternalNodesToMap( node_to_ext_nodes_map, node ); + } + final Set ex_descs = node_to_ext_nodes_map.get( node ); + final Confidence c = ConfidenceAssessor.obtainConfidence( node, confidence_type ); + if ( s.match( ex_descs ) ) { + c.setValue( c.getValue() + value ); + } + } + } + } + ++counter; + } + } + + private final static Confidence obtainConfidence( final PhylogenyNode n, final String confidence_type ) { + final List confidences = n.getBranchData().getConfidences(); + Confidence match = null; + for( final Confidence confidence : confidences ) { + if ( confidence.getType().equals( confidence_type ) ) { + if ( match != null ) { + throw new IllegalArgumentException( "confidence [" + confidence_type + "] is not unique" ); + } + match = confidence; + } + } + if ( match == null ) { + match = new Confidence( 0, confidence_type ); + confidences.add( match ); + } + return match; + } +} diff --git a/forester/java/src/org/forester/tools/PhylogenyDecorator.java b/forester/java/src/org/forester/tools/PhylogenyDecorator.java new file mode 100644 index 0000000..086e51a --- /dev/null +++ b/forester/java/src/org/forester/tools/PhylogenyDecorator.java @@ -0,0 +1,525 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.tools; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.forester.io.parsers.nhx.NHXFormatException; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Accession; +import org.forester.phylogeny.data.Annotation; +import org.forester.phylogeny.data.DomainArchitecture; +import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.data.Sequence; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.util.BasicTable; +import org.forester.util.BasicTableParser; +import org.forester.util.ForesterUtil; + +public final class PhylogenyDecorator { + + // From evoruby/lib/evo/apps/tseq_taxonomy_processor.rb: + final private static String TP_TAXONOMY_CODE = "TAXONOMY_CODE"; + final private static String TP_TAXONOMY_ID = "TAXONOMY_ID"; + final private static String TP_TAXONOMY_ID_PROVIDER = "TAXONOMY_ID_PROVIDER"; + final private static String TP_TAXONOMY_SN = "TAXONOMY_SN"; + final private static String TP_TAXONOMY_CN = "TAXONOMY_CN"; + final private static String TP_TAXONOMY_SYN = "TAXONOMY_SYN"; + final private static String TP_SEQ_SYMBOL = "SEQ_SYMBOL"; + final private static String TP_SEQ_ACCESSION = "SEQ_ACCESSION"; + final private static String TP_SEQ_ACCESSION_SOURCE = "SEQ_ACCESSION_SOURCE"; + final private static String TP_SEQ_ANNOTATION_DESC = "SEQ_ANNOTATION_DESC"; + final private static String TP_SEQ_ANNOTATION_REF = "SEQ_ANNOTATION_REF"; + final private static String TP_SEQ_MOL_SEQ = "SEQ_MOL_SEQ"; + final private static String TP_SEQ_NAME = "SEQ_NAME"; + final private static String TP_NODE_NAME = "NODE_NAME"; + final private static Pattern NODENAME_SEQNUMBER_TAXDOMAINNUMBER = Pattern + .compile( "^([a-fA-Z0-9]{1,5})_([A-Z0-9]{2,4}[A-Z])(\\d{1,4})$" ); + public final static boolean SANITIZE = false; + public final static boolean VERBOSE = true; + + private PhylogenyDecorator() { + // Not needed. 
+ } + + public static void decorate( final Phylogeny phylogeny, + final Map> map, + final boolean picky, + final int numbers_of_chars_allowed_to_remove_if_not_found_in_map ) + throws IllegalArgumentException { + for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + final String name = node.getName(); + if ( !ForesterUtil.isEmpty( name ) ) { + if ( map.containsKey( name ) || ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) { + Map new_values = map.get( name ); + int x = 0; + while ( ( new_values == null ) && ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) + && ( x <= numbers_of_chars_allowed_to_remove_if_not_found_in_map ) ) { + new_values = map.get( name.substring( 0, name.length() - x ) ); + ++x; + } + if ( new_values != null ) { + if ( new_values.containsKey( TP_TAXONOMY_CODE ) ) { + ForesterUtil.ensurePresenceOfTaxonomy( node ); + node.getNodeData().getTaxonomy().setTaxonomyCode( new_values.get( TP_TAXONOMY_CODE ) ); + } + if ( new_values.containsKey( TP_TAXONOMY_ID ) + && new_values.containsKey( TP_TAXONOMY_ID_PROVIDER ) ) { + ForesterUtil.ensurePresenceOfTaxonomy( node ); + node.getNodeData().getTaxonomy().setIdentifier( new Identifier( new_values + .get( TP_TAXONOMY_ID ), new_values.get( TP_TAXONOMY_ID_PROVIDER ) ) ); + } + else if ( new_values.containsKey( TP_TAXONOMY_ID ) ) { + ForesterUtil.ensurePresenceOfTaxonomy( node ); + node.getNodeData().getTaxonomy().setIdentifier( new Identifier( new_values + .get( TP_TAXONOMY_ID ) ) ); + } + if ( new_values.containsKey( TP_TAXONOMY_SN ) ) { + ForesterUtil.ensurePresenceOfTaxonomy( node ); + node.getNodeData().getTaxonomy().setScientificName( new_values.get( TP_TAXONOMY_SN ) ); + } + if ( new_values.containsKey( TP_TAXONOMY_CN ) ) { + ForesterUtil.ensurePresenceOfTaxonomy( node ); + node.getNodeData().getTaxonomy().setCommonName( new_values.get( TP_TAXONOMY_CN ) ); + } + if ( new_values.containsKey( TP_TAXONOMY_SYN ) ) { + ForesterUtil.ensurePresenceOfTaxonomy( node ); + node.getNodeData().getTaxonomy().getSynonyms().add( new_values.get( TP_TAXONOMY_SYN ) ); + } + if ( new_values.containsKey( TP_SEQ_ACCESSION ) + && new_values.containsKey( TP_SEQ_ACCESSION_SOURCE ) ) { + ForesterUtil.ensurePresenceOfSequence( node ); + node.getNodeData().getSequence().setAccession( new Accession( new_values + .get( TP_SEQ_ACCESSION ), new_values.get( TP_SEQ_ACCESSION_SOURCE ) ) ); + } + if ( new_values.containsKey( TP_SEQ_ANNOTATION_DESC ) ) { + ForesterUtil.ensurePresenceOfSequence( node ); + final Annotation ann = new Annotation( "?" 
); + ann.setDesc( new_values.get( TP_SEQ_ANNOTATION_DESC ) ); + node.getNodeData().getSequence().addAnnotation( ann ); + } + if ( new_values.containsKey( TP_SEQ_ANNOTATION_REF ) ) { + ForesterUtil.ensurePresenceOfSequence( node ); + final Annotation ann = new Annotation( new_values.get( TP_SEQ_ANNOTATION_REF ) ); + node.getNodeData().getSequence().addAnnotation( ann ); + } + if ( new_values.containsKey( TP_SEQ_SYMBOL ) ) { + ForesterUtil.ensurePresenceOfSequence( node ); + node.getNodeData().getSequence().setSymbol( new_values.get( TP_SEQ_SYMBOL ) ); + } + if ( new_values.containsKey( TP_SEQ_NAME ) ) { + ForesterUtil.ensurePresenceOfSequence( node ); + node.getNodeData().getSequence().setName( new_values.get( TP_SEQ_NAME ) ); + } + if ( new_values.containsKey( TP_SEQ_MOL_SEQ ) ) { + ForesterUtil.ensurePresenceOfSequence( node ); + node.getNodeData().getSequence().setMolecularSequence( new_values.get( TP_SEQ_MOL_SEQ ) ); + } + if ( new_values.containsKey( TP_NODE_NAME ) ) { + node.setName( new_values.get( TP_NODE_NAME ) ); + } + } + } + else if ( picky ) { + throw new IllegalArgumentException( "\"" + name + "\" not found in name map" ); + } + } + } + } + + /** + * + * + * + * + * + * @param phylogeny + * @param map + * maps names (in phylogeny) to new values + * @param field + * @param picky + * @throws IllegalArgumentException + * @throws NHXFormatException + */ + public static void decorate( final Phylogeny phylogeny, + final Map map, + final FIELD field, + final boolean extract_bracketed_scientific_name, + final boolean picky, + final boolean cut_name_after_space, + final boolean process_name_intelligently, + final boolean process_similar_to, + final int numbers_of_chars_allowed_to_remove_if_not_found_in_map, + final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException, + NHXFormatException { + PhylogenyDecorator.decorate( phylogeny, + map, + field, + extract_bracketed_scientific_name, + picky, + null, + cut_name_after_space, + process_name_intelligently, + process_similar_to, + numbers_of_chars_allowed_to_remove_if_not_found_in_map, + move_domain_numbers_at_end_to_middle ); + } + + /** + * + * + * + * @param phylogeny + * @param map + * maps names (in phylogeny) to new values if intermediate_map is + * null otherwise maps intermediate value to new value + * @param field + * @param picky + * @param intermediate_map + * maps name (in phylogeny) to a intermediate value + * @throws IllegalArgumentException + */ + public static void decorate( final Phylogeny phylogeny, + final Map map, + final FIELD field, + final boolean extract_bracketed_scientific_name, + final boolean picky, + final Map intermediate_map, + final boolean cut_name_after_space, + final boolean process_name_intelligently, + final boolean process_similar_to, + final int numbers_of_chars_allowed_to_remove_if_not_found_in_map, + final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException { + if ( extract_bracketed_scientific_name && ( field == FIELD.TAXONOMY_SCIENTIFIC_NAME ) ) { + throw new IllegalArgumentException( "Attempt to extract bracketed scientific name together with data field pointing to scientific name" ); + } + for( final PhylogenyNodeIterator iter = phylogeny.iteratorPostorder(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + String name = node.getName(); + if ( !ForesterUtil.isEmpty( name ) ) { + if ( intermediate_map != null ) { + name = PhylogenyDecorator.extractIntermediate( intermediate_map, name ); + } + if ( map.containsKey( name ) || ( 
numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) ) { + String new_value = map.get( name ); + int x = 0; + while ( ( new_value == null ) && ( numbers_of_chars_allowed_to_remove_if_not_found_in_map > 0 ) + && ( x <= numbers_of_chars_allowed_to_remove_if_not_found_in_map ) ) { + new_value = map.get( name.substring( 0, name.length() - x ) ); + ++x; + } + if ( new_value != null ) { + new_value = new_value.trim(); + new_value = new_value.replaceAll( "\\s+", " " ); + if ( extract_bracketed_scientific_name && new_value.endsWith( "]" ) ) { + extractBracketedScientificNames( node, new_value ); + } + switch ( field ) { + case SEQUENCE_ANNOTATION_DESC: + if ( PhylogenyDecorator.VERBOSE ) { + System.out.println( name + ": " + new_value ); + } + if ( !node.getNodeData().isHasSequence() ) { + node.getNodeData().setSequence( new Sequence() ); + } + final Annotation annotation = new Annotation( "?" ); + annotation.setDesc( new_value ); + node.getNodeData().getSequence().addAnnotation( annotation ); + break; + case DOMAIN_STRUCTURE: + if ( PhylogenyDecorator.VERBOSE ) { + System.out.println( name + ": " + new_value ); + } + if ( !node.getNodeData().isHasSequence() ) { + node.getNodeData().setSequence( new Sequence() ); + } + node.getNodeData().getSequence() + .setDomainArchitecture( new DomainArchitecture( new_value ) ); + break; + case TAXONOMY_CODE: + if ( PhylogenyDecorator.VERBOSE ) { + System.out.println( name + ": " + new_value ); + } + ForesterUtil.ensurePresenceOfTaxonomy( node ); + node.getNodeData().getTaxonomy().setTaxonomyCode( new_value ); + break; + case TAXONOMY_SCIENTIFIC_NAME: + if ( PhylogenyDecorator.VERBOSE ) { + System.out.println( name + ": " + new_value ); + } + ForesterUtil.ensurePresenceOfTaxonomy( node ); + node.getNodeData().getTaxonomy().setScientificName( new_value ); + break; + case SEQUENCE_NAME: + if ( PhylogenyDecorator.VERBOSE ) { + System.out.println( name + ": " + new_value ); + } + if ( !node.getNodeData().isHasSequence() ) { + node.getNodeData().setSequence( new Sequence() ); + } + node.getNodeData().getSequence().setName( new_value ); + break; + case NODE_NAME: + if ( PhylogenyDecorator.VERBOSE ) { + System.out.print( name + " -> " ); + } + if ( cut_name_after_space ) { + if ( PhylogenyDecorator.VERBOSE ) { + System.out.print( new_value + " -> " ); + } + new_value = PhylogenyDecorator.deleteAtFirstSpace( new_value ); + } + else if ( process_name_intelligently ) { + if ( PhylogenyDecorator.VERBOSE ) { + System.out.print( new_value + " -> " ); + } + new_value = PhylogenyDecorator.processNameIntelligently( new_value ); + } + else if ( process_similar_to ) { + if ( PhylogenyDecorator.VERBOSE ) { + System.out.print( new_value + " -> " ); + } + new_value = PhylogenyDecorator.processSimilarTo( new_value ); + } + if ( PhylogenyDecorator.SANITIZE ) { + new_value = PhylogenyDecorator.sanitize( new_value ); + } + if ( PhylogenyDecorator.VERBOSE ) { + System.out.println( new_value ); + } + node.setName( new_value ); + break; + default: + throw new RuntimeException( "unknown field \"" + field + "\"" ); + } + if ( move_domain_numbers_at_end_to_middle && ( field != FIELD.NODE_NAME ) ) { + node.setName( moveDomainNumbersAtEnd( node.getName() ) ); + } + } + } + else if ( picky ) { + throw new IllegalArgumentException( "\"" + name + "\" not found in name map" ); + } + } + } + } + + public static void decorate( final Phylogeny[] phylogenies, + final Map> map, + final boolean picky, + final int numbers_of_chars_allowed_to_remove_if_not_found_in_map ) + throws IllegalArgumentException,
NHXFormatException { + for( int i = 0; i < phylogenies.length; ++i ) { + PhylogenyDecorator.decorate( phylogenies[ i ], + map, + picky, + numbers_of_chars_allowed_to_remove_if_not_found_in_map ); + } + } + + public static void decorate( final Phylogeny[] phylogenies, + final Map map, + final FIELD field, + final boolean extract_bracketed_scientific_name, + final boolean picky, + final boolean cut_name_after_space, + final boolean process_name_intelligently, + final boolean process_similar_to, + final int numbers_of_chars_allowed_to_remove_if_not_found_in_map, + final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException, + NHXFormatException { + for( int i = 0; i < phylogenies.length; ++i ) { + PhylogenyDecorator.decorate( phylogenies[ i ], + map, + field, + extract_bracketed_scientific_name, + picky, + cut_name_after_space, + process_name_intelligently, + process_similar_to, + numbers_of_chars_allowed_to_remove_if_not_found_in_map, + move_domain_numbers_at_end_to_middle ); + } + } + + public static void decorate( final Phylogeny[] phylogenies, + final Map map, + final FIELD field, + final boolean extract_bracketed_scientific_name, + final boolean picky, + final Map intermediate_map, + final boolean cut_name_after_space, + final boolean process_name_intelligently, + final boolean process_similar_to, + final int numbers_of_chars_allowed_to_remove_if_not_found_in_map, + final boolean move_domain_numbers_at_end_to_middle ) throws IllegalArgumentException, + NHXFormatException { + for( int i = 0; i < phylogenies.length; ++i ) { + PhylogenyDecorator.decorate( phylogenies[ i ], + map, + field, + extract_bracketed_scientific_name, + picky, + intermediate_map, + cut_name_after_space, + process_name_intelligently, + process_similar_to, + numbers_of_chars_allowed_to_remove_if_not_found_in_map, + move_domain_numbers_at_end_to_middle ); + } + } + + private static String deleteAtFirstSpace( final String name ) { + final int first_space = name.indexOf( " " ); + if ( first_space > 1 ) { + return name.substring( 0, first_space ).trim(); + } + return name; + } + + private static void extractBracketedScientificNames( final PhylogenyNode node, final String new_value ) { + final int i = new_value.lastIndexOf( "[" ); + final String scientific_name = new_value.substring( i + 1, new_value.length() - 1 ); + ForesterUtil.ensurePresenceOfTaxonomy( node ); + node.getNodeData().getTaxonomy().setScientificName( scientific_name ); + } + + private static String extractIntermediate( final Map intermediate_map, final String name ) { + String new_name = null; + if ( PhylogenyDecorator.VERBOSE ) { + System.out.print( name + " => " ); + } + if ( intermediate_map.containsKey( name ) ) { + new_name = intermediate_map.get( name ); + if ( ForesterUtil.isEmpty( new_name ) ) { + throw new IllegalArgumentException( "\"" + name + "\" maps to null or empty string in secondary map" ); + } + } + else { + throw new IllegalArgumentException( "\"" + name + "\" not found in name secondary map" ); + } + if ( PhylogenyDecorator.VERBOSE ) { + System.out.println( new_name + " " ); + } + return new_name; + } + + private static String moveDomainNumbersAtEnd( final String node_name ) { + final Matcher m = NODENAME_SEQNUMBER_TAXDOMAINNUMBER.matcher( node_name ); + if ( m.matches() ) { + final String seq_number = m.group( 1 ); + final String tax = m.group( 2 ); + final String domain_number = m.group( 3 ); + return seq_number + "_[" + domain_number + "]_" + tax; + } + else { + return node_name; + } + } + + public static Map> 
parseMappingTable( final File mapping_table_file ) + throws IOException { + final Map> map = new HashMap>(); + BasicTable mapping_table = null; + mapping_table = BasicTableParser.parse( mapping_table_file, "\t", false ); + for( int row = 0; row < mapping_table.getNumberOfRows(); ++row ) { + final Map row_map = new HashMap(); + String name = null; + for( int col = 0; col < mapping_table.getNumberOfColumns(); ++col ) { + final String table_cell = mapping_table.getValue( col, row ); + if ( col == 0 ) { + name = table_cell; + } + else if ( table_cell != null ) { + final String key = table_cell.substring( 0, table_cell.indexOf( ':' ) ); + final String val = table_cell.substring( table_cell.indexOf( ':' ) + 1, table_cell.length() ); + row_map.put( key, val ); + } + } + map.put( name, row_map ); + } + return map; + } + + private static String processNameIntelligently( final String name ) { + final String[] s = name.split( " " ); + if ( s.length < 2 ) { + return name; + } + else if ( ( s[ 0 ].indexOf( "_" ) > 0 ) && ( s[ 0 ].indexOf( "|" ) > 0 ) ) { + return s[ 0 ]; + } + else if ( ( s[ 1 ].indexOf( "_" ) > 0 ) && ( s[ 1 ].indexOf( "|" ) > 0 ) ) { + return s[ 1 ]; + } + else if ( ( s[ 0 ].indexOf( "_" ) > 0 ) && ( s[ 0 ].indexOf( "." ) > 0 ) ) { + return s[ 0 ]; + } + else if ( ( s[ 1 ].indexOf( "_" ) > 0 ) && ( s[ 1 ].indexOf( "." ) > 0 ) ) { + return s[ 1 ]; + } + else if ( s[ 0 ].indexOf( "_" ) > 0 ) { + return s[ 0 ]; + } + else if ( s[ 1 ].indexOf( "_" ) > 0 ) { + return s[ 1 ]; + } + else { + return s[ 0 ]; + } + } + + private static String processSimilarTo( final String name ) { + final int i = name.toLowerCase().indexOf( "similar to" ); + String similar_to = ""; + if ( i >= 0 ) { + similar_to = " similarity=" + name.substring( i + 10 ).trim(); + } + final String pi = processNameIntelligently( name ); + return pi + similar_to; + } + + private static String sanitize( String s ) { + s = s.replace( ' ', '_' ); + s = s.replace( '(', '{' ); + s = s.replace( ')', '}' ); + s = s.replace( '[', '{' ); + s = s.replace( ']', '}' ); + s = s.replace( ',', '_' ); + return s; + } + + public static enum FIELD { + NODE_NAME, SEQUENCE_ANNOTATION_DESC, DOMAIN_STRUCTURE, TAXONOMY_CODE, TAXONOMY_SCIENTIFIC_NAME, SEQUENCE_NAME; + } +} diff --git a/forester/java/src/org/forester/tools/SupportCount.java b/forester/java/src/org/forester/tools/SupportCount.java new file mode 100644 index 0000000..a3c7fa8 --- /dev/null +++ b/forester/java/src/org/forester/tools/SupportCount.java @@ -0,0 +1,250 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.tools; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; + +/* + * A simple class containing a static method to evaluate the topology of a given + * phylogeny with a list of resampled phylogenies. + * + * + * @author Christian M Zmasek + */ +public final class SupportCount { + + private SupportCount() { + } + + public static double compare( final Phylogeny phylogeny, + final Phylogeny evaluator_phylogeny, + final boolean strip_evaluator_phylogeny, + final boolean update_support_in_phylogeny, + final boolean re_root ) { + String[] seq_names_to_keep = null; + if ( strip_evaluator_phylogeny ) { + seq_names_to_keep = phylogeny.getAllExternalNodeNames(); + SupportCount.strip( seq_names_to_keep, evaluator_phylogeny ); + } + if ( re_root ) { + final String child0_name = phylogeny.getFirstExternalNode().getName(); + phylogeny.reRoot( phylogeny.getNode( child0_name ) ); + evaluator_phylogeny.reRoot( evaluator_phylogeny.getNode( child0_name ) ); + } + final Map> phylogeny_external_names_per_node = SupportCount + .extractExternalNamesPerNode( phylogeny ); + return ( SupportCount.compare( phylogeny, + evaluator_phylogeny, + phylogeny_external_names_per_node, + update_support_in_phylogeny, + -1 ) ); + } + + /** + * + * Precondition: phylogeny and evaluator_phylogeny have to be rooted in the + * same manner. + * + * Returns a measure of the similarity ("average bootstrap similarity") + * between the topologies of phylogeny and evaluator_phylogeny: (sum of + * branches which divide phylogeny in a manner consistent with + * evaluator_phylogeny)/sum of branches in phylogeny. Therefore, this + * measure is 1.0 for identical topologies and 0.0 for completely + * incompatible topologies.
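+ * For example (illustrative only), a target topology in which 8 of its 10 internal branches are consistent with the evaluator topology would score 0.8 under this measure.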
+ * + * + * @param phylogeny + * @param evaluator_phylogeny + * @param external_names_per_node + * @param update_support_in_phylogeny + * set to true to update support values in phylogeny, otherwise, + * just calculation of the "average bootstrap similarity" + * @return a measure of the similarity ("average bootstrap similarity") + * between phylogeny and evaluator_phylogeny + */ + private static double compare( final Phylogeny phylogeny, + final Phylogeny evaluator_phylogeny, + final Map> phylogeny_external_names_per_node, + final boolean update_support_in_phylogeny, + final double similarity_threshold ) { + int matching_branches = 0; + int phylogeny_total_internal_branches = 0; + for( final PhylogenyNodeIterator it = phylogeny.iteratorPostorder(); it.hasNext(); ) { + if ( !it.next().isExternal() ) { + ++phylogeny_total_internal_branches; + } + } + final Map support_values = new HashMap(); + E: for( final PhylogenyNodeIterator evaluator_phylogeny_it = evaluator_phylogeny.iteratorPostorder(); evaluator_phylogeny_it + .hasNext(); ) { + final List c1 = new ArrayList(); + for( final Object element : evaluator_phylogeny_it.next().getAllExternalDescendants() ) { + c1.add( ( ( PhylogenyNode ) element ).getName() ); + } + for( final Integer id : phylogeny_external_names_per_node.keySet() ) { + final List c2 = phylogeny_external_names_per_node.get( id ); + if ( ( c2.size() == c1.size() ) && c2.containsAll( c1 ) ) { + if ( c2.size() > 1 ) { + matching_branches++; + } + if ( update_support_in_phylogeny ) { + final PhylogenyNode node = phylogeny.getNode( id.intValue() ); + double d = PhylogenyMethods.getConfidenceValue( node ); + if ( d < 1.0 ) { + d = 1.0; + } + else { + ++d; + } + support_values.put( node, new Double( d ) ); + } + continue E; + } + } + } + final double similarity = ( double ) matching_branches / phylogeny_total_internal_branches; + if ( ( similarity_threshold < 0.0 ) || ( similarity >= similarity_threshold ) ) { + for( final PhylogenyNode node : support_values.keySet() ) { + double b = support_values.get( node ).doubleValue(); + if ( b < 0 ) { + b = 0.0; + } + PhylogenyMethods.setBootstrapConfidence( node, b ); + } + } + return similarity; + } + + public static void count( final Phylogeny phylogeny, + final Phylogeny[] evaluator_phylogenies, + final boolean strip_evaluator_phylogenies, + final boolean verbose ) { + SupportCount.count( phylogeny, evaluator_phylogenies, strip_evaluator_phylogenies, -1, verbose ); + } + + /** + * This counts the support of topology phylogeny by the topologies in + * phylogenies. If phylogenies contains topologies with names not present in + * phylogeny, strip_evaluator_phylogenies must be set to true. phylogeny must not + * contain names not found in all phylogenies.
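+ * The returned list holds the (unstripped) evaluator topologies whose similarity score is at least + * similarity_threshold, or all of them if the threshold is negative.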
+ * + * @param phylogeny + * the topology to be evaluated + * @param evaluator_phylogenies + * the topologies used for evaluation + * @param strip_evaluator_phylogenies + * set to true if phylogenies contains topologies with names not + * present in phylogeny + */ + public static List count( final Phylogeny phylogeny, + final Phylogeny[] evaluator_phylogenies, + final boolean strip_evaluator_phylogenies, + final double similarity_threshold, + final boolean verbose ) { + String[] seq_names_to_keep = null; + final List evaluator_phylogenies_above_threshold = new ArrayList(); + if ( strip_evaluator_phylogenies ) { + seq_names_to_keep = phylogeny.getAllExternalNodeNames(); + } + final String child0_name = phylogeny.getFirstExternalNode().getName(); + phylogeny.reRoot( phylogeny.getNode( child0_name ) ); + final Map> phylogeny_external_names_per_node = SupportCount + .extractExternalNamesPerNode( phylogeny ); + if ( verbose ) { + System.out.println(); + System.out.println( "evaluator phylogeny #: similarity score (max is 1.0)" ); + System.out.println( "----------------------------------------------------" ); + System.out.println(); + } + for( int i = 0; i < evaluator_phylogenies.length; ++i ) { + final Phylogeny evaluator_phylogeny = evaluator_phylogenies[ i ]; + evaluator_phylogeny.reRoot( evaluator_phylogeny.getNode( child0_name ) ); + Phylogeny unstripped_evaluator_phylogeny = evaluator_phylogeny; + if ( strip_evaluator_phylogenies ) { + unstripped_evaluator_phylogeny = evaluator_phylogeny.copy(); + SupportCount.strip( seq_names_to_keep, evaluator_phylogeny ); + evaluator_phylogeny.orderAppearance( true ); // This is for + // easier + // comparison if + // phylos are saved + // to file. + } + final double s = SupportCount.compare( phylogeny, + evaluator_phylogenies[ i ], + phylogeny_external_names_per_node, + true, + similarity_threshold ); + if ( ( similarity_threshold < 0.0 ) || ( s >= similarity_threshold ) ) { + unstripped_evaluator_phylogeny.orderAppearance( true ); + evaluator_phylogenies_above_threshold.add( unstripped_evaluator_phylogeny ); + } + if ( verbose ) { + if ( similarity_threshold < 0.0 ) { + System.out.println( i + ": " + s ); + } + else if ( s >= similarity_threshold ) { + System.out.println( i + ": " + s + " <====" ); + } + else { + System.out.println( i + ": " + s ); + } + } + } + if ( verbose ) { + System.out.println( "----------------------------------------------------" ); + System.out.println(); + } + return evaluator_phylogenies_above_threshold; + } + + private static Map> extractExternalNamesPerNode( final Phylogeny phylogeny ) + throws NoSuchElementException { + final HashMap> phylogeny_external_names_per_node = new HashMap>(); + for( final PhylogenyNodeIterator it = phylogeny.iteratorPostorder(); it.hasNext(); ) { + final PhylogenyNode n = it.next(); + final List l = n.getAllExternalDescendants(); + final ArrayList c = new ArrayList(); + phylogeny_external_names_per_node.put( new Integer( n.getId() ), c ); + for( final PhylogenyNode phylogenyNode : l ) { + c.add( phylogenyNode.getName() ); + } + } + return phylogeny_external_names_per_node; + } + + private static void strip( final String[] to_keep, final Phylogeny to_be_stripped ) { + PhylogenyMethods.deleteExternalNodesPositiveSelection( to_keep, to_be_stripped ); + } +} diff --git a/forester/java/src/org/forester/tools/TreeSplitMatrix.java b/forester/java/src/org/forester/tools/TreeSplitMatrix.java new file mode 100644 index 0000000..d1e015b --- /dev/null +++
b/forester/java/src/org/forester/tools/TreeSplitMatrix.java @@ -0,0 +1,257 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.tools; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; + +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; + +public class TreeSplitMatrix { + + private final SortedMap> _data; + private final Map _positive_counts; + private final boolean _strict; + + public TreeSplitMatrix( final Phylogeny evaluator, final boolean strict, final Phylogeny target ) { + Set target_external_nodes = null; + if ( !strict ) { + if ( ( target == null ) || target.isEmpty() ) { + throw new IllegalArgumentException( "target must not be null or empty if non-strict evaluation is expected" ); + } + target_external_nodes = new HashSet(); + for( final PhylogenyNodeIterator it = target.iteratorExternalForward(); it.hasNext(); ) { + final PhylogenyNode n = it.next(); + if ( target_external_nodes.contains( n ) ) { + throw new IllegalArgumentException( "node [" + n.toString() + "] of target is not unique" ); + } + target_external_nodes.add( n ); + } + } + _data = new TreeMap>(); + _positive_counts = new HashMap(); + _strict = strict; + decompose( evaluator, target_external_nodes ); + } + + /** + * If strict is true, target nodes (all external nodes of the phylogeny for + * which support values are to be calculated) are not used for anything during construction.
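+ * Informally, each column of the split matrix corresponds to one branch of the evaluator topology and + * records which external nodes lie on that branch's side of the corresponding split.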
+ * + * + * @param target + * @param evaluator + * @param strict + */ + public TreeSplitMatrix( final Phylogeny evaluator, + final boolean strict, + final Set target_external_nodes ) { + if ( !strict && ( ( target_external_nodes == null ) || target_external_nodes.isEmpty() ) ) { + throw new IllegalArgumentException( "target nodes list must not be null or empty if non-strict evaluation is expected" ); + } + _data = new TreeMap>(); + _positive_counts = new HashMap(); + _strict = strict; + decompose( evaluator, target_external_nodes ); + } + + private boolean contains( final PhylogenyNode node ) { + return _data.keySet().contains( node ); + } + + private void decompose( final Phylogeny phy, final Set target_external_nodes ) { + setUpKeys( phy, target_external_nodes ); + setUpValues( phy, target_external_nodes ); + sanityCheck(); + } + + private int getNumberOfTrueValuesAt( final int index ) { + if ( _positive_counts.containsKey( index ) ) { + return _positive_counts.get( index ); + } + return 0; + } + + private boolean getValue( final PhylogenyNode node, final int index ) { + if ( _data.containsKey( node ) ) { + return _data.get( node ).get( index ); + } + return false; + } + + private char getValueAsChar( final PhylogenyNode node, final int index ) { + if ( getValue( node, index ) ) { + return '.'; + } + else { + return ' '; + } + } + + private Set keySet() { + return _data.keySet(); + } + + public boolean match( final Set query_nodes ) { + final Set my_query_nodes = query_nodes; + if ( _strict ) { + if ( !keySet().containsAll( my_query_nodes ) ) { + throw new IllegalArgumentException( "external nodes of target and evaluator do not match" ); + } + } + //else { + //THIS IS WRONG + // my_query_nodes.retainAll( keySet() ); + //} + for( int i = 0; i < size(); ++i ) { + if ( match( my_query_nodes, i ) ) { + return true; + } + } + return false; + } + + private boolean match( final Set query_nodes, final int i ) { + final int counts = getNumberOfTrueValuesAt( i ); + final int q_counts = query_nodes.size(); + boolean positive_matches = true; + boolean negative_matches = true; + if ( q_counts != counts ) { + positive_matches = false; + } + if ( q_counts != keySet().size() - counts ) { + negative_matches = false; + } + if ( !positive_matches && !negative_matches ) { + return false; + } + for( final PhylogenyNode query_node : query_nodes ) { + if ( !contains( query_node ) ) { + if ( _strict ) { + //TODO remove me after testing + throw new RuntimeException( "this should not have happened, for query " + query_node + ":\n" + + toString() ); + } + else { + return false; //TODO really?!?!?
+ } + } + if ( getValue( query_node, i ) ) { + negative_matches = false; + } + else { + positive_matches = false; + } + if ( !positive_matches && !negative_matches ) { + return false; + } + } + return true; + } + + private void sanityCheck() { + int size = -1; + for( final PhylogenyNode key : keySet() ) { + if ( size < 0 ) { + size = size( key ); + } + else if ( size != size( key ) ) { + throw new RuntimeException( "this should not have happened: failed to build split matrix" ); + } + } + } + + private void setUpKeys( final Phylogeny phy, final Set target_external_nodes ) { + for( final PhylogenyNodeIterator it = phy.iteratorExternalForward(); it.hasNext(); ) { + final PhylogenyNode n = it.next(); + if ( _strict || target_external_nodes.contains( n ) ) { + if ( _data.containsKey( n ) ) { + throw new IllegalArgumentException( "node '" + n.toString() + "' of evaluator is not unique" ); + } + _data.put( n, new ArrayList() ); + } + } + } + + private void setUpValues( final Phylogeny phy, final Set target_external_nodes ) { + int index = 0; + for( final PhylogenyNodeIterator it = phy.iteratorPreorder(); it.hasNext(); ) { + final PhylogenyNode node = it.next(); + final List current_ext_descs = node.getAllExternalDescendants(); + for( final PhylogenyNode key : keySet() ) { + //if ( _strict || target_external_nodes.contains( key ) ) { + if ( current_ext_descs.contains( key ) ) { + _data.get( key ).add( index, true ); + if ( !_positive_counts.containsKey( index ) ) { + _positive_counts.put( index, 1 ); + } + else { + _positive_counts.put( index, _positive_counts.get( index ) + 1 ); + } + } + else { + _data.get( key ).add( index, false ); + } + //} + } + index++; + } + } + + private int size() { + for( final PhylogenyNode key : keySet() ) { + return size( key ); + } + return 0; + } + + private int size( final PhylogenyNode node ) { + return _data.get( node ).size(); + } + + @Override + public String toString() { + final StringBuffer sb = new StringBuffer(); + for( final PhylogenyNode key : keySet() ) { + sb.append( key.getName() ); + sb.append( ":" ); + for( int i = 0; i < size( key ); ++i ) { + sb.append( " " ); + sb.append( getValueAsChar( key, i ) ); + } + sb.append( "\n" ); + } + return sb.toString(); + } +} diff --git a/forester/java/src/org/forester/util/AsciiHistogram.java b/forester/java/src/org/forester/util/AsciiHistogram.java new file mode 100644 index 0000000..07d3da8 --- /dev/null +++ b/forester/java/src/org/forester/util/AsciiHistogram.java @@ -0,0 +1,127 @@ +// $Id: +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.util; + +public class AsciiHistogram { + + final private DescriptiveStatistics _stats; + final private String _title; + + public AsciiHistogram( final DescriptiveStatistics stats ) { + _stats = stats; + _title = ""; + } + + public AsciiHistogram( final DescriptiveStatistics stats, final String title ) { + _stats = stats; + _title = title; + } + + private void drawToStringBuffer( final double min, + final char symbol, + final int size, + final int digits, + final StringBuffer sb, + final int[] bins, + final int max_count, + final int under, + final int over, + final double binning_factor ) { + final double draw_factor = ( double ) max_count / size; + final int counts_size = ForesterUtil.roundToInt( Math.log10( max_count ) ) + 1; + if ( !ForesterUtil.isEmpty( getTitle() ) ) { + sb.append( getTitle() ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + } + if ( under > 0 ) { + sb.append( "[" + under + "] " ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + } + for( int i = 0; i < bins.length; ++i ) { + final int count = bins[ i ]; + final double label = ForesterUtil.round( ( min + i * ( 1.0 / binning_factor ) ), digits ); + sb.append( ForesterUtil.pad( label + "", digits, '0', false ) ); + sb.append( " [" + ForesterUtil.pad( count + "", counts_size, ' ', true ) + "] " ); + final int s = ForesterUtil.roundToInt( count / draw_factor ); + for( int j = 0; j < s; ++j ) { + sb.append( symbol ); + } + sb.append( ForesterUtil.LINE_SEPARATOR ); + } + if ( over > 0 ) { + sb.append( "[" + over + "] " ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + } + } + + private DescriptiveStatistics getDescriptiveStatistics() { + return _stats; + } + + private String getTitle() { + return _title; + } + + public StringBuffer toStringBuffer( final double min, + final double max, + final int number_of_bins, + final char symbol, + final int size, + final int digits ) { + if ( min >= max ) { + throw new IllegalArgumentException( "min [" + min + "] is larger than or equal to max [" + max + "]" ); + } + if ( number_of_bins < 3 ) { + throw new IllegalArgumentException( "number of bins is smaller than 3" ); + } + if ( size < 2 ) { + throw new IllegalArgumentException( "size is smaller than 2" ); + } + final StringBuffer sb = new StringBuffer(); + int max_count = 0; + final double binning_factor = number_of_bins / ( max - min ); + final int[] bins = BasicDescriptiveStatistics + .performBinning( getDescriptiveStatistics().getDataAsDoubleArray(), min, max, number_of_bins ); + for( final int bin : bins ) { + if ( bin > max_count ) { + max_count = bin; + } + } + drawToStringBuffer( min, symbol, size, digits, sb, bins, max_count, 0, 0, binning_factor ); + return sb; + } + + public StringBuffer toStringBuffer( final int bins, final char symbol, final int size, final int digits ) { + return toStringBuffer( getDescriptiveStatistics().getMin(), + getDescriptiveStatistics().getMax(), + bins, + symbol, + size, + digits ); + } +} diff --git a/forester/java/src/org/forester/util/BasicDescriptiveStatistics.java b/forester/java/src/org/forester/util/BasicDescriptiveStatistics.java new file mode 100644 index 0000000..699526f --- /dev/null +++ 
b/forester/java/src/org/forester/util/BasicDescriptiveStatistics.java @@ -0,0 +1,340 @@ +// $Id: +// $ +// +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.util; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class BasicDescriptiveStatistics implements DescriptiveStatistics { + + private List _data; + private double _sum; + private double _min; + private double _max; + private double _sigma; + private boolean _recalc_sigma; + + public BasicDescriptiveStatistics() { + init(); + } + + /* (non-Javadoc) + * @see org.forester.util.DescriptiveStatisticsI#addValue(double) + */ + public void addValue( final double d ) { + _recalc_sigma = true; + _sum += d; + _data.add( new Double( d ) ); + if ( d < _min ) { + _min = d; + } + if ( d > _max ) { + _max = d; + } + } + + /* (non-Javadoc) + * @see org.forester.util.DescriptiveStatisticsI#arithmeticMean() + */ + public double arithmeticMean() { + validate(); + return getSum() / getN(); + } + + /* (non-Javadoc) + * @see org.forester.util.DescriptiveStatisticsI#asSummary() + */ + public String asSummary() { + if ( getN() > 1 ) { + return arithmeticMean() + DescriptiveStatistics.PLUS_MINUS + sampleStandardDeviation() + " [" + getMin() + + "..." 
+ getMax() + "]"; + } + else { + return "" + arithmeticMean(); + } + } + + /* (non-Javadoc) + * @see org.forester.util.DescriptiveStatisticsI#coefficientOfVariation() + */ + public double coefficientOfVariation() { + validate(); + return ( sampleStandardDeviation() / arithmeticMean() ); + } + + /* (non-Javadoc) + * @see org.forester.util.DescriptiveStatisticsI#getDataAsDoubleArray() + */ + public double[] getDataAsDoubleArray() { + validate(); + final double[] data_array = new double[ getN() ]; + for( int i = 0; i < getN(); ++i ) { + data_array[ i ] = getValue( i ); + } + return data_array; + } + + /* (non-Javadoc) + * @see org.forester.util.DescriptiveStatisticsI#getMax() + */ + public double getMax() { + validate(); + return _max; + } + + /* (non-Javadoc) + * @see org.forester.util.DescriptiveStatisticsI#getMin() + */ + public double getMin() { + validate(); + return _min; + } + + /* (non-Javadoc) + * @see org.forester.util.DescriptiveStatisticsI#getN() + */ + public int getN() { + return _data.size(); + } + + /* (non-Javadoc) + * @see org.forester.util.DescriptiveStatisticsI#getSum() + */ + public double getSum() { + validate(); + return _sum; + } + + /* (non-Javadoc) + * @see org.forester.util.DescriptiveStatisticsI#getSummaryAsString() + */ + public String getSummaryAsString() { + validate(); + final double mean = arithmeticMean(); + final double sd = sampleStandardDeviation(); + return "" + mean + ( ( char ) 177 ) + sd + " [" + getMin() + "..." + getMax() + "]"; + } + + /* (non-Javadoc) + * @see org.forester.util.DescriptiveStatisticsI#getValue(int) + */ + public double getValue( final int index ) { + validate(); + return ( ( ( _data.get( index ) ) ).doubleValue() ); + } + + private void init() { + _data = new ArrayList(); + _sum = 0.0; + _min = Double.MAX_VALUE; + _max = -Double.MAX_VALUE; + _sigma = 0.0; + _recalc_sigma = true; + } + + /* (non-Javadoc) + * @see org.forester.util.DescriptiveStatisticsI#median() + */ + public double median() { + validate(); + double median = 0.0; + if ( getN() == 1 ) { + median = getValue( 0 ); + } + else { + final int index = ( getN() / 2 ); + final double[] data_array = getDataAsDoubleArray(); + Arrays.sort( data_array ); + if ( ( ( data_array.length ) % 2 ) == 0 ) { + // even number of data values + median = ( data_array[ index - 1 ] + data_array[ index ] ) / 2.0; + } + else { + median = data_array[ index ]; + } + } + return median; + } + + /* (non-Javadoc) + * @see org.forester.util.DescriptiveStatisticsI#midrange() + */ + public double midrange() { + validate(); + return ( _min + _max ) / 2.0; + } + + /* (non-Javadoc) + * @see org.forester.util.DescriptiveStatisticsI#pearsonianSkewness() + */ + public double pearsonianSkewness() { + validate(); + final double mean = arithmeticMean(); + final double median = median(); + final double sd = sampleStandardDeviation(); + return ( ( 3 * ( mean - median ) ) / sd ); + } + + /* (non-Javadoc) + * @see org.forester.util.DescriptiveStatisticsI#sampleStandardDeviation() + */ + public double sampleStandardDeviation() { + return Math.sqrt( sampleVariance() ); + } + + /* (non-Javadoc) + * @see org.forester.util.DescriptiveStatisticsI#sampleStandardUnit(double) + */ + public double sampleStandardUnit( final double value ) { + validate(); + return BasicDescriptiveStatistics.sampleStandardUnit( value, arithmeticMean(), sampleStandardDeviation() ); + } + + /* (non-Javadoc) + * @see org.forester.util.DescriptiveStatisticsI#sampleVariance() + */ + public double sampleVariance() { + validate(); + if ( getN() < 2 ) { + 
throw new ArithmeticException( "attempt to calculate sample variance for less then two values" ); + } + return ( sumDeviations() / ( getN() - 1 ) ); + } + + /* (non-Javadoc) + * @see org.forester.util.DescriptiveStatisticsI#standardErrorOfMean() + */ + public double standardErrorOfMean() { + validate(); + return ( sampleStandardDeviation() / Math.sqrt( getN() ) ); + } + + /* (non-Javadoc) + * @see org.forester.util.DescriptiveStatisticsI#sumDeviations() + */ + public double sumDeviations() { + validate(); + if ( _recalc_sigma ) { + _recalc_sigma = false; + _sigma = 0.0; + final double mean = arithmeticMean(); + for( int i = 0; i < getN(); ++i ) { + _sigma += Math.pow( ( getValue( i ) - mean ), 2 ); + } + } + return _sigma; + } + + /* (non-Javadoc) + * @see org.forester.util.DescriptiveStatisticsI#toString() + */ + @Override + public String toString() { + if ( getN() < 1 ) { + return "empty data set statistics"; + } + final StringBuffer sb = new StringBuffer(); + sb.append( "Descriptive statistics:" ); + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "n : " + getN() ); + if ( getN() > 1 ) { + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "min : " + getMin() ); + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "max : " + getMax() ); + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "midrange : " + midrange() ); + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "median : " + median() ); + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "mean : " + arithmeticMean() ); + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "sd : " + sampleStandardDeviation() ); + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "variance : " + sampleVariance() ); + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "standard error of mean : " + standardErrorOfMean() ); + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "coefficient of variation: " + coefficientOfVariation() ); + sb.append( ForesterUtil.getLineSeparator() ); + sb.append( "pearsonian skewness : " + pearsonianSkewness() ); + } + return sb.toString(); + } + + private void validate() throws ArithmeticException { + if ( getN() < 1 ) { + throw new ArithmeticException( "attempt to get a result from empty data set statistics" ); + } + } + + public static int[] performBinning( final double[] values, + final double min, + final double max, + final int number_of_bins ) { + if ( min >= max ) { + throw new IllegalArgumentException( "min [" + min + "] is larger than or equal to max [" + max + "]" ); + } + if ( number_of_bins < 3 ) { + throw new IllegalArgumentException( "number of bins is smaller than 3" ); + } + final int[] bins = new int[ number_of_bins ]; + final double binning_factor = number_of_bins / ( max - min ); + final int last_index = number_of_bins - 1; + for( final double d : values ) { + if ( !( ( d > max ) || ( d < min ) ) ) { + final int bin = ( int ) ( ( d - min ) * binning_factor ); + if ( bin > last_index ) { + ++bins[ last_index ]; + } + else { + ++bins[ bin ]; + } + } + } + return bins; + } + + /** + * Computes the sample standard unit (z-score). Used to compute 'value' in + * terms of standard units. Note that 'value', 'mean' and 'sd' must be all + * from the same sample data. + * + * @param value + * a double in the sample for which + * @param mean + * the mean of the sample. + * @param sd + * The standard deviation of the sample. 
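+ * (For example, a value of 12.0 in a sample with mean 10.0 and standard deviation 2.0 has a standard unit of 1.0.)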
+ * @return 'value' in terms of standard units + */ + public static double sampleStandardUnit( final double value, final double mean, final double sd ) { + return ( value - mean ) / sd; + } +} diff --git a/forester/java/src/org/forester/util/BasicTable.java b/forester/java/src/org/forester/util/BasicTable.java new file mode 100644 index 0000000..2469129 --- /dev/null +++ b/forester/java/src/org/forester/util/BasicTable.java @@ -0,0 +1,188 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org + +package org.forester.util; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +public class BasicTable { + + private Map> _rows; + private int _max_row; + private int _max_col; + + public BasicTable() { + init(); + } + + public Map getColumnsAsMap( final int key_col, final int value_col ) throws IllegalArgumentException { + final Map map = new HashMap(); + for( int row = 0; row < getNumberOfRows(); ++row ) { + final String key = ( String ) getValue( key_col, row ); + final E value = getValue( value_col, row ); + if ( ( key != null ) && ( value != null ) ) { + if ( map.containsKey( key ) ) { + throw new IllegalArgumentException( "attempt to use non-unique table value as key [" + key + "]" ); + } + map.put( key, value ); + } + } + return map; + } + + public Map getColumnsAsMapDouble( final int key_col, final int value_col ) + throws IllegalArgumentException, IOException { + final Map map = new HashMap(); + for( int row = 0; row < getNumberOfRows(); ++row ) { + final String key = ( String ) getValue( key_col, row ); + double value = 0; + try { + value = Double.parseDouble( getValueAsString( value_col, row ) ); + } + catch ( final NumberFormatException e ) { + throw new IOException( e ); + } + if ( key != null ) { + if ( map.containsKey( key ) ) { + throw new IllegalArgumentException( "attempt to use non-unique table value as key [" + key + "]" ); + } + map.put( key, value ); + } + } + return map; + } + + // Returns -1 if not found, IllegalArgumentException if not unique. 
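+ // The lookup matches on the value of the first column (column 0).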
+ public int findRow( final String first_col_value ) throws IllegalArgumentException { + int result = -1; + for( int i = 0; i < this.getNumberOfRows(); ++i ) { + if ( getValueAsString( 0, i ).equals( first_col_value ) ) { + if ( result >= 0 ) { + throw new IllegalArgumentException( "\"" + first_col_value + "\" is not unique" ); + } + result = i; + } + } + return result; + } + + public int getNumberOfColumns() { + return _max_col + 1; + } + + public int getNumberOfRows() { + return _max_row + 1; + } + + private Map getRow( final int row ) { + return getRows().get( "" + row ); + } + + private Map> getRows() { + return _rows; + } + + public E getValue( final int col, final int row ) throws IllegalArgumentException { + if ( ( row > getNumberOfRows() - 1 ) || ( row < 0 ) ) { + throw new IllegalArgumentException( "value for row (" + row + ") is out of range [number of rows: " + + getNumberOfRows() + "]" ); + } + else if ( ( col >= getNumberOfColumns() ) || ( row < 0 ) ) { + throw new IllegalArgumentException( "value for column (" + col + ") is out of range [number of columns: " + + getNumberOfColumns() + "]" ); + } + final Map row_map = getRow( row ); + if ( ( row_map == null ) || ( row_map.size() < 1 ) ) { + return null; + } + return row_map.get( "" + col ); + } + + public String getValueAsString( final int col, final int row ) throws IllegalArgumentException { + if ( getValue( col, row ) != null ) { + return getValue( col, row ).toString(); + } + return null; + } + + private void init() { + _rows = new HashMap>(); + setMaxCol( -1 ); + setMaxRow( -1 ); + } + + public boolean isEmpty() { + return getNumberOfRows() <= 0; + } + + private void setMaxCol( final int max_col ) { + _max_col = max_col; + } + + private void setMaxRow( final int max_row ) { + _max_row = max_row; + } + + public void setValue( final int col, final int row, final E value ) { + if ( ( row < 0 ) || ( col < 0 ) ) { + throw new IllegalArgumentException( "attempt to use negative values for row or column" ); + } + if ( row > getNumberOfRows() - 1 ) { + setMaxRow( row ); + } + if ( col > getNumberOfColumns() - 1 ) { + setMaxCol( col ); + } + final String row_key = "" + row; + Map row_map = null; + if ( getRows().containsKey( row_key ) ) { + row_map = getRows().get( row_key ); + } + else { + row_map = new HashMap(); + getRows().put( row_key, row_map ); + } + row_map.put( "" + col, value ); + } + + @Override + public String toString() { + final StringBuffer sb = new StringBuffer(); + for( int row = 0; row < getNumberOfRows(); ++row ) { + for( int col = 0; col < getNumberOfColumns(); ++col ) { + sb.append( getValue( col, row ) ); + if ( col < getNumberOfColumns() - 1 ) { + sb.append( " " ); + } + } + if ( row < getNumberOfRows() - 1 ) { + sb.append( ForesterUtil.LINE_SEPARATOR ); + } + } + return sb.toString(); + } +} diff --git a/forester/java/src/org/forester/util/BasicTableParser.java b/forester/java/src/org/forester/util/BasicTableParser.java new file mode 100644 index 0000000..9ebae2b --- /dev/null +++ b/forester/java/src/org/forester/util/BasicTableParser.java @@ -0,0 +1,108 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. 
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.util; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.StringTokenizer; + +public class BasicTableParser { + + private final static String START_OF_COMMENT_LINE_DEFAULT = "#"; + + private BasicTableParser() { + } + + public static BasicTable parse( final Object source, final String column_delimiter ) throws IOException { + return BasicTableParser.parse( source, column_delimiter, false, START_OF_COMMENT_LINE_DEFAULT, false ).get( 0 ); + } + + public static BasicTable parse( final Object source, + final String column_delimiter, + final boolean use_first_separator_only ) throws IOException { + return BasicTableParser.parse( source, + column_delimiter, + use_first_separator_only, + START_OF_COMMENT_LINE_DEFAULT, + false ).get( 0 ); + } + + public static List> parse( final Object source, + final String column_delimiter, + final boolean use_first_separator_only, + final String start_of_comment_line, + final boolean tables_separated_by_single_string_line ) + throws IOException { + final BufferedReader reader = ForesterUtil.obtainReader( source ); + final List> tables = new ArrayList>(); + BasicTable table = new BasicTable(); + int row = 0; + String line; + boolean saw_first_table = false; + final boolean use_start_of_comment_line = !( ForesterUtil.isEmpty( start_of_comment_line ) ); + while ( ( line = reader.readLine() ) != null ) { + line = line.trim(); + if ( saw_first_table + && ( ForesterUtil.isEmpty( line ) || ( tables_separated_by_single_string_line && ( line + .indexOf( column_delimiter ) < 0 ) ) ) ) { + if ( !table.isEmpty() ) { + tables.add( table ); + } + table = new BasicTable(); + row = 0; + } + else if ( !ForesterUtil.isEmpty( line ) + && ( !use_start_of_comment_line || !line.startsWith( start_of_comment_line ) ) ) { + saw_first_table = true; + final StringTokenizer st = new StringTokenizer( line, column_delimiter ); + int col = 0; + if ( st.hasMoreTokens() ) { + table.setValue( col++, row, st.nextToken().trim() ); + } + if ( !use_first_separator_only ) { + while ( st.hasMoreTokens() ) { + table.setValue( col++, row, st.nextToken().trim() ); + } + } + else { + final StringBuffer rest = new StringBuffer(); + while ( st.hasMoreTokens() ) { + rest.append( st.nextToken() ); + } + table.setValue( col++, row, rest.toString().trim() ); + } + ++row; + } + } + if ( !table.isEmpty() ) { + tables.add( table ); + } + reader.close(); + return tables; + } +} diff --git a/forester/java/src/org/forester/util/CommandLineArguments.java 
b/forester/java/src/org/forester/util/CommandLineArguments.java new file mode 100644 index 0000000..0fc0485 --- /dev/null +++ b/forester/java/src/org/forester/util/CommandLineArguments.java @@ -0,0 +1,281 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.util; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +public final class CommandLineArguments { + + private final static String OPTIONS_PREFIX = "-"; + private final static String EXTENDED_OPTIONS_PREFIX = "--"; + private final static String OPTIONS_SEPARATOR = "="; + private Map _options; + private Map _extended_options; + private List _names; + private String _command_line_str; + + public CommandLineArguments( final String[] args ) throws IOException { + init(); + parseCommandLineArguments( args ); + } + + private Map getAllOptions() { + final Map o = new HashMap(); + o.putAll( getOptionsList() ); + o.putAll( getExtendedOptionsList() ); + return o; + } + + public String getCommandLineArgsAsString() { + return _command_line_str; + } + + private Map getExtendedOptionsList() { + return _extended_options; + } + + public File getFile( final int i ) { + return new File( getNames()[ i ] ); + } + + public String getName( final int i ) { + return getNames()[ i ]; + } + + public String[] getNames() { + final String[] a = new String[ getNamesList().size() ]; + return getNamesList().toArray( a ); + } + + private List getNamesList() { + return _names; + } + + public int getNumberOfNames() { + return getNames().length; + } + + private Map getOptionsList() { + return _options; + } + + public String getOptionValue( final String option_name ) throws IllegalArgumentException { + final Map o = getAllOptions(); + if ( o.containsKey( option_name ) ) { + final String value = o.get( option_name ); + if ( !ForesterUtil.isEmpty( value ) ) { + return value; + } + else { + throw new IllegalArgumentException( "value for \"" + option_name + "\" is not set" ); + } + } + else { + throw new IllegalArgumentException( "option \"" + option_name + "\" is not set" ); + } + } + + /** + * Removes quotes + * + */ + public String getOptionValueAsCleanString( final String option_name ) throws IllegalArgumentException { + return getOptionValue( option_name ).replaceAll( "\"", "" ).replaceAll( "\'", "" ); + } + + public double getOptionValueAsDouble( final String option_name ) throws IOException 
{ + double d = -Double.MAX_VALUE; + try { + d = new Double( getOptionValue( option_name ) ).doubleValue(); + } + catch ( final NumberFormatException e ) { + throw new IOException( "value for option \"" + option_name + "\" is expected to be of type double" ); + } + return d; + } + + public int getOptionValueAsInt( final String option_name ) throws IOException { + int i = Integer.MIN_VALUE; + try { + i = new Integer( getOptionValue( option_name ) ).intValue(); + } + catch ( final NumberFormatException e ) { + throw new IOException( "value for option \"" + option_name + "\" is expected to be of type integer" ); + } + return i; + } + + public long getOptionValueAsLong( final String option_name ) throws IOException { + long l = Long.MIN_VALUE; + try { + l = new Long( getOptionValue( option_name ) ).longValue(); + } + catch ( final NumberFormatException e ) { + throw new IOException( "value for option \"" + option_name + "\" is expected to be of type long" ); + } + return l; + } + + private void init() { + _options = new HashMap(); + _extended_options = new HashMap(); + _names = new ArrayList(); + _command_line_str = ""; + } + + public boolean isOptionHasAValue( final String option_name ) { + final Map o = getAllOptions(); + if ( o.containsKey( option_name ) ) { + final String value = o.get( option_name ); + return ( !ForesterUtil.isEmpty( value ) ); + } + else { + throw new IllegalArgumentException( "option \"" + option_name + "\" is not set" ); + } + } + + public boolean isOptionSet( final String option_name ) { + final Map o = getAllOptions(); + return ( o.containsKey( option_name ) ); + } + + public boolean isOptionValueSet( final String option_name ) throws IllegalArgumentException { + final Map o = getAllOptions(); + if ( o.containsKey( option_name ) ) { + return !( ForesterUtil.isEmpty( o.get( option_name ) ) ); + } + else { + throw new IllegalArgumentException( "option \"" + option_name + "\" is not set" ); + } + } + + private void parseCommandLineArguments( final String[] args ) throws IOException { + for( int i = 0; i < args.length; ++i ) { + final String arg = args[ i ].trim(); + _command_line_str += arg; + if ( i < args.length - 1 ) { + _command_line_str += " "; + } + if ( arg.startsWith( CommandLineArguments.EXTENDED_OPTIONS_PREFIX ) ) { + parseOption( arg.substring( CommandLineArguments.EXTENDED_OPTIONS_PREFIX.length() ), + getExtendedOptionsList() ); + } + else if ( arg.startsWith( CommandLineArguments.OPTIONS_PREFIX ) ) { + parseOption( arg.substring( CommandLineArguments.OPTIONS_PREFIX.length() ), getOptionsList() ); + } + else { + getNamesList().add( arg ); + } + } + } + + private void parseOption( final String option, final Map options_map ) throws IOException { + final int sep_index = option.indexOf( CommandLineArguments.OPTIONS_SEPARATOR ); + if ( sep_index < 1 ) { + if ( ForesterUtil.isEmpty( option ) ) { + throw new IOException( "attempt to set option with an empty name" ); + } + if ( getAllOptions().containsKey( option ) ) { + throw new IOException( "attempt to set option \"" + option + "\" mutiple times" ); + } + options_map.put( option, null ); + } + else { + final String key = option.substring( 0, sep_index ); + final String value = option.substring( sep_index + 1 ); + if ( ForesterUtil.isEmpty( key ) ) { + throw new IllegalArgumentException( "attempt to set option with an empty name" ); + } + // if ( ForesterUtil.isEmpty( value ) ) { + // throw new IllegalArgumentException( "attempt to set option with an empty value" ); + // } + if ( getAllOptions().containsKey( key ) 
) { + throw new IllegalArgumentException( "attempt to set option \"" + key + "\" mutiple times [" + option + + "]" ); + } + options_map.put( key, value ); + } + } + + public List validateAllowedOptions( final List allowed_options ) { + final Map options = getAllOptions(); + final List dissallowed = new ArrayList(); + for( final String o : options.keySet() ) { + if ( !allowed_options.contains( o ) ) { + dissallowed.add( o ); + } + } + return dissallowed; + } + + public String validateAllowedOptionsAsString( final List allowed_options ) { + final List dissallowed = validateAllowedOptions( allowed_options ); + String dissallowed_string = ""; + for( final Iterator iter = dissallowed.iterator(); iter.hasNext(); ) { + dissallowed_string += "\"" + iter.next(); + if ( iter.hasNext() ) { + dissallowed_string += "\", "; + } + else { + dissallowed_string += "\""; + } + } + return dissallowed_string; + } + + public List validateMandatoryOptions( final List mandatory_options ) { + final Map options = getAllOptions(); + final List missing = new ArrayList(); + for( final String string : mandatory_options ) { + final String ma = string; + if ( !options.containsKey( ma ) ) { + missing.add( ma ); + } + } + return missing; + } + + public String validateMandatoryOptionsAsString( final List mandatory_options ) { + final List missing = validateMandatoryOptions( mandatory_options ); + String missing_string = ""; + for( final Iterator iter = missing.iterator(); iter.hasNext(); ) { + missing_string += "\"" + iter.next(); + if ( iter.hasNext() ) { + missing_string += "\", "; + } + else { + missing_string += "\""; + } + } + return missing_string; + } +} diff --git a/forester/java/src/org/forester/util/CommandProcessBuilder.java b/forester/java/src/org/forester/util/CommandProcessBuilder.java new file mode 100644 index 0000000..054f12c --- /dev/null +++ b/forester/java/src/org/forester/util/CommandProcessBuilder.java @@ -0,0 +1,81 @@ +// $Id: +// forester -- software libraries and applications +// for genomics and evolutionary biology research. +// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.util; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; + +public class CommandProcessBuilder { + + public static Process execute( final List command, final File working_dir ) throws InterruptedException, + IOException { + final ProcessBuilder builder = new ProcessBuilder( command ); + if ( working_dir != null ) { + if ( !working_dir.exists() ) { + throw new IllegalArgumentException( "directory [" + working_dir.getAbsolutePath() + "] does not exist" ); + } + if ( !working_dir.isDirectory() ) { + throw new IllegalArgumentException( "[" + working_dir.getAbsolutePath() + "] is not a directory" ); + } + if ( !working_dir.canWrite() ) { + throw new IllegalArgumentException( "cannot write to [" + working_dir.getAbsolutePath() + "]" ); + } + builder.directory( working_dir ); + } + final Process process = builder.start(); + return process; + } + + public static void main( final String args[] ) { + final List command = new ArrayList(); + command.add( System.getenv( "windir" ) + "\\system32\\" + "tree.com" ); + command.add( "/A" ); + Process p; + System.out.println( "Directory : " + System.getenv( "temp" ) ); + try { + p = CommandProcessBuilder.execute( command, new File( System.getenv( "temp" ) ) ); + final InputStream is = p.getInputStream(); + final InputStreamReader isr = new InputStreamReader( is ); + final BufferedReader br = new BufferedReader( isr ); + String line; + while ( ( line = br.readLine() ) != null ) { + System.out.println( line ); + } + System.out.println( "OK." ); + } + catch ( final InterruptedException e ) { + e.printStackTrace(); + } + catch ( final IOException e ) { + e.printStackTrace(); + } + } +} diff --git a/forester/java/src/org/forester/util/DescriptiveStatistics.java b/forester/java/src/org/forester/util/DescriptiveStatistics.java new file mode 100644 index 0000000..83b2f4f --- /dev/null +++ b/forester/java/src/org/forester/util/DescriptiveStatistics.java @@ -0,0 +1,83 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.util; + +public interface DescriptiveStatistics { + + public final static String PLUS_MINUS = "" + ( char ) 177; + + public abstract void addValue( final double d ); + + public abstract double arithmeticMean(); + + public abstract String asSummary(); + + /** + * Computes the coefficient of variation. Used to express standard deviation + * independent of units of measure. + * + * @return + */ + public abstract double coefficientOfVariation(); + + public abstract double[] getDataAsDoubleArray(); + + public abstract double getMax(); + + public abstract double getMin(); + + public abstract int getN(); + + public abstract double getSum(); + + public abstract String getSummaryAsString(); + + public abstract double getValue( final int index ); + + public abstract double median(); + + public abstract double midrange(); + + /** + * Determines relationship between the mean and the median. This reflects + * how the data differs from the normal bell shaped distribution. + * + * @return + */ + public abstract double pearsonianSkewness(); + + public abstract double sampleStandardDeviation(); + + public abstract double sampleStandardUnit( final double value ); + + public abstract double sampleVariance(); + + public abstract double standardErrorOfMean(); + + public abstract double sumDeviations(); + + public abstract String toString(); +} \ No newline at end of file diff --git a/forester/java/src/org/forester/util/ExternalProgram.java b/forester/java/src/org/forester/util/ExternalProgram.java new file mode 100644 index 0000000..54aafe8 --- /dev/null +++ b/forester/java/src/org/forester/util/ExternalProgram.java @@ -0,0 +1,124 @@ +// $Id: +// forester -- software libraries and applications +// for genomics and evolutionary biology research. +// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.util; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +public class ExternalProgram { + + public static boolean isExecuteableFile( final File path_to_cmd_f ) { + if ( !path_to_cmd_f.exists() ) { + return false; + } + else if ( path_to_cmd_f.isDirectory() ) { + return false; + } + else if ( !path_to_cmd_f.canExecute() ) { + return false; + } + return true; + } + private Process _process; + private final String _path_to_cmd; + + public ExternalProgram( final String path_to_cmd ) { + final File path_to_cmd_f = new File( path_to_cmd ); + checkCmdFile( path_to_cmd_f ); + _path_to_cmd = path_to_cmd_f.getAbsolutePath(); + } + + private void checkCmdFile( final File path_to_cmd_f ) { + if ( !path_to_cmd_f.exists() ) { + throw new IllegalArgumentException( "[" + path_to_cmd_f.getAbsolutePath() + "] does not exist" ); + } + else if ( path_to_cmd_f.isDirectory() ) { + throw new IllegalArgumentException( "[" + path_to_cmd_f.getAbsolutePath() + "] is a directory" ); + } + else if ( !path_to_cmd_f.canExecute() ) { + throw new IllegalArgumentException( "[" + path_to_cmd_f.getAbsolutePath() + "] is not executeable" ); + } + } + + public InputStream getErrorStream() { + return getProcess().getErrorStream(); + } + + public InputStream getInputStream() { + return getProcess().getInputStream(); + } + + public OutputStream getOutputStream() { + return getProcess().getOutputStream(); + } + + private String getPathToCmd() { + return _path_to_cmd; + } + + private Process getProcess() { + return _process; + } + + public Process launch( final String[] opts ) throws IOException, InterruptedException { + String[] cmd; + if ( ( opts == null ) || ( opts.length < 1 ) ) { + cmd = new String[ 1 ]; + } + else { + cmd = new String[ opts.length + 1 ]; + for( int i = 0; i < opts.length; i++ ) { + cmd[ i + 1 ] = opts[ i ]; + } + } + cmd[ 0 ] = getPathToCmd(); + System.out.println(); + for( final String element : cmd ) { + System.out.print( element + " " ); + } + System.out.println(); + setProcess( Runtime.getRuntime().exec( cmd ) ); + return getProcess(); + } + + private void setProcess( final Process process ) { + _process = process; + } + + public int waitFor() { + try { + return getProcess().waitFor(); + } + catch ( final InterruptedException e ) { + // TODO Auto-generated catch block + getProcess().destroy(); + e.printStackTrace(); + return -1; + } + } +} diff --git a/forester/java/src/org/forester/util/FailedConditionCheckException.java b/forester/java/src/org/forester/util/FailedConditionCheckException.java new file mode 100644 index 0000000..d297377 --- /dev/null +++ b/forester/java/src/org/forester/util/FailedConditionCheckException.java @@ -0,0 +1,43 @@ +// $Id: +// Exp $ +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.util; + +public class FailedConditionCheckException extends RuntimeException { + + /** + * + */ + private static final long serialVersionUID = -860013990231493438L; + + public FailedConditionCheckException() { + super(); + } + + public FailedConditionCheckException( final String message ) { + super( message ); + } +} diff --git a/forester/java/src/org/forester/util/ForesterConstants.java b/forester/java/src/org/forester/util/ForesterConstants.java new file mode 100644 index 0000000..3edec56 --- /dev/null +++ b/forester/java/src/org/forester/util/ForesterConstants.java @@ -0,0 +1,39 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2000-2009 Christian M. Zmasek +// Copyright (C) 2007-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.util; + +public final class ForesterConstants { + + public final static String PHYLO_XML_VERSION = "1.10"; + public final static String PHYLO_XML_LOCATION = "http://www.phyloxml.org"; + public final static String PHYLO_XML_XSD = "phyloxml.xsd"; + public final static String XML_SCHEMA_INSTANCE = "http://www.w3.org/2001/XMLSchema-instance"; + public final static String LOCAL_PHYLOXML_XSD_RESOURCE = "resources/phyloxml.xsd"; + public final static String PHYLO_XML_SUFFIX = ".xml"; + public final static String UTF8 = "UTF-8"; + public final static String PHYLO_XML_REFERENCE = "Han MV and Zmasek CM (2009): \"phyloXML: XML for evolutionary biology and comparative genomics\", BMC Bioinformatics 10:356"; + public final static boolean RELEASE = false; +} diff --git a/forester/java/src/org/forester/util/ForesterUtil.java b/forester/java/src/org/forester/util/ForesterUtil.java new file mode 100644 index 0000000..ada203b --- /dev/null +++ b/forester/java/src/org/forester/util/ForesterUtil.java @@ -0,0 +1,1245 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. 
Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.util; + +import java.awt.Color; +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.StringReader; +import java.math.BigDecimal; +import java.net.URL; +import java.text.DateFormat; +import java.text.DecimalFormat; +import java.text.DecimalFormatSymbols; +import java.text.NumberFormat; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.Hashtable; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.parsers.nexus.NexusPhylogeniesParser; +import org.forester.io.parsers.nhx.NHXParser; +import org.forester.io.parsers.phyloxml.PhyloXmlParser; +import org.forester.io.parsers.tol.TolParser; +import org.forester.phylogeny.Phylogeny; +import org.forester.phylogeny.PhylogenyMethods; +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Confidence; +import org.forester.phylogeny.data.Distribution; +import org.forester.phylogeny.data.Sequence; +import org.forester.phylogeny.data.Taxonomy; +import org.forester.phylogeny.iterators.PhylogenyNodeIterator; + +public final class ForesterUtil { + + public final static String FILE_SEPARATOR = System.getProperty( "file.separator" ); + public final static String LINE_SEPARATOR = System.getProperty( "line.separator" ); + public final static String JAVA_VENDOR = System.getProperty( "java.vendor" ); + public final static String JAVA_VERSION = System.getProperty( "java.version" ); + public final static String OS_ARCH = System.getProperty( "os.arch" ); + public final static String OS_NAME = System.getProperty( "os.name" ); + public final static String OS_VERSION = System.getProperty( "os.version" ); + public final static Pattern PARANTHESESABLE_NH_CHARS_PATTERN = Pattern.compile( "[(),;\\s]" ); + public final static double ZERO_DIFF = 1.0E-9; + public static final BigDecimal NULL_BD = new BigDecimal( 0 ); + public static final NumberFormat FORMATTER_9; + public static final NumberFormat FORMATTER_6; + public static final 
NumberFormat FORMATTER_06; + public static final NumberFormat FORMATTER_3; + static { + final DecimalFormatSymbols dfs = new DecimalFormatSymbols(); + dfs.setDecimalSeparator( '.' ); + // dfs.setGroupingSeparator( ( char ) 0 ); + FORMATTER_9 = new DecimalFormat( "#.#########", dfs ); + FORMATTER_6 = new DecimalFormat( "#.######", dfs ); + FORMATTER_06 = new DecimalFormat( "0.######", dfs ); + FORMATTER_3 = new DecimalFormat( "#.###", dfs ); + } + + private ForesterUtil() { + } + + final public static void appendSeparatorIfNotEmpty( final StringBuffer sb, final char separator ) { + if ( sb.length() > 0 ) { + sb.append( separator ); + } + } + + final public static boolean isEmpty( final List l ) { + if ( ( l == null ) || l.isEmpty() ) { + return true; + } + for( final Object o : l ) { + if ( o != null ) { + return false; + } + } + return true; + } + + final public static boolean isEmpty( final Set s ) { + if ( ( s == null ) || s.isEmpty() ) { + return true; + } + for( final Object o : s ) { + if ( o != null ) { + return false; + } + } + return true; + } + + /** + * This calculates a color. If value is equal to min the returned color is + * minColor, if value is equal to max the returned color is maxColor, + * otherwise a color 'proportional' to value is returned. + * + * @param value + * the value + * @param min + * the smallest value + * @param max + * the largest value + * @param minColor + * the color for min + * @param maxColor + * the color for max + * @return a Color + */ + final public static Color calcColor( double value, + final double min, + final double max, + final Color minColor, + final Color maxColor ) { + if ( value < min ) { + value = min; + } + if ( value > max ) { + value = max; + } + final double x = ForesterUtil.calculateColorFactor( value, max, min ); + final int red = ForesterUtil.calculateColorComponent( minColor.getRed(), maxColor.getRed(), x ); + final int green = ForesterUtil.calculateColorComponent( minColor.getGreen(), maxColor.getGreen(), x ); + final int blue = ForesterUtil.calculateColorComponent( minColor.getBlue(), maxColor.getBlue(), x ); + return new Color( red, green, blue ); + } + + /** + * This calculates a color. 
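+ * It clamps value into [min, max] and then linearly interpolates each RGB component,
+ * either between minColor and meanColor or between meanColor and maxColor.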
If value is equal to min the returned color is + * minColor, if value is equal to max the returned color is maxColor, if + * value is equal to mean the returned color is meanColor, otherwise a color + * 'proportional' to value is returned -- either between min-mean or + * mean-max + * + * @param value + * the value + * @param min + * the smallest value + * @param max + * the largest value + * @param mean + * the mean/median value + * @param minColor + * the color for min + * @param maxColor + * the color for max + * @param meanColor + * the color for mean + * @return a Color + */ + final public static Color calcColor( double value, + final double min, + final double max, + final double mean, + final Color minColor, + final Color maxColor, + final Color meanColor ) { + if ( value < min ) { + value = min; + } + if ( value > max ) { + value = max; + } + if ( value < mean ) { + final double x = ForesterUtil.calculateColorFactor( value, mean, min ); + final int red = ForesterUtil.calculateColorComponent( minColor.getRed(), meanColor.getRed(), x ); + final int green = ForesterUtil.calculateColorComponent( minColor.getGreen(), meanColor.getGreen(), x ); + final int blue = ForesterUtil.calculateColorComponent( minColor.getBlue(), meanColor.getBlue(), x ); + return new Color( red, green, blue ); + } + else if ( value > mean ) { + final double x = ForesterUtil.calculateColorFactor( value, max, mean ); + final int red = ForesterUtil.calculateColorComponent( meanColor.getRed(), maxColor.getRed(), x ); + final int green = ForesterUtil.calculateColorComponent( meanColor.getGreen(), maxColor.getGreen(), x ); + final int blue = ForesterUtil.calculateColorComponent( meanColor.getBlue(), maxColor.getBlue(), x ); + return new Color( red, green, blue ); + } + else { + return meanColor; + } + } + + /** + * Helper method for calcColor methods. + * + * @param smallercolor_component_x + * color component the smaller color + * @param largercolor_component_x + * color component the larger color + * @param x + * factor + * @return an int representing a color component + */ + final private static int calculateColorComponent( final double smallercolor_component_x, + final double largercolor_component_x, + final double x ) { + return ( int ) ( smallercolor_component_x + ( ( x * ( largercolor_component_x - smallercolor_component_x ) ) / 255.0 ) ); + } + + /** + * Helper method for calcColor methods. 
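+ * It maps value onto a 0-255 scale as 255 * ( value - smaller ) / ( larger - smaller ),
+ * which calculateColorComponent then uses to interpolate between two color components.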
+ * + * + * @param value + * the value + * @param larger + * the largest value + * @param smaller + * the smallest value + * @return a normalized value between larger and smaller + */ + final private static double calculateColorFactor( final double value, final double larger, final double smaller ) { + return ( 255.0 * ( value - smaller ) ) / ( larger - smaller ); + } + + final public static String collapseWhiteSpace( final String s ) { + return s.replaceAll( "[\\s]+", " " ); + } + + final public static String colorToHex( final Color color ) { + final String rgb = Integer.toHexString( color.getRGB() ); + return rgb.substring( 2, rgb.length() ); + } + + synchronized public static void copyFile( final File in, final File out ) throws IOException { + final FileInputStream in_s = new FileInputStream( in ); + final FileOutputStream out_s = new FileOutputStream( out ); + try { + final byte[] buf = new byte[ 1024 ]; + int i = 0; + while ( ( i = in_s.read( buf ) ) != -1 ) { + out_s.write( buf, 0, i ); + } + } + catch ( final IOException e ) { + throw e; + } + finally { + if ( in_s != null ) { + in_s.close(); + } + if ( out_s != null ) { + out_s.close(); + } + } + } + + final public static int countChars( final String str, final char c ) { + int count = 0; + for( int i = 0; i < str.length(); ++i ) { + if ( str.charAt( i ) == c ) { + ++count; + } + } + return count; + } + + final public static BufferedWriter createBufferedWriter( final File file ) throws IOException { + if ( file.exists() ) { + throw new IOException( "[" + file + "] already exists" ); + } + return new BufferedWriter( new FileWriter( file ) ); + } + + final public static BufferedWriter createBufferedWriter( final String name ) throws IOException { + return new BufferedWriter( new FileWriter( createFileForWriting( name ) ) ); + } + + final public static File createFileForWriting( final String name ) throws IOException { + final File file = new File( name ); + if ( file.exists() ) { + throw new IOException( "[" + name + "] already exists" ); + } + return file; + } + + final public static PhylogenyParser createParserDependingFileContents( final File file, + final boolean phyloxml_validate_against_xsd ) + throws FileNotFoundException, IOException { + PhylogenyParser parser = null; + final String first_line = ForesterUtil.getFirstLine( file ).trim().toLowerCase(); + if ( first_line.startsWith( "<" ) ) { + parser = new PhyloXmlParser(); + if ( phyloxml_validate_against_xsd ) { + final ClassLoader cl = PhyloXmlParser.class.getClassLoader(); + final URL xsd_url = cl.getResource( ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE ); + if ( xsd_url != null ) { + ( ( PhyloXmlParser ) parser ).setValidateAgainstSchema( xsd_url.toString() ); + } + else { + if ( ForesterConstants.RELEASE ) { + throw new RuntimeException( "failed to get URL for phyloXML XSD from jar file from [" + + ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE + "]" ); + } + } + } + } + else if ( ( first_line.startsWith( "nexus" ) ) || ( first_line.startsWith( "#nexus" ) ) + || ( first_line.startsWith( "# nexus" ) ) || ( first_line.startsWith( "begin" ) ) ) { + parser = new NexusPhylogeniesParser(); + } + else { + parser = new NHXParser(); + } + return parser; + } + + final public static PhylogenyParser createParserDependingOnFileType( final File file, + final boolean phyloxml_validate_against_xsd ) + throws FileNotFoundException, IOException { + PhylogenyParser parser = null; + parser = createParserDependingOnSuffix( file.getName(), phyloxml_validate_against_xsd ); + if ( parser == 
null ) { + parser = createParserDependingFileContents( file, phyloxml_validate_against_xsd ); + } + return parser; + } + + /** + * Return null if it can not guess the parser to use based on name suffix. + * + * @param filename + * @return + */ + final public static PhylogenyParser createParserDependingOnSuffix( final String filename, + final boolean phyloxml_validate_against_xsd ) { + PhylogenyParser parser = null; + final String filename_lc = filename.toLowerCase(); + if ( filename_lc.endsWith( ".tol" ) || filename_lc.endsWith( ".tolxml" ) || filename_lc.endsWith( ".tol.zip" ) ) { + parser = new TolParser(); + } + else if ( filename_lc.endsWith( ".xml" ) || filename_lc.endsWith( ".px" ) || filename_lc.endsWith( "phyloxml" ) + || filename_lc.endsWith( ".zip" ) ) { + parser = new PhyloXmlParser(); + if ( phyloxml_validate_against_xsd ) { + final ClassLoader cl = PhyloXmlParser.class.getClassLoader(); + final URL xsd_url = cl.getResource( ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE ); + if ( xsd_url != null ) { + ( ( PhyloXmlParser ) parser ).setValidateAgainstSchema( xsd_url.toString() ); + } + else { + if ( ForesterConstants.RELEASE ) { + throw new RuntimeException( "failed to get URL for phyloXML XSD from jar file from [" + + ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE + "]" ); + } + } + } + } + else if ( filename_lc.endsWith( ".nexus" ) || filename_lc.endsWith( ".nex" ) || filename_lc.endsWith( ".nx" ) ) { + parser = new NexusPhylogeniesParser(); + } + else if ( filename_lc.endsWith( ".nhx" ) || filename_lc.endsWith( ".nh" ) || filename_lc.endsWith( ".newick" ) ) { + parser = new NHXParser(); + } + return parser; + } + + final public static PhylogenyParser createParserDependingOnUrlContents( final URL url, + final boolean phyloxml_validate_against_xsd ) + throws FileNotFoundException, IOException { + final String lc_filename = url.getFile().toString().toLowerCase(); + PhylogenyParser parser = createParserDependingOnSuffix( lc_filename, phyloxml_validate_against_xsd ); + if ( ( parser != null ) && lc_filename.endsWith( ".zip" ) ) { + if ( parser instanceof PhyloXmlParser ) { + ( ( PhyloXmlParser ) parser ).setZippedInputstream( true ); + } + else if ( parser instanceof TolParser ) { + ( ( TolParser ) parser ).setZippedInputstream( true ); + } + } + if ( parser == null ) { + final String first_line = getFirstLine( url ).trim().toLowerCase(); + if ( first_line.startsWith( "<" ) ) { + parser = new PhyloXmlParser(); + if ( phyloxml_validate_against_xsd ) { + final ClassLoader cl = PhyloXmlParser.class.getClassLoader(); + final URL xsd_url = cl.getResource( ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE ); + if ( xsd_url != null ) { + ( ( PhyloXmlParser ) parser ).setValidateAgainstSchema( xsd_url.toString() ); + } + else { + throw new RuntimeException( "failed to get URL for phyloXML XSD from jar file from [" + + ForesterConstants.LOCAL_PHYLOXML_XSD_RESOURCE + "]" ); + } + } + } + else if ( ( first_line.startsWith( "nexus" ) ) || ( first_line.startsWith( "#nexus" ) ) + || ( first_line.startsWith( "# nexus" ) ) || ( first_line.startsWith( "begin" ) ) ) { + parser = new NexusPhylogeniesParser(); + } + else { + parser = new NHXParser(); + } + } + return parser; + } + + final public static void ensurePresenceOfDate( final PhylogenyNode node ) { + if ( !node.getNodeData().isHasDate() ) { + node.getNodeData().setDate( new org.forester.phylogeny.data.Date() ); + } + } + + final public static void ensurePresenceOfDistribution( final PhylogenyNode node ) { + if ( 
!node.getNodeData().isHasDistribution() ) { + node.getNodeData().setDistribution( new Distribution( "" ) ); + } + } + + public static void ensurePresenceOfSequence( final PhylogenyNode node ) { + if ( !node.getNodeData().isHasSequence() ) { + node.getNodeData().setSequence( new Sequence() ); + } + } + + public static void ensurePresenceOfTaxonomy( final PhylogenyNode node ) { + if ( !node.getNodeData().isHasTaxonomy() ) { + node.getNodeData().setTaxonomy( new Taxonomy() ); + } + } + + /** + * Extracts a code if and only if: + * one and only one _, + * shorter than 25, + * no |, + * no ., + * if / present it has to be after the _, + * if PFAM_STYLE_ONLY: / must be present, + * tax code can only contain uppercase letters and numbers, + * and must contain at least one uppercase letter. + * Return null if no code extractable. + * + * @param name + * @param limit_to_five + * @return + */ + public static String extractTaxonomyCodeFromNodeName( final String name, + final boolean limit_to_five, + final ForesterUtil.TAXONOMY_EXTRACTION taxonomy_extraction ) { + if ( ( name.indexOf( "_" ) > 0 ) + && ( name.length() < 25 ) + && ( name.lastIndexOf( "_" ) == name.indexOf( "_" ) ) + && ( name.indexOf( "|" ) < 0 ) + && ( name.indexOf( "." ) < 0 ) + && ( ( taxonomy_extraction != ForesterUtil.TAXONOMY_EXTRACTION.PFAM_STYLE_ONLY ) || ( name + .indexOf( "/" ) >= 0 ) ) + && ( ( ( name.indexOf( "/" ) ) < 0 ) || ( name.indexOf( "/" ) > name.indexOf( "_" ) ) ) ) { + final String[] s = name.split( "[_/]" ); + if ( s.length > 1 ) { + String str = s[ 1 ]; + if ( limit_to_five ) { + if ( str.length() > 5 ) { + str = str.substring( 0, 5 ); + } + else if ( ( str.length() < 5 ) && ( str.startsWith( "RAT" ) || str.startsWith( "PIG" ) ) ) { + str = str.substring( 0, 3 ); + } + } + final Matcher letters_and_numbers = NHXParser.UC_LETTERS_NUMBERS_PATTERN.matcher( str ); + if ( !letters_and_numbers.matches() ) { + return null; + } + final Matcher numbers_only = NHXParser.NUMBERS_ONLY_PATTERN.matcher( str ); + if ( numbers_only.matches() ) { + return null; + } + return str; + } + } + return null; + } + + public static void fatalError( final String prg_name, final String message ) { + System.err.println(); + System.err.println( "[" + prg_name + "] > " + message ); + System.err.println(); + System.exit( -1 ); + } + + public static String[] file2array( final File file ) throws IOException { + final List list = file2list( file ); + final String[] ary = new String[ list.size() ]; + int i = 0; + for( final String s : list ) { + ary[ i++ ] = s; + } + return ary; + } + + final public static List file2list( final File file ) throws IOException { + final List list = new ArrayList(); + final BufferedReader in = new BufferedReader( new FileReader( file ) ); + String str; + while ( ( str = in.readLine() ) != null ) { + str = str.trim(); + if ( ( str.length() > 0 ) && !str.startsWith( "#" ) ) { + for( final String s : splitString( str ) ) { + list.add( s ); + } + } + } + in.close(); + return list; + } + + final public static SortedSet file2set( final File file ) throws IOException { + final SortedSet set = new TreeSet(); + final BufferedReader in = new BufferedReader( new FileReader( file ) ); + String str; + while ( ( str = in.readLine() ) != null ) { + str = str.trim(); + if ( ( str.length() > 0 ) && !str.startsWith( "#" ) ) { + for( final String s : splitString( str ) ) { + set.add( s ); + } + } + } + in.close(); + return set; + } + + final public static String getCurrentDateTime() { + final DateFormat format = new SimpleDateFormat( 
"yyyy/MM/dd HH:mm:ss" ); + return format.format( new Date() ); + } + + final public static String getFileSeparator() { + return ForesterUtil.FILE_SEPARATOR; + } + + final public static String getFirstLine( final Object source ) throws FileNotFoundException, IOException { + BufferedReader reader = null; + if ( source instanceof File ) { + final File f = ( File ) source; + if ( !f.exists() ) { + throw new IOException( "[" + f.getAbsolutePath() + "] does not exist" ); + } + else if ( !f.isFile() ) { + throw new IOException( "[" + f.getAbsolutePath() + "] is not a file" ); + } + else if ( !f.canRead() ) { + throw new IOException( "[" + f.getAbsolutePath() + "] is not a readable" ); + } + reader = new BufferedReader( new FileReader( f ) ); + } + else if ( source instanceof InputStream ) { + reader = new BufferedReader( new InputStreamReader( ( InputStream ) source ) ); + } + else if ( source instanceof String ) { + reader = new BufferedReader( new StringReader( ( String ) source ) ); + } + else if ( source instanceof StringBuffer ) { + reader = new BufferedReader( new StringReader( source.toString() ) ); + } + else if ( source instanceof URL ) { + reader = new BufferedReader( new InputStreamReader( ( ( URL ) source ).openStream() ) ); + } + else { + throw new IllegalArgumentException( "dont know how to read [" + source.getClass() + "]" ); + } + String line; + while ( ( line = reader.readLine() ) != null ) { + line = line.trim(); + if ( !ForesterUtil.isEmpty( line ) ) { + if ( reader != null ) { + reader.close(); + } + return line; + } + } + if ( reader != null ) { + reader.close(); + } + return line; + } + + final public static String getLineSeparator() { + return ForesterUtil.LINE_SEPARATOR; + } + + /** + * Returns all custom data tag names of this Phylogeny as Hashtable. Tag + * names are keys, values are Boolean set to false. 
+ */ + final public static Hashtable getPropertyRefs( final Phylogeny phylogeny ) { + final Hashtable ht = new Hashtable(); + if ( phylogeny.isEmpty() ) { + return ht; + } + for( final PhylogenyNodeIterator iter = phylogeny.iteratorPreorder(); iter.hasNext(); ) { + final PhylogenyNode current_node = iter.next(); + if ( current_node.getNodeData().isHasProperties() ) { + final String[] tags = current_node.getNodeData().getProperties().getPropertyRefs(); + for( int i = 0; i < tags.length; ++i ) { + ht.put( tags[ i ], new Boolean( false ) ); + } + } + } + return ht; + } + + final public static void increaseCountingMap( final Map counting_map, final String item_name ) { + if ( !counting_map.containsKey( item_name ) ) { + counting_map.put( item_name, 1 ); + } + else { + counting_map.put( item_name, counting_map.get( item_name ) + 1 ); + } + } + + final static public boolean isAllNonEmptyInternalLabelsArePositiveNumbers( final Phylogeny phy ) { + final PhylogenyNodeIterator it = phy.iteratorPostorder(); + while ( it.hasNext() ) { + final PhylogenyNode n = it.next(); + if ( !n.isRoot() && !n.isExternal() ) { + if ( !ForesterUtil.isEmpty( n.getName() ) ) { + double d = -1.0; + try { + d = Double.parseDouble( n.getName() ); + } + catch ( final Exception e ) { + d = -1.0; + } + if ( d < 0.0 ) { + return false; + } + } + } + } + return true; + } + + final public static boolean isEmpty( final String s ) { + return ( ( s == null ) || ( s.length() < 1 ) ); + } + + final public static boolean isEqual( final double a, final double b ) { + return ( ( Math.abs( a - b ) ) < ZERO_DIFF ); + } + + final public static boolean isEven( final int n ) { + return n % 2 == 0; + } + + final static public boolean isHasAtLeastNodeWithEvent( final Phylogeny phy ) { + final PhylogenyNodeIterator it = phy.iteratorPostorder(); + while ( it.hasNext() ) { + if ( it.next().getNodeData().isHasEvent() ) { + return true; + } + } + return false; + } + + /** + * Returns true if at least one branch has a length larger than zero. + * + * + * @param phy + */ + final static public boolean isHasAtLeastOneBranchLengthLargerThanZero( final Phylogeny phy ) { + final PhylogenyNodeIterator it = phy.iteratorPostorder(); + while ( it.hasNext() ) { + if ( it.next().getDistanceToParent() > 0.0 ) { + return true; + } + } + return false; + } + + final static public boolean isHasAtLeastOneBranchWithSupportValues( final Phylogeny phy ) { + final PhylogenyNodeIterator it = phy.iteratorPostorder(); + while ( it.hasNext() ) { + if ( it.next().getBranchData().isHasConfidences() ) { + return true; + } + } + return false; + } + + /** + * This determines whether String[] a and String[] b have at least one + * String in common (intersect). Returns false if at least one String[] is + * null or empty. 
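+ * For example (illustrative values): isIntersecting( new String[] { "a", "b" },
+ * new String[] { "b", "c" } ) returns true, while passing null or an empty array
+ * for either argument returns false.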
+ * + * @param a + * a String[] b a String[] + * @return true if both a and b or not empty or null and contain at least + * one element in common false otherwise + */ + final public static boolean isIntersecting( final String[] a, final String[] b ) { + if ( ( a == null ) || ( b == null ) ) { + return false; + } + if ( ( a.length < 1 ) || ( b.length < 1 ) ) { + return false; + } + for( int i = 0; i < a.length; ++i ) { + final String ai = a[ i ]; + for( int j = 0; j < b.length; ++j ) { + if ( ( ai != null ) && ( b[ j ] != null ) && ai.equals( b[ j ] ) ) { + return true; + } + } + } + return false; + } + + final public static double isLargerOrEqualToZero( final double d ) { + if ( d > 0.0 ) { + return d; + } + else { + return 0.0; + } + } + + final public static boolean isNull( final BigDecimal s ) { + return ( ( s == null ) || ( s.compareTo( NULL_BD ) == 0 ) ); + } + + final public static String isReadableFile( final File f ) { + if ( !f.exists() ) { + return "file [" + f + "] does not exist"; + } + if ( f.isDirectory() ) { + return "[" + f + "] is a directory"; + } + if ( !f.isFile() ) { + return "[" + f + "] is not a file"; + } + if ( !f.canRead() ) { + return "file [" + f + "] is not readable"; + } + if ( f.length() < 1 ) { + return "file [" + f + "] is empty"; + } + return null; + } + + final public static String isReadableFile( final String s ) { + return isReadableFile( new File( s ) ); + } + + final public static String isWritableFile( final File f ) { + if ( f.isDirectory() ) { + return "[" + f + "] is a directory"; + } + if ( f.exists() ) { + return "[" + f + "] already exists"; + } + return null; + } + + /** + * Helper for method "stringToColor". + *

    + * (Last modified: 12/20/03) + */ + final public static int limitRangeForColor( int i ) { + if ( i > 255 ) { + i = 255; + } + else if ( i < 0 ) { + i = 0; + } + return i; + } + + final public static SortedMap listToSortedCountsMap( final List list ) { + final SortedMap map = new TreeMap(); + for( final Object key : list ) { + if ( !map.containsKey( key ) ) { + map.put( key, 1 ); + } + else { + map.put( key, map.get( key ) + 1 ); + } + } + return map; + } + + final public static StringBuffer mapToStringBuffer( final Map map, final String key_value_separator ) { + final StringBuffer sb = new StringBuffer(); + for( final Iterator iter = map.keySet().iterator(); iter.hasNext(); ) { + final Object key = iter.next(); + sb.append( key.toString() ); + sb.append( key_value_separator ); + sb.append( map.get( key ).toString() ); + sb.append( ForesterUtil.getLineSeparator() ); + } + return sb; + } + + final public static String normalizeString( final String s, + final int length, + final boolean left_pad, + final char pad_char ) { + if ( s.length() > length ) { + return s.substring( 0, length ); + } + else { + final StringBuffer pad = new StringBuffer( length - s.length() ); + for( int i = 0; i < ( length - s.length() ); ++i ) { + pad.append( pad_char ); + } + if ( left_pad ) { + return pad + s; + } + else { + return s + pad; + } + } + } + + final public static BufferedReader obtainReader( final Object source ) throws IOException, FileNotFoundException { + BufferedReader reader = null; + if ( source instanceof File ) { + final File f = ( File ) source; + if ( !f.exists() ) { + throw new IOException( "\"" + f.getAbsolutePath() + "\" does not exist" ); + } + else if ( !f.isFile() ) { + throw new IOException( "\"" + f.getAbsolutePath() + "\" is not a file" ); + } + else if ( !f.canRead() ) { + throw new IOException( "\"" + f.getAbsolutePath() + "\" is not a readable" ); + } + reader = new BufferedReader( new FileReader( f ) ); + } + else if ( source instanceof InputStream ) { + reader = new BufferedReader( new InputStreamReader( ( InputStream ) source ) ); + } + else if ( source instanceof String ) { + reader = new BufferedReader( new StringReader( ( String ) source ) ); + } + else if ( source instanceof StringBuffer ) { + reader = new BufferedReader( new StringReader( source.toString() ) ); + } + else { + throw new IllegalArgumentException( "attempt to parse object of type [" + source.getClass() + + "] (can only parse objects of type File, InputStream, String, or StringBuffer)" ); + } + return reader; + } + + final public static StringBuffer pad( final double number, final int size, final char pad, final boolean left_pad ) { + return pad( new StringBuffer( number + "" ), size, pad, left_pad ); + } + + final public static StringBuffer pad( final String string, final int size, final char pad, final boolean left_pad ) { + return pad( new StringBuffer( string ), size, pad, left_pad ); + } + + final public static StringBuffer pad( final StringBuffer string, + final int size, + final char pad, + final boolean left_pad ) { + final StringBuffer padding = new StringBuffer(); + final int s = size - string.length(); + if ( s < 1 ) { + return new StringBuffer( string.substring( 0, size ) ); + } + for( int i = 0; i < s; ++i ) { + padding.append( pad ); + } + if ( left_pad ) { + return padding.append( string ); + } + else { + return string.append( padding ); + } + } + + final public static double parseDouble( final String str ) throws ParseException { + if ( ForesterUtil.isEmpty( str ) ) { + return 0.0; + } + 
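+ // Empty or null input was mapped to 0.0 above; anything else is handed to
+ // Double.parseDouble, e.g. ( illustrative ) parseDouble( "1.5e2" ) returns 150.0.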
return Double.parseDouble( str ); + } + + final public static int parseInt( final String str ) throws ParseException { + if ( ForesterUtil.isEmpty( str ) ) { + return 0; + } + return Integer.parseInt( str ); + } + + final public static void postOrderRelabelInternalNodes( final Phylogeny phylogeny, final int starting_number ) { + int i = starting_number; + for( final PhylogenyNodeIterator it = phylogeny.iteratorPostorder(); it.hasNext(); ) { + final PhylogenyNode node = it.next(); + if ( !node.isExternal() ) { + node.setName( String.valueOf( i++ ) ); + } + } + } + + final public static void printArray( final Object[] a ) { + for( int i = 0; i < a.length; ++i ) { + System.out.println( "[" + i + "]=" + a[ i ] ); + } + } + + final public static void printCountingMap( final Map counting_map ) { + for( final String key : counting_map.keySet() ) { + System.out.println( key + ": " + counting_map.get( key ) ); + } + } + + final public static void printErrorMessage( final String prg_name, final String message ) { + System.out.println( "[" + prg_name + "] > error: " + message ); + } + + final public static void printProgramInformation( final String prg_name, final String prg_version, final String date ) { + final int l = prg_name.length() + prg_version.length() + date.length() + 4; + System.out.println(); + System.out.println( prg_name + " " + prg_version + " (" + date + ")" ); + for( int i = 0; i < l; ++i ) { + System.out.print( "_" ); + } + System.out.println(); + } + + final public static void printProgramInformation( final String prg_name, + final String prg_version, + final String date, + final String email, + final String www ) { + final int l = prg_name.length() + prg_version.length() + date.length() + 4; + System.out.println(); + System.out.println( prg_name + " " + prg_version + " (" + date + ")" ); + for( int i = 0; i < l; ++i ) { + System.out.print( "_" ); + } + System.out.println(); + System.out.println(); + System.out.println( "WWW : " + www ); + System.out.println( "Contact: " + email ); + if ( !ForesterUtil.isEmpty( ForesterUtil.JAVA_VERSION ) && !ForesterUtil.isEmpty( ForesterUtil.JAVA_VENDOR ) ) { + System.out.println(); + System.out.println( "[running on Java " + ForesterUtil.JAVA_VERSION + " " + ForesterUtil.JAVA_VENDOR + "]" ); + } + System.out.println(); + } + + final public static void printWarningMessage( final String prg_name, final String message ) { + System.out.println( "[" + prg_name + "] > warning: " + message ); + } + + final public static void programMessage( final String prg_name, final String message ) { + System.out.println( "[" + prg_name + "] > " + message ); + } + + final public static String removeSuffix( final String file_name ) { + final int i = file_name.lastIndexOf( '.' ); + if ( i > 1 ) { + return file_name.substring( 0, i ); + } + return file_name; + } + + /** + * Removes all white space from String s. 
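+     * For example (illustrative only), removeWhiteSpace( " a b\tc\n" ) yields "abc":
+     * spaces, tabs, newlines, and carriage returns are dropped, all other characters kept.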
+ * + * @return String s with white space removed + */ + final public static String removeWhiteSpace( String s ) { + int i; + for( i = 0; i <= s.length() - 1; i++ ) { + if ( ( s.charAt( i ) == ' ' ) || ( s.charAt( i ) == '\t' ) || ( s.charAt( i ) == '\n' ) + || ( s.charAt( i ) == '\r' ) ) { + s = s.substring( 0, i ) + s.substring( i + 1 ); + i--; + } + } + return s; + } + + final public static boolean isContainsParanthesesableNhCharacter( final String nh ) { + return PARANTHESESABLE_NH_CHARS_PATTERN.matcher( nh ).find(); + } + + final public static String replaceIllegalNhCharacters( final String nh ) { + if ( nh == null ) { + return ""; + } + return nh.trim().replaceAll( "[\\[\\]:]+", "_" ); + } + + final public static String replaceIllegalNhxCharacters( final String nhx ) { + if ( nhx == null ) { + return ""; + } + return nhx.trim().replaceAll( "[\\[\\](),:;\\s]+", "_" ); + } + + final public static double round( final double value, final int decimal_place ) { + BigDecimal bd = new BigDecimal( value ); + bd = bd.setScale( decimal_place, BigDecimal.ROUND_HALF_UP ); + return bd.doubleValue(); + } + + /** + * Rounds d to an int. + */ + final public static int roundToInt( final double d ) { + return ( int ) ( d + 0.5 ); + } + + final public static int roundToInt( final float f ) { + return ( int ) ( f + 0.5f ); + } + + final public static String sanitizeString( final String s ) { + if ( s == null ) { + return ""; + } + else { + return s.trim(); + } + } + + final private static String[] splitString( final String str ) { + final String regex = "[\\s;,]+"; + return str.split( regex ); + } + + final public static String stringArrayToString( final String[] a ) { + final StringBuffer sb = new StringBuffer(); + if ( ( a != null ) && ( a.length > 0 ) ) { + for( int i = 0; i < a.length - 1; ++i ) { + sb.append( a[ i ] + ", " ); + } + sb.append( a[ a.length - 1 ] ); + } + return sb.toString(); + } + + final static public void transferInternalNamesToBootstrapSupport( final Phylogeny phy ) { + final PhylogenyNodeIterator it = phy.iteratorPostorder(); + while ( it.hasNext() ) { + final PhylogenyNode n = it.next(); + if ( !n.isExternal() && !ForesterUtil.isEmpty( n.getName() ) ) { + double value = -1; + try { + value = Double.parseDouble( n.getName() ); + } + catch ( final NumberFormatException e ) { + throw new IllegalArgumentException( "failed to parse number from [" + n.getName() + "]: " + + e.getLocalizedMessage() ); + } + if ( value >= 0.0 ) { + n.getBranchData().addConfidence( new Confidence( value, "bootstrap" ) ); + n.setName( "" ); + } + } + } + } + + final static public void transferInternalNodeNamesToConfidence( final Phylogeny phy ) { + final PhylogenyNodeIterator it = phy.iteratorPostorder(); + while ( it.hasNext() ) { + final PhylogenyNode n = it.next(); + if ( !n.isRoot() && !n.isExternal() && !n.getBranchData().isHasConfidences() ) { + if ( !ForesterUtil.isEmpty( n.getName() ) ) { + double d = -1.0; + try { + d = Double.parseDouble( n.getName() ); + } + catch ( final Exception e ) { + d = -1.0; + } + if ( d >= 0.0 ) { + n.getBranchData().addConfidence( new Confidence( d, "" ) ); + n.setName( "" ); + } + } + } + } + } + + final static public void transferNodeNameToField( final Phylogeny phy, final PhylogenyNodeField field ) { + final PhylogenyNodeIterator it = phy.iteratorPostorder(); + while ( it.hasNext() ) { + final PhylogenyNode n = it.next(); + final String name = n.getName().trim(); + if ( !ForesterUtil.isEmpty( name ) ) { + switch ( field ) { + case TAXONOMY_CODE: + //temp hack + // 
if ( name.length() > 5 ) { + // n.setName( "" ); + // if ( !n.getNodeData().isHasTaxonomy() ) { + // n.getNodeData().setTaxonomy( new Taxonomy() ); + // } + // n.getNodeData().getTaxonomy().setScientificName( name ); + // break; + // } + // + n.setName( "" ); + PhylogenyMethods.setTaxonomyCode( n, name ); + break; + case TAXONOMY_SCIENTIFIC_NAME: + n.setName( "" ); + if ( !n.getNodeData().isHasTaxonomy() ) { + n.getNodeData().setTaxonomy( new Taxonomy() ); + } + n.getNodeData().getTaxonomy().setScientificName( name ); + break; + case TAXONOMY_COMMON_NAME: + n.setName( "" ); + if ( !n.getNodeData().isHasTaxonomy() ) { + n.getNodeData().setTaxonomy( new Taxonomy() ); + } + n.getNodeData().getTaxonomy().setCommonName( name ); + break; + case SEQUENCE_SYMBOL: + n.setName( "" ); + if ( !n.getNodeData().isHasSequence() ) { + n.getNodeData().setSequence( new Sequence() ); + } + n.getNodeData().getSequence().setSymbol( name ); + break; + case SEQUENCE_NAME: + n.setName( "" ); + if ( !n.getNodeData().isHasSequence() ) { + n.getNodeData().setSequence( new Sequence() ); + } + n.getNodeData().getSequence().setName( name ); + break; + } + } + } + } + + final public static void unexpectedFatalError( final String prg_name, final Exception e ) { + System.err.println(); + System.err.println( "[" + prg_name + + "] > unexpected error (Should not have occured! Please contact program author(s).)" ); + e.printStackTrace( System.err ); + System.err.println(); + System.exit( -1 ); + } + + final public static void unexpectedFatalError( final String prg_name, final String message ) { + System.err.println(); + System.err.println( "[" + prg_name + + "] > unexpected error. Should not have occured! Please contact program author(s)." ); + System.err.println( message ); + System.err.println(); + System.exit( -1 ); + } + + final public static void unexpectedFatalError( final String prg_name, final String message, final Exception e ) { + System.err.println(); + System.err.println( "[" + prg_name + + "] > unexpected error. Should not have occured! Please contact program author(s)." ); + System.err.println( message ); + e.printStackTrace( System.err ); + System.err.println(); + System.exit( -1 ); + } + + public final static String wordWrap( final String str, final int width ) { + final StringBuilder sb = new StringBuilder( str ); + int start = 0; + int ls = -1; + int i = 0; + while ( i < sb.length() ) { + if ( sb.charAt( i ) == ' ' ) { + ls = i; + } + if ( sb.charAt( i ) == '\n' ) { + ls = -1; + start = i + 1; + } + if ( i > start + width - 1 ) { + if ( ls != -1 ) { + sb.setCharAt( ls, '\n' ); + start = ls + 1; + ls = -1; + } + else { + sb.insert( i, '\n' ); + start = i + 1; + } + } + i++; + } + return sb.toString(); + } + + public static enum PhylogenyNodeField { + CLADE_NAME, TAXONOMY_CODE, TAXONOMY_SCIENTIFIC_NAME, TAXONOMY_COMMON_NAME, SEQUENCE_SYMBOL, SEQUENCE_NAME; + } + + public static enum TAXONOMY_EXTRACTION { + NO, YES, PFAM_STYLE_ONLY; + } +} diff --git a/forester/java/src/org/forester/util/GeneralTable.java b/forester/java/src/org/forester/util/GeneralTable.java new file mode 100644 index 0000000..7b0a655 --- /dev/null +++ b/forester/java/src/org/forester/util/GeneralTable.java @@ -0,0 +1,139 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. 
Zmasek
+// Copyright (C) 2008-2009 Burnham Institute for Medical Research
+// All rights reserved
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+//
+// Contact: phylosoft @ gmail . com
+// WWW: www.phylosoft.org
+
+package org.forester.util;
+
+import java.text.NumberFormat;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+public class GeneralTable<IDENTIFIER_TYPE, VALUE_TYPE> {
+
+    private Map<IDENTIFIER_TYPE, Map<IDENTIFIER_TYPE, VALUE_TYPE>> _rows;
+    private SortedSet<IDENTIFIER_TYPE>                             _row_identifiers;
+    private SortedSet<IDENTIFIER_TYPE>                             _column_identifiers;
+
+    public GeneralTable() {
+        init();
+    }
+
+    public SortedSet<IDENTIFIER_TYPE> getColumnIdentifiers() {
+        return _column_identifiers;
+    }
+
+    private Map<IDENTIFIER_TYPE, VALUE_TYPE> getRow( final IDENTIFIER_TYPE row ) {
+        return getRows().get( row );
+    }
+
+    public SortedSet<IDENTIFIER_TYPE> getRowIdentifiers() {
+        return _row_identifiers;
+    }
+
+    private Map<IDENTIFIER_TYPE, Map<IDENTIFIER_TYPE, VALUE_TYPE>> getRows() {
+        return _rows;
+    }
+
+    public VALUE_TYPE getValue( final IDENTIFIER_TYPE col, final IDENTIFIER_TYPE row ) throws IllegalArgumentException {
+        final Map<IDENTIFIER_TYPE, VALUE_TYPE> row_map = getRow( row );
+        if ( ( row_map == null ) || ( row_map.size() < 1 ) ) {
+            return null;
+        }
+        return row_map.get( col );
+    }
+
+    public String getValueAsString( final IDENTIFIER_TYPE col, final IDENTIFIER_TYPE row )
+            throws IllegalArgumentException {
+        final VALUE_TYPE value = getValue( col, row );
+        return ( value == null ? "" : getValue( col, row ).toString() );
+    }
+
+    private void init() {
+        _rows = new HashMap<IDENTIFIER_TYPE, Map<IDENTIFIER_TYPE, VALUE_TYPE>>();
+        _row_identifiers = new TreeSet<IDENTIFIER_TYPE>();
+        _column_identifiers = new TreeSet<IDENTIFIER_TYPE>();
+    }
+
+    public void setValue( final IDENTIFIER_TYPE col, final IDENTIFIER_TYPE row, final VALUE_TYPE value ) {
+        getColumnIdentifiers().add( col );
+        getRowIdentifiers().add( row );
+        Map<IDENTIFIER_TYPE, VALUE_TYPE> row_map = null;
+        if ( getRows().containsKey( row ) ) {
+            row_map = getRows().get( row );
+        }
+        else {
+            row_map = new HashMap<IDENTIFIER_TYPE, VALUE_TYPE>();
+            getRows().put( row, row_map );
+        }
+        row_map.put( col, value );
+    }
+
+    @Override
+    public String toString() {
+        final StringBuilder sb = new StringBuilder();
+        sb.append( "\t" );
+        for( final IDENTIFIER_TYPE col : getColumnIdentifiers() ) {
+            sb.append( col.toString() );
+            sb.append( "\t" );
+        }
+        sb.append( ForesterUtil.LINE_SEPARATOR );
+        for( final IDENTIFIER_TYPE row : getRowIdentifiers() ) {
+            sb.append( row.toString() );
+            sb.append( "\t" );
+            for( final IDENTIFIER_TYPE col : getColumnIdentifiers() ) {
+                sb.append( getValueAsString( col, row ) );
+                sb.append( "\t" );
+            }
+            sb.append( ForesterUtil.LINE_SEPARATOR );
+        }
+        return sb.toString();
+    }
+
+    public String toString( final NumberFormat number_format ) {
+        final StringBuilder sb = new StringBuilder();
+        sb.append( "\t" );
+        for( final IDENTIFIER_TYPE col : getColumnIdentifiers() ) {
+            sb.append( col.toString() );
+            sb.append( "\t" );
+        }
+        sb.append( ForesterUtil.LINE_SEPARATOR );
+        for( final IDENTIFIER_TYPE row : getRowIdentifiers() ) {
+            sb.append( row.toString() );
+            sb.append( "\t" );
+            for( final IDENTIFIER_TYPE col : getColumnIdentifiers() ) {
+                try {
+                    sb.append( number_format.format( getValue( col, row ) ) );
+                }
+                catch ( final IllegalArgumentException e ) {
+                    sb.append( getValueAsString( col, row ) );
+                }
+                sb.append( "\t" );
+            }
+            sb.append( ForesterUtil.LINE_SEPARATOR );
+        }
+        return sb.toString();
+    }
+}
\ No newline at end of file
diff --git a/forester/java/src/org/forester/util/IllegalFormatUseException.java b/forester/java/src/org/forester/util/IllegalFormatUseException.java
new file mode 100644
index 0000000..7b3704b
--- /dev/null
+++ b/forester/java/src/org/forester/util/IllegalFormatUseException.java
@@ -0,0 +1,42 @@
+// $Id:
+// FORESTER -- software libraries and applications
+// for evolutionary biology research and applications.
+//
+// Copyright (C) 2008-2009 Christian M. Zmasek
+// Copyright (C) 2008-2009 Burnham Institute for Medical Research
+// All rights reserved
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+//
+// Contact: phylosoft @ gmail .
com +// WWW: www.phylosoft.org/forester + +package org.forester.util; + +public class IllegalFormatUseException extends IllegalArgumentException { + + /** + * + */ + private static final long serialVersionUID = -1126329548396073983L; + + public IllegalFormatUseException() { + super(); + } + + public IllegalFormatUseException( final String message ) { + super( message ); + } +} diff --git a/forester/java/src/org/forester/util/SystemCommandExecutor.java b/forester/java/src/org/forester/util/SystemCommandExecutor.java new file mode 100644 index 0000000..5b32dc4 --- /dev/null +++ b/forester/java/src/org/forester/util/SystemCommandExecutor.java @@ -0,0 +1,154 @@ +// $Id: +/** + * This class can be used to execute a system command from a Java application. + * See the documentation for the public methods of this class for more + * information. + * + * Documentation for this class is available at this URL: + * + * http://devdaily.com/java/java-processbuilder-process-system-exec + * + * + * Copyright 2010 alvin j. alexander, devdaily.com. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License along with + * this program. If not, see . + * + * Please ee the following page for the LGPL license: + * http://www.gnu.org/licenses/lgpl.txt + * + */ + +package org.forester.util; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.List; + +public class SystemCommandExecutor { + + private final List commandInformation; + private final String adminPassword; + private ThreadedStreamHandler inputStreamHandler; + private ThreadedStreamHandler errorStreamHandler; + + /** + * Pass in the system command you want to run as a List of Strings, as shown here: + * + * List commands = new ArrayList(); + * commands.add("/sbin/ping"); + * commands.add("-c"); + * commands.add("5"); + * commands.add("www.google.com"); + * SystemCommandExecutor commandExecutor = new SystemCommandExecutor(commands); + * commandExecutor.executeCommand(); + * + * Note: I've removed the other constructor that was here to support executing + * the sudo command. I'll add that back in when I get the sudo command + * working to the point where it won't hang when the given password is + * wrong. + * + * @param commandInformation The command you want to run. + */ + public SystemCommandExecutor( final List commandInformation ) { + if ( ( commandInformation == null ) || commandInformation.isEmpty() ) { + throw new IllegalArgumentException( "The commandInformation is required." 
); + } + checkCmdFile( new File( commandInformation.get( 0 ) ) ); + this.commandInformation = commandInformation; + adminPassword = null; + } + + public static boolean isExecuteableFile( final File path_to_cmd_f ) { + if ( !path_to_cmd_f.exists() ) { + return false; + } + else if ( path_to_cmd_f.isDirectory() ) { + return false; + } + else if ( !path_to_cmd_f.canExecute() ) { + return false; + } + return true; + } + + private void checkCmdFile( final File path_to_cmd_f ) { + if ( !path_to_cmd_f.exists() ) { + throw new IllegalArgumentException( "[" + path_to_cmd_f.getAbsolutePath() + "] does not exist" ); + } + else if ( path_to_cmd_f.isDirectory() ) { + throw new IllegalArgumentException( "[" + path_to_cmd_f.getAbsolutePath() + "] is a directory" ); + } + else if ( !path_to_cmd_f.canExecute() ) { + throw new IllegalArgumentException( "[" + path_to_cmd_f.getAbsolutePath() + "] is not executeable" ); + } + } + + public int executeCommand() throws IOException, InterruptedException { + int exitValue = -99; + try { + final ProcessBuilder pb = new ProcessBuilder( commandInformation ); + final Process process = pb.start(); + // you need this if you're going to write something to the command's input stream + // (such as when invoking the 'sudo' command, and it prompts you for a password). + final OutputStream stdOutput = process.getOutputStream(); + // i'm currently doing these on a separate line here in case i need to set them to null + // to get the threads to stop. + // see http://java.sun.com/j2se/1.5.0/docs/guide/misc/threadPrimitiveDeprecation.html + final InputStream inputStream = process.getInputStream(); + final InputStream errorStream = process.getErrorStream(); + // these need to run as java threads to get the standard output and error from the command. + // the inputstream handler gets a reference to our stdOutput in case we need to write + // something to it, such as with the sudo command + inputStreamHandler = new ThreadedStreamHandler( inputStream, stdOutput, adminPassword ); + errorStreamHandler = new ThreadedStreamHandler( errorStream ); + // TODO the inputStreamHandler has a nasty side-effect of hanging if the given password is wrong; fix it + inputStreamHandler.start(); + errorStreamHandler.start(); + // TODO a better way to do this? + exitValue = process.waitFor(); + // TODO a better way to do this? + inputStreamHandler.interrupt(); + errorStreamHandler.interrupt(); + inputStreamHandler.join(); + errorStreamHandler.join(); + } + catch ( final IOException e ) { + // TODO deal with this here, or just throw it? + throw e; + } + catch ( final InterruptedException e ) { + // generated by process.waitFor() call + // TODO deal with this here, or just throw it? + throw e; + } + finally { + return exitValue; + } + } + + /** + * Get the standard error (stderr) from the command you just exec'd. + */ + public StringBuilder getStandardErrorFromCommand() { + return errorStreamHandler.getOutputBuffer(); + } + + /** + * Get the standard output (stdout) from the command you just exec'd. 
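+     * <p>
+     * Typical call sequence (a sketch only; the ping command and its arguments are
+     * taken from the example in the class comment above and are not required values):
+     * <pre>
+     *   List<String> commands = new ArrayList<String>();
+     *   commands.add( "/sbin/ping" );
+     *   commands.add( "-c" );
+     *   commands.add( "5" );
+     *   commands.add( "www.google.com" );
+     *   SystemCommandExecutor commandExecutor = new SystemCommandExecutor( commands );
+     *   int exitValue = commandExecutor.executeCommand();
+     *   StringBuilder stdout = commandExecutor.getStandardOutputFromCommand();
+     *   StringBuilder stderr = commandExecutor.getStandardErrorFromCommand();
+     * </pre>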
+ */ + public StringBuilder getStandardOutputFromCommand() { + return inputStreamHandler.getOutputBuffer(); + } +} diff --git a/forester/java/src/org/forester/util/ThreadedStreamHandler.java b/forester/java/src/org/forester/util/ThreadedStreamHandler.java new file mode 100644 index 0000000..ed70e75 --- /dev/null +++ b/forester/java/src/org/forester/util/ThreadedStreamHandler.java @@ -0,0 +1,135 @@ +// $Id: +/** + * This class is intended to be used with the SystemCommandExecutor class to let + * users execute system commands from Java applications. + * + * This class is based on work that was shared in a JavaWorld article named + * "When System.exec() won't". That article is available at this url: + * + * http://www.javaworld.com/javaworld/jw-12-2000/jw-1229-traps.html + * + * Documentation for this class is available at this URL: + * + * http://devdaily.com/java/java-processbuilder-process-system-exec + * + * + * Copyright 2010 alvin j. alexander, devdaily.com. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License along with + * this program. If not, see . + * + * Please ee the following page for the LGPL license: + * http://www.gnu.org/licenses/lgpl.txt + * + */ + +package org.forester.util; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.io.PrintWriter; + +class ThreadedStreamHandler extends Thread { + + InputStream inputStream; + String adminPassword; + OutputStream outputStream; + PrintWriter printWriter; + StringBuilder outputBuffer = new StringBuilder( 65536 ); + private boolean sudoIsRequested = false; + + /** + * A simple constructor for when the sudo command is not necessary. + * This constructor will just run the command you provide, without + * running sudo before the command, and without expecting a password. + * + * @param inputStream + * @param streamType + */ + ThreadedStreamHandler( final InputStream inputStream ) { + this.inputStream = inputStream; + } + + /** + * Use this constructor when you want to invoke the 'sudo' command. + * The outputStream must not be null. If it is, you'll regret it. :) + * + * TODO this currently hangs if the admin password given for the sudo command is wrong. 
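+     * <p>
+     * For orientation, this is how SystemCommandExecutor.executeCommand() in this
+     * package wires the two handlers together (a condensed sketch of that method):
+     * <pre>
+     *   inputStreamHandler = new ThreadedStreamHandler( process.getInputStream(),
+     *                                                   process.getOutputStream(),
+     *                                                   adminPassword );
+     *   errorStreamHandler = new ThreadedStreamHandler( process.getErrorStream() );
+     *   inputStreamHandler.start();
+     *   errorStreamHandler.start();
+     *   exitValue = process.waitFor();
+     * </pre>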
+ * + * @param inputStream + * @param streamType + * @param outputStream + * @param adminPassword + */ + ThreadedStreamHandler( final InputStream inputStream, final OutputStream outputStream, final String adminPassword ) { + this.inputStream = inputStream; + this.outputStream = outputStream; + printWriter = new PrintWriter( outputStream ); + this.adminPassword = adminPassword; + sudoIsRequested = true; + } + + private void doSleep( final long millis ) { + try { + Thread.sleep( millis ); + } + catch ( final InterruptedException e ) { + // ignore + } + } + + public StringBuilder getOutputBuffer() { + return outputBuffer; + } + + @Override + public void run() { + // on mac os x 10.5.x, when i run a 'sudo' command, i need to write + // the admin password out immediately; that's why this code is + // here. + if ( sudoIsRequested ) { + //doSleep(500); + printWriter.println( adminPassword ); + printWriter.flush(); + } + BufferedReader bufferedReader = null; + final String newline = ForesterUtil.LINE_SEPARATOR; + try { + bufferedReader = new BufferedReader( new InputStreamReader( inputStream ) ); + String line = null; + while ( ( line = bufferedReader.readLine() ) != null ) { + // outputBuffer.append( line + "\n" ); // CMZ change + outputBuffer.append( line ); + outputBuffer.append( newline ); + } + } + catch ( final IOException ioe ) { + // TODO handle this better + ioe.printStackTrace(); + } + catch ( final Throwable t ) { + // TODO handle this better + t.printStackTrace(); + } + finally { + try { + bufferedReader.close(); + } + catch ( final IOException e ) { + // ignore this one + } + } + } +} diff --git a/forester/java/src/org/forester/util/WindowsUtils.java b/forester/java/src/org/forester/util/WindowsUtils.java new file mode 100644 index 0000000..b62906d --- /dev/null +++ b/forester/java/src/org/forester/util/WindowsUtils.java @@ -0,0 +1,87 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// From: http://www.rgagnon.com/javadetails/java-0652.html +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . 
com +// WWW: www.phylosoft.org/forester + +package org.forester.util; + +import java.io.IOException; +import java.io.InputStream; +import java.io.StringWriter; + +public class WindowsUtils { + + private static final String REGQUERY_UTIL = "reg query "; + private static final String REGSTR_TOKEN = "REG_SZ"; + private static final String DESKTOP_FOLDER_CMD = REGQUERY_UTIL + + "\"HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\" + + "Explorer\\Shell Folders\" /v DESKTOP"; + + private WindowsUtils() { + } + + public static String getCurrentUserDesktopPath() { + try { + final Process process = Runtime.getRuntime().exec( DESKTOP_FOLDER_CMD ); + final StreamReader reader = new StreamReader( process.getInputStream() ); + reader.start(); + process.waitFor(); + reader.join(); + final String result = reader.getResult(); + final int p = result.indexOf( REGSTR_TOKEN ); + if ( p == -1 ) { + return null; + } + return result.substring( p + REGSTR_TOKEN.length() ).trim(); + } + catch ( final Exception e ) { + return null; + } + } + + static class StreamReader extends Thread { + + private final InputStream is; + private final StringWriter sw; + + StreamReader( final InputStream is ) { + this.is = is; + sw = new StringWriter(); + } + + String getResult() { + return sw.toString(); + } + + @Override + public void run() { + try { + int c; + while ( ( c = is.read() ) != -1 ) { + sw.write( c ); + } + } + catch ( final IOException e ) { + // Do nothing + } + } + } +} -- 1.7.10.2
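
A minimal usage sketch for the GeneralTable class added above (not part of the patch itself; the class name GeneralTableDemo and the row/column labels are made up for illustration, and the forester classes are assumed to be on the classpath):

    import org.forester.util.GeneralTable;

    public class GeneralTableDemo {

        public static void main( final String[] args ) {
            // Rows and columns are keyed by the first type parameter, cell values by the second.
            final GeneralTable<String, Double> table = new GeneralTable<String, Double>();
            table.setValue( "col_a", "row_1", 0.5 );
            table.setValue( "col_b", "row_1", 2.0 );
            table.setValue( "col_a", "row_2", -1.25 );
            System.out.println( table.getValue( "col_a", "row_1" ) );         // 0.5
            System.out.println( table.getValueAsString( "col_b", "row_2" ) ); // "" (missing cell)
            System.out.println( table );                                      // tab-separated, one line per row
        }
    }

Missing cells come back as null from getValue() and as empty strings from getValueAsString(), which is what keeps the tab-separated toString() output rectangular.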
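
Similarly, a small sketch of how WindowsUtils.getCurrentUserDesktopPath() might be called; the class name DesktopPathDemo is hypothetical, and the method itself returns null whenever the registry query fails (for example on non-Windows systems):

    import org.forester.util.WindowsUtils;

    public class DesktopPathDemo {

        public static void main( final String[] args ) {
            // Runs "reg query" under the hood and parses the REG_SZ value of the Desktop folder key.
            final String desktop = WindowsUtils.getCurrentUserDesktopPath();
            if ( desktop == null ) {
                System.out.println( "desktop path could not be determined (non-Windows OS or registry error)" );
            }
            else {
                System.out.println( "desktop: " + desktop );
            }
        }
    }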