// $Id: // 19:38:35 cmzmasek Exp $ // FORESTER -- software libraries and applications // for evolutionary biology research and applications. // // Copyright (C) 2008-2009 Christian M. Zmasek // Copyright (C) 2008-2009 Burnham Institute for Medical Research // All rights reserved // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA // // Contact: phylosoft @ gmail . com // WWW: https://sites.google.com/site/cmzmasek/home/software/forester package org.forester.surfacing; import java.util.HashSet; import java.util.Set; import org.forester.protein.BinaryDomainCombination; public class DomainArchitectureBasedGenomeSimilarityCalculator { public static final double MAX_SIMILARITY_SCORE = 1.0; public static final double MIN_SIMILARITY_SCORE = 0.0; private Set _all_binary_domain_combinations; private Set _all_domains; private boolean _allow_domains_to_be_ignored; private Set _binary_domain_combinations_specific_to_0; private Set _binary_domain_combinations_specific_to_1; final private GenomeWideCombinableDomains _combinable_domains_genome_0; final private GenomeWideCombinableDomains _combinable_domains_genome_1; private Set _domain_ids_to_ignore; private Set _domains_specific_to_0; private Set _domains_specific_to_1; private Set _shared_binary_domain_combinations; private Set _shared_domains; public DomainArchitectureBasedGenomeSimilarityCalculator( final GenomeWideCombinableDomains combinable_domains_genome_0, final GenomeWideCombinableDomains combinable_domains_genome_1 ) { if ( ( combinable_domains_genome_0 == null ) || ( combinable_domains_genome_0.getSize() < 1 ) || ( combinable_domains_genome_1 == null ) || ( combinable_domains_genome_1.getSize() < 1 ) ) { throw new IllegalArgumentException( "attempt to compare null or empty combinable domains collection" ); } if ( combinable_domains_genome_0.getSpecies().equals( combinable_domains_genome_1.getSpecies() ) ) { throw new IllegalArgumentException( "attempt to compare combinable domains collection from the same species" ); } _combinable_domains_genome_0 = combinable_domains_genome_0; _combinable_domains_genome_1 = combinable_domains_genome_1; init(); forceRecalculation(); } public void addDomainIdToIgnore( final String domain_id_to_ignore ) { forceRecalculation(); getDomainIdsToIgnore().add( domain_id_to_ignore ); } /** * This returns a score between 0.0 (no binary domain combination in common) * and 1.0 (all binary domain combinations in common) measuring the similarity between two * genomes based on the number of shared binary domain combinations: * * t: sum of (distinct) binary domain combinations * s: sum of shared (distinct) binary domain combinations * * 1 - ( ( t - s ) / t ) * * @return shared binary domain combinations based similarity score */ public double calculateSharedBinaryDomainCombinationBasedGenomeSimilarityScore() { final double t = getAllBinaryDomainCombinations().size(); final double s = getSharedBinaryDomainCombinations().size(); if ( t == 0.0 ) { return MIN_SIMILARITY_SCORE; } return ( MAX_SIMILARITY_SCORE - ( ( t - s ) / t ) ); } /** * This returns a score between 0.0 (no domains in common) * and 1.0 (all domains in common) measuring the similarity between two * genomes based on the number of shared domains: * * t: sum of (distinct) domains * s: sum of shared (distinct) domains * * 1 - ( ( t - s ) / t ) * * @return shared domains based similarity score */ public double calculateSharedDomainsBasedGenomeSimilarityScore() { final double t = getAllDomains().size(); final double s = getSharedDomains().size(); if ( t == 0.0 ) { return MIN_SIMILARITY_SCORE; } return ( MAX_SIMILARITY_SCORE - ( ( t - s ) / t ) ); } public void deleteAllDomainIdsToIgnore() { forceRecalculation(); setDomainIdsToIgnore( new HashSet() ); } /** * Does not return binary combinations which contain one or two domains * to be ignored -- if ignoring is allowed. * * @return SortedSet */ public Set getAllBinaryDomainCombinations() { if ( _all_binary_domain_combinations == null ) { final Set all = new HashSet(); all.addAll( getCombinableDomainsGenome0().toBinaryDomainCombinations() ); all.addAll( getCombinableDomainsGenome1().toBinaryDomainCombinations() ); if ( isAllowDomainsToBeIgnored() && !getDomainIdsToIgnore().isEmpty() ) { _all_binary_domain_combinations = pruneBinaryCombinations( all ); } else { _all_binary_domain_combinations = all; } } return _all_binary_domain_combinations; } /** * Does not return domains which are to be * ignored -- if ignoring is allowed. * * * @return */ public Set getAllDomains() { if ( _all_domains == null ) { final Set all = new HashSet(); all.addAll( getCombinableDomainsGenome0().getAllDomainIds() ); all.addAll( getCombinableDomainsGenome1().getAllDomainIds() ); if ( isAllowDomainsToBeIgnored() && !getDomainIdsToIgnore().isEmpty() ) { _all_domains = pruneDomains( all ); } else { _all_domains = all; } } return _all_domains; } public Set getBinaryDomainCombinationsSpecificToGenome0() { if ( _binary_domain_combinations_specific_to_0 == null ) { _binary_domain_combinations_specific_to_0 = getBinaryDomainCombinationsSpecificToGenome( true ); } return _binary_domain_combinations_specific_to_0; } public Set getBinaryDomainCombinationsSpecificToGenome1() { if ( _binary_domain_combinations_specific_to_1 == null ) { _binary_domain_combinations_specific_to_1 = getBinaryDomainCombinationsSpecificToGenome( false ); } return _binary_domain_combinations_specific_to_1; } public Set getDomainsSpecificToGenome0() { if ( _domains_specific_to_0 == null ) { _domains_specific_to_0 = getDomainsSpecificToGenome( true ); } return _domains_specific_to_0; } public Set getDomainsSpecificToGenome1() { if ( _domains_specific_to_1 == null ) { _domains_specific_to_1 = getDomainsSpecificToGenome( false ); } return _domains_specific_to_1; } public Set getSharedBinaryDomainCombinations() { if ( _shared_binary_domain_combinations == null ) { final Set shared = new HashSet(); final Set bc0 = getCombinableDomainsGenome0().toBinaryDomainCombinations(); final Set bc1 = getCombinableDomainsGenome1().toBinaryDomainCombinations(); for( final BinaryDomainCombination binary_domain_combination0 : bc0 ) { if ( bc1.contains( binary_domain_combination0 ) ) { shared.add( binary_domain_combination0 ); } } _shared_binary_domain_combinations = shared; if ( isAllowDomainsToBeIgnored() && !getDomainIdsToIgnore().isEmpty() ) { _shared_binary_domain_combinations = pruneBinaryCombinations( shared ); } } return _shared_binary_domain_combinations; } public Set getSharedDomains() { if ( _shared_domains == null ) { final Set shared = new HashSet(); final Set d0 = getCombinableDomainsGenome0().getAllDomainIds(); final Set d1 = getCombinableDomainsGenome1().getAllDomainIds(); for( final String domain0 : d0 ) { if ( d1.contains( domain0 ) ) { shared.add( domain0 ); } } _shared_domains = shared; if ( isAllowDomainsToBeIgnored() && !getDomainIdsToIgnore().isEmpty() ) { _shared_domains = pruneDomains( shared ); } } return _shared_domains; } public void setAllowDomainsToBeIgnored( final boolean allow_domains_to_be_ignored ) { forceRecalculation(); _allow_domains_to_be_ignored = allow_domains_to_be_ignored; } void setDomainIdsToIgnore( final Set domain_ids_to_ignore ) { forceRecalculation(); _domain_ids_to_ignore = domain_ids_to_ignore; } private void forceRecalculation() { _all_domains = null; _shared_domains = null; _domains_specific_to_0 = null; _domains_specific_to_1 = null; _all_binary_domain_combinations = null; _shared_binary_domain_combinations = null; _binary_domain_combinations_specific_to_0 = null; _binary_domain_combinations_specific_to_1 = null; } private Set getBinaryDomainCombinationsSpecificToGenome( final boolean specific_to_genome_0 ) { final Set specific = new HashSet(); final Set bc0 = getCombinableDomainsGenome0().toBinaryDomainCombinations(); final Set bc1 = getCombinableDomainsGenome1().toBinaryDomainCombinations(); if ( specific_to_genome_0 ) { for( final BinaryDomainCombination binary_domain_combination0 : bc0 ) { if ( !bc1.contains( binary_domain_combination0 ) ) { specific.add( binary_domain_combination0 ); } } } else { for( final BinaryDomainCombination binary_domain_combination1 : bc1 ) { if ( !bc0.contains( binary_domain_combination1 ) ) { specific.add( binary_domain_combination1 ); } } } if ( isAllowDomainsToBeIgnored() && !getDomainIdsToIgnore().isEmpty() ) { return pruneBinaryCombinations( specific ); } return specific; } private GenomeWideCombinableDomains getCombinableDomainsGenome0() { return _combinable_domains_genome_0; } private GenomeWideCombinableDomains getCombinableDomainsGenome1() { return _combinable_domains_genome_1; } private Set getDomainIdsToIgnore() { return _domain_ids_to_ignore; } private Set getDomainsSpecificToGenome( final boolean specific_to_genome_0 ) { final Set specific = new HashSet(); final Set d0 = getCombinableDomainsGenome0().getAllDomainIds(); final Set d1 = getCombinableDomainsGenome1().getAllDomainIds(); if ( specific_to_genome_0 ) { for( final String domain0 : d0 ) { if ( !d1.contains( domain0 ) ) { specific.add( domain0 ); } } } else { for( final String domain1 : d1 ) { if ( !d0.contains( domain1 ) ) { specific.add( domain1 ); } } } if ( isAllowDomainsToBeIgnored() && !getDomainIdsToIgnore().isEmpty() ) { return pruneDomains( specific ); } return specific; } private void init() { deleteAllDomainIdsToIgnore(); setAllowDomainsToBeIgnored( false ); } private boolean isAllowDomainsToBeIgnored() { return _allow_domains_to_be_ignored; } private Set pruneBinaryCombinations( final Set all ) { final Set pruned = new HashSet(); for( final BinaryDomainCombination bc : all ) { if ( ( !getDomainIdsToIgnore().contains( bc.getId0() ) ) && ( !getDomainIdsToIgnore().contains( bc.getId1() ) ) ) { pruned.add( bc ); } } return pruned; } private Set pruneDomains( final Set all ) { final Set pruned = new HashSet(); for( final String d : all ) { if ( !getDomainIdsToIgnore().contains( d ) ) { pruned.add( d ); } } return pruned; } }