// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
//
// Contact: phylosoft @ gmail . com
-// WWW: www.phylosoft.org/forester
+// WWW: https://sites.google.com/site/cmzmasek/home/software/forester
package org.forester.surfacing;
+import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
-import org.forester.protein.DomainId;
import org.forester.species.Species;
import org.forester.util.BasicDescriptiveStatistics;
import org.forester.util.DescriptiveStatistics;
+import org.forester.util.ForesterUtil;
public class BasicDomainSimilarityCalculator implements DomainSimilarityCalculator {
final DomainSimilarity.DomainSimilaritySortField _sort;
+ private final boolean _calc_similarity_score;
private final boolean _sort_by_species_count_first;
private final boolean _treat_as_binary_comparison;
+ private final boolean _verbose;
+ /**
+ * Creates a calculator and stores each argument in the corresponding final field.
+ *
+ * @param sort sort field, stored in {@code _sort}
+ * @param sort_by_species_count_first stored in {@code _sort_by_species_count_first}
+ * @param treat_as_binary_comparison stored in {@code _treat_as_binary_comparison}
+ * @param calc_similarity_score stored in {@code _calc_similarity_score}; reported by
+ * {@link #isCalcSimilarityScore()}
+ * @param verbose stored in {@code _verbose}; when true, progress information is printed
+ * to {@code System.out} while similarities are calculated
+ */
public BasicDomainSimilarityCalculator( final DomainSimilarity.DomainSimilaritySortField sort,
final boolean sort_by_species_count_first,
- final boolean treat_as_binary_comparison ) {
+ final boolean treat_as_binary_comparison,
+ final boolean calc_similarity_score,
+ final boolean verbose ) {
_sort = sort;
_sort_by_species_count_first = sort_by_species_count_first;
_treat_as_binary_comparison = treat_as_binary_comparison;
+ _calc_similarity_score = calc_similarity_score;
+ _verbose = verbose;
+ }
+
+ /**
+ * Creates a calculator with verbose output switched off.
+ * <p>
+ * Equivalent to calling the five-argument constructor with {@code verbose} set to
+ * {@code false}.
+ *
+ * @param sort sort field, stored in {@code _sort}
+ * @param sort_by_species_count_first stored in {@code _sort_by_species_count_first}
+ * @param treat_as_binary_comparison stored in {@code _treat_as_binary_comparison}
+ * @param calc_similarity_score stored in {@code _calc_similarity_score}
+ */
+ public BasicDomainSimilarityCalculator( final DomainSimilarity.DomainSimilaritySortField sort,
+ final boolean sort_by_species_count_first,
+ final boolean treat_as_binary_comparison,
+ final boolean calc_similarity_score ) {
+ // Delegate so that field initialisation lives in exactly one constructor.
+ this( sort, sort_by_species_count_first, treat_as_binary_comparison, calc_similarity_score, false );
}
@Override
throw new IllegalArgumentException( "attempt to calculate multiple combinable domains similarity for less than two combinale domains collections" );
}
final SortedSet<DomainSimilarity> similarities = new TreeSet<DomainSimilarity>();
- final SortedSet<DomainId> keys = new TreeSet<DomainId>();
+ final SortedSet<String> keys = new TreeSet<String>();
for( final GenomeWideCombinableDomains cdc : cdc_list ) {
keys.addAll( ( cdc ).getAllCombinableDomainsIds().keySet() );
}
- for( final DomainId key : keys ) {
+ final DecimalFormat pf = new java.text.DecimalFormat( "000000" );
+ int counter = 1;
+ if ( _verbose ) {
+ System.out.println( keys.size() );
+ }
+ for( final String key : keys ) {
+ if ( _verbose ) {
+ ForesterUtil.updateProgress( counter, pf );
+ }
+ counter++;
final List<CombinableDomains> same_id_cd_list = new ArrayList<CombinableDomains>( cdc_list.size() );
final List<Species> species_with_key_id_domain = new ArrayList<Species>();
for( final GenomeWideCombinableDomains cdc : cdc_list ) {
continue;
}
}
- // BIG CHANGE IN LOGIC: Tuesday July 08, 0;55
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // OLD: if ( same_id_cd_list.size() > 1 ) {
if ( same_id_cd_list.size() > 0 ) {
if ( !ignore_domains_specific_to_one_genome || ( same_id_cd_list.size() > 1 ) ) {
final DomainSimilarity s = calculateSimilarity( pairwise_calculator, same_id_cd_list );
}
}
}
- // ~~~ NEW:
else {
throw new RuntimeException( "this should not have happened" );
}
- // ~~~ OLD:
- // else if ( same_id_cd_list.size() == 1 ) {
- // TODO need to go in file
- // System.out.println( "only in one species [" +
- // species_with_key_id_domain.get( 0 ) + "]: " + key_id );
- //}
- //else {
- // throw new RuntimeException( "this should not have happened" );
- // }
+ }
+ if ( _verbose ) {
+ System.out.println();
}
return similarities;
}
+ /**
+ * Returns the {@code calc_similarity_score} flag this calculator was constructed with.
+ *
+ * @return the value of {@code _calc_similarity_score}
+ */
+ public boolean isCalcSimilarityScore() {
+ return _calc_similarity_score;
+ }
+
private DomainSimilarity calculateSimilarity( final PairwiseDomainSimilarityCalculator pairwise_calculator,
final List<CombinableDomains> domains_list ) {
if ( domains_list.size() == 1 ) {
- // BIG CHANGE IN LOGIC: Tuesday July 08, 0;55
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // ~~~OLD:
- //throw new IllegalArgumentException( "attempt to calculate multiple combinable domains similarity for less than two combinable domains" );
- // ~~~new:
- final SortedMap<Species, SpeciesSpecificDomainSimilariyData> species_data = new TreeMap<Species, SpeciesSpecificDomainSimilariyData>();
+ final SortedMap<Species, SpeciesSpecificDcData> species_data = new TreeMap<Species, SpeciesSpecificDcData>();
species_data.put( domains_list.get( 0 ).getSpecies(),
createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) );
- return new PrintableDomainSimilarity( domains_list.get( 0 ),
- 1.0,
- 1.0,
- 1.0,
- 1.0,
- 0.0,
- 0,
- 0,
- 0,
- species_data,
- getSort(),
- isSortBySpeciesCountFirst(),
- isTreatAsBinaryComparison() );
+ if ( !isCalcSimilarityScore() ) {
+ return new DomainSimilarity( domains_list.get( 0 ),
+ 0,
+ 0,
+ species_data,
+ isSortBySpeciesCountFirst(),
+ isTreatAsBinaryComparison() );
+ }
+ else {
+ return new DomainSimilarity( domains_list.get( 0 ),
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 0.0,
+ 0,
+ 0,
+ 0,
+ species_data,
+ isSortBySpeciesCountFirst(),
+ isTreatAsBinaryComparison() );
+ }
}
- final DescriptiveStatistics stat = new BasicDescriptiveStatistics();
- final SortedMap<Species, SpeciesSpecificDomainSimilariyData> species_data = new TreeMap<Species, SpeciesSpecificDomainSimilariyData>();
+ DescriptiveStatistics stat = null;
+ if ( isCalcSimilarityScore() ) {
+ stat = new BasicDescriptiveStatistics();
+ }
+ final SortedMap<Species, SpeciesSpecificDcData> species_data = new TreeMap<Species, SpeciesSpecificDcData>();
species_data.put( domains_list.get( 0 ).getSpecies(),
createSpeciesSpecificDomainSimilariyData( domains_list.get( 0 ) ) );
int max_difference_in_counts = 0;
if ( Math.abs( difference ) > Math.abs( max_difference ) ) {
max_difference = difference;
}
- stat.addValue( pairwise_similarity.getSimilarityScore() );
+ if ( isCalcSimilarityScore() ) {
+ stat.addValue( pairwise_similarity.getSimilarityScore() );
+ }
}
}
- if ( stat.getN() < 1 ) {
- throw new AssertionError( "empty descriptive statistics: this should not have happened" );
- }
- if ( ( stat.getN() != 1 ) && isTreatAsBinaryComparison() ) {
- throw new IllegalArgumentException( "attmpt to treat similarity with N not equal to one as binary comparison" );
+ if ( isCalcSimilarityScore() ) {
+ if ( stat.getN() < 1 ) {
+ throw new RuntimeException( "empty descriptive statistics: this should not have happened" );
+ }
+ if ( ( stat.getN() != 1 ) && isTreatAsBinaryComparison() ) {
+ throw new IllegalArgumentException( "attmpt to treat similarity with N not equal to one as binary comparison" );
+ }
}
- if ( ( /*stat.getN() != 1 ||*/!isTreatAsBinaryComparison() ) && ( max_difference_in_counts < 0 ) ) {
+ if ( !isTreatAsBinaryComparison() && ( max_difference_in_counts < 0 ) ) {
max_difference_in_counts = Math.abs( max_difference_in_counts );
if ( !is_domain_combination_based ) {
- max_difference = Math.abs( max_difference ); //=max_difference_in_counts for !is_domain_combination_based.
+ max_difference = Math.abs( max_difference );
}
}
DomainSimilarity similarity = null;
- if ( stat.getN() == 1 ) {
- similarity = new PrintableDomainSimilarity( domains_list.get( 0 ),
- stat.getMin(),
- stat.getMax(),
- stat.arithmeticMean(),
- stat.median(),
- 0.0,
- stat.getN(),
- max_difference_in_counts,
- max_difference,
- species_data,
- getSort(),
- isSortBySpeciesCountFirst(),
- isTreatAsBinaryComparison() );
+ if ( !isCalcSimilarityScore() ) {
+ similarity = new DomainSimilarity( domains_list.get( 0 ),
+ max_difference_in_counts,
+ max_difference,
+ species_data,
+ isSortBySpeciesCountFirst(),
+ isTreatAsBinaryComparison() );
}
else {
- similarity = new PrintableDomainSimilarity( domains_list.get( 0 ),
- stat.getMin(),
- stat.getMax(),
- stat.arithmeticMean(),
- stat.median(),
- stat.sampleStandardDeviation(),
- stat.getN(),
- max_difference_in_counts,
- max_difference,
- species_data,
- getSort(),
- isSortBySpeciesCountFirst(),
- isTreatAsBinaryComparison() );
+ if ( stat.getN() == 1 ) {
+ similarity = new DomainSimilarity( domains_list.get( 0 ),
+ stat.getMin(),
+ stat.getMax(),
+ stat.arithmeticMean(),
+ stat.median(),
+ 0.0,
+ stat.getN(),
+ max_difference_in_counts,
+ max_difference,
+ species_data,
+ isSortBySpeciesCountFirst(),
+ isTreatAsBinaryComparison() );
+ }
+ else {
+ similarity = new DomainSimilarity( domains_list.get( 0 ),
+ stat.getMin(),
+ stat.getMax(),
+ stat.arithmeticMean(),
+ stat.median(),
+ stat.sampleStandardDeviation(),
+ stat.getN(),
+ max_difference_in_counts,
+ max_difference,
+ species_data,
+ isSortBySpeciesCountFirst(),
+ isTreatAsBinaryComparison() );
+ }
}
return similarity;
}
- private DomainSimilarity.DomainSimilaritySortField getSort() {
- return _sort;
- }
-
+ /**
+ * Returns the {@code sort_by_species_count_first} flag this calculator was constructed with.
+ *
+ * @return the value of {@code _sort_by_species_count_first}
+ */
private boolean isSortBySpeciesCountFirst() {
return _sort_by_species_count_first;
}
return _treat_as_binary_comparison;
}
- private static SpeciesSpecificDomainSimilariyData createSpeciesSpecificDomainSimilariyData( final CombinableDomains cd ) {
- final SpeciesSpecificDomainSimilariyData sd = new PrintableSpeciesSpecificDomainSimilariyData( cd.getKeyDomainProteinsCount(),
- cd.getKeyDomainCount(),
- cd.getNumberOfCombinableDomains(),
- cd.getKeyDomainConfidenceDescriptiveStatistics() );
- for( final DomainId domain : cd.getCombinableDomains() ) {
+ private static SpeciesSpecificDcData createSpeciesSpecificDomainSimilariyData( final CombinableDomains cd ) {
+ final SpeciesSpecificDcData sd = new PrintableSpeciesSpecificDcData( cd.getKeyDomainCount(),
+ cd.getNumberOfCombinableDomains() );
+ for( final String prot : cd.getKeyDomainProteins() ) {
+ sd.addKeyDomainProtein( prot );
+ }
+ for( final String domain : cd.getCombinableDomains() ) {
sd.addProteinsExhibitingCombinationCount( domain, cd.getNumberOfProteinsExhibitingCombination( domain ) );
}
return sd;