2 package org.forester.surfacing;
4 import java.text.DecimalFormat;
5 import java.text.NumberFormat;
6 import java.util.ArrayList;
7 import java.util.Collections;
8 import java.util.Comparator;
9 import java.util.HashMap;
10 import java.util.HashSet;
11 import java.util.List;
14 import java.util.SortedMap;
15 import java.util.SortedSet;
16 import java.util.TreeMap;
17 import java.util.TreeSet;
19 import org.forester.go.GoId;
20 import org.forester.protein.BinaryDomainCombination;
21 import org.forester.protein.BinaryDomainCombination.DomainCombinationType;
22 import org.forester.protein.Domain;
23 import org.forester.protein.Protein;
24 import org.forester.species.Species;
25 import org.forester.util.BasicDescriptiveStatistics;
26 import org.forester.util.DescriptiveStatistics;
27 import org.forester.util.ForesterUtil;
/**
 * GenomeWideCombinableDomains implementation that keeps one CombinableDomains
 * entry per key-domain id in a sorted map, together with the species and the
 * domain combination type passed to the private constructor.
 * Instances are built through the static createInstance(...) factories.
 */
29 public class BasicGenomeWideCombinableDomains implements GenomeWideCombinableDomains {
// Number format using the pattern "0.0E0"; toStringBuilder() applies it to a value
// obtained from the key domain confidence statistics of each entry.
31 private final static NumberFormat FORMATTER = new DecimalFormat( "0.0E0" );
// Comparator over CombinableDomains that compares getKeyDomainCount() of both
// arguments; toStringBuilder() passes it to Collections.sort() for the
// KEY_DOMAIN_COUNT sort order.
// NOTE(review): the constant name says "descending", but the value returned by
// each branch and any tie-break are on lines not visible here -- confirm.
32 private static final Comparator<CombinableDomains> DESCENDING_KEY_DOMAIN_COUNT_ORDER = new Comparator<CombinableDomains>() {
35 public int compare( final CombinableDomains d1,
36 final CombinableDomains d2 ) {
// First test: d1 has the smaller key domain count.
37 if ( d1.getKeyDomainCount() < d2
38 .getKeyDomainCount() ) {
// Second test: d1 has the larger key domain count (start of this condition is not visible).
42 .getKeyDomainCount() > d2
43 .getKeyDomainCount() ) {
// Comparator over CombinableDomains that compares getKeyDomainProteinsCount() of
// both arguments; toStringBuilder() passes it to Collections.sort() for the
// KEY_DOMAIN_PROTEINS_COUNT sort order.
// NOTE(review): the constant name says "descending", but the value returned by
// each branch and any tie-break are on lines not visible here -- confirm.
54 private static final Comparator<CombinableDomains> DESCENDING_KEY_DOMAIN_PROTEINS_COUNT_ORDER = new Comparator<CombinableDomains>() {
57 public int compare( final CombinableDomains d1,
58 final CombinableDomains d2 ) {
// First test: d1 has the smaller key-domain protein count.
59 if ( d1.getKeyDomainProteinsCount() < d2
60 .getKeyDomainProteinsCount() ) {
// Second test: d1 has the larger key-domain protein count (start of this condition is not visible).
64 .getKeyDomainProteinsCount() > d2
65 .getKeyDomainProteinsCount() ) {
// Comparator over CombinableDomains that compares getNumberOfCombinableDomains()
// of both arguments; toStringBuilder() passes it to Collections.sort() for the
// COMBINATIONS_COUNT sort order.
// NOTE(review): the constant name says "descending", but the value returned by
// each branch and any tie-break are on lines not visible here -- confirm.
76 private static final Comparator<CombinableDomains> DESCENDING_COMBINATIONS_COUNT_ORDER = new Comparator<CombinableDomains>() {
79 public int compare( final CombinableDomains d1,
80 final CombinableDomains d2 ) {
// First test: d1 has fewer combinable domains.
81 if ( d1.getNumberOfCombinableDomains() < d2
82 .getNumberOfCombinableDomains() ) {
// Second test: d1 has more combinable domains (start of this condition is not visible).
86 .getNumberOfCombinableDomains() > d2
87 .getNumberOfCombinableDomains() ) {
// Key-domain id -> CombinableDomains entry; created as a TreeMap in the
// constructor, so iteration follows the natural ordering of the id strings.
98 final private SortedMap<String, CombinableDomains> _combinable_domains_map;
// Species of this collection.
// NOTE(review): the assignment is not visible here; presumably set from the constructor argument.
99 final private Species _species;
// Domain combination type of this collection.
// NOTE(review): the assignment is not visible here; presumably set from the constructor argument.
100 final private DomainCombinationType _dc_type;
// Private constructor; the static createInstance(...) factory calls it with the
// target species and combination type. Starts with an empty sorted map.
// NOTE(review): the statements storing species and dc_type are not visible in this listing.
102 private BasicGenomeWideCombinableDomains( final Species species, final DomainCombinationType dc_type ) {
103 _combinable_domains_map = new TreeMap<String, CombinableDomains>();
108 private void add( final String key, final CombinableDomains cdc ) {
109 _combinable_domains_map.put( key, cdc );
113 public boolean contains( final String key_id ) {
114 return _combinable_domains_map.containsKey( key_id );
118 public CombinableDomains get( final String key_id ) {
119 return _combinable_domains_map.get( key_id );
123 public SortedMap<String, CombinableDomains> getAllCombinableDomainsIds() {
124 return _combinable_domains_map;
// Walks every entry of the backing map, asks it for its getAllDomains() list and
// iterates over the returned ids, working with a sorted set of strings.
// NOTE(review): the inner loop body and the return statement are not visible in
// this listing; presumably each id is added to 'domains', which is then returned.
128 public SortedSet<String> getAllDomainIds() {
129 final SortedSet<String> domains = new TreeSet<String>();
130 for( final String key : getAllCombinableDomainsIds().keySet() ) {
131 final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
132 final List<String> ds = cb.getAllDomains();
133 for( final String d : ds ) {
// Accessor for the domain combination type of this collection.
// NOTE(review): the body is not visible in this listing; presumably returns _dc_type.
141 public DomainCombinationType getDomainCombinationType() {
// Looks for the entries whose number of combinable domains equals the maximum
// reported by getPerGenomeDomainPromiscuityStatistics() (cast to int).
// NOTE(review): the body of the equality branch and the return statement are not
// visible here; presumably the matching keys are collected in 'doms' and returned.
146 public SortedSet<String> getMostPromiscuosDomain() {
147 final SortedSet<String> doms = new TreeSet<String>();
// The statistics are recomputed on this call; the maximum is truncated to int.
148 final int max = ( int ) getPerGenomeDomainPromiscuityStatistics().getMax();
149 for( final String key : getAllCombinableDomainsIds().keySet() ) {
150 final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
151 if ( cb.getNumberOfCombinableDomains() == max ) {
// Creates a new BasicDescriptiveStatistics and feeds it the number of combinable
// domains of every entry in the backing map (one value per key domain).
// NOTE(review): the return statement is not visible in this listing; presumably 'stats' is returned.
159 public DescriptiveStatistics getPerGenomeDomainPromiscuityStatistics() {
160 final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
161 for( final String key : getAllCombinableDomainsIds().keySet() ) {
162 final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
163 stats.addValue( cb.getNumberOfCombinableDomains() );
169 public int getSize() {
170 return _combinable_domains_map.size();
// Accessor for the species of this collection.
// NOTE(review): the body is not visible in this listing; presumably returns _species.
174 public Species getSpecies() {
179 public SortedSet<BinaryDomainCombination> toBinaryDomainCombinations() {
180 final SortedSet<BinaryDomainCombination> binary_combinations = new TreeSet<BinaryDomainCombination>();
181 for( final String key : getAllCombinableDomainsIds().keySet() ) {
182 final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
183 for( final BinaryDomainCombination b : cb.toBinaryDomainCombinations() ) {
184 binary_combinations.add( b );
187 return binary_combinations;
191 public String toString() {
192 return toStringBuilder( GenomeWideCombinableDomainsSortOrder.ALPHABETICAL_KEY_ID ).toString();
// Renders one text line per CombinableDomains entry. Each line is assembled from
// ForesterUtil.pad(...) calls for the key domain (width 18), the key domain count,
// the key-domain protein count and the number of combinable domains (width 8 each),
// a FORMATTER-formatted value taken from the key domain confidence statistics,
// followed by the combining domain ids and a line separator.
// The entries are first copied into a list in map (key) order and re-sorted only
// for the KEY_DOMAIN_COUNT, KEY_DOMAIN_PROTEINS_COUNT and COMBINATIONS_COUNT orders.
// NOTE(review): which statistic is formatted, its pad width, and the return
// statement are on lines not visible in this listing -- confirm.
195 // Produces something like:
196 // 2-oxoacid_dh 5 5 2 4.8E-67 Biotin_lipoyl [4], E3_binding [3]
198 public StringBuilder toStringBuilder( final GenomeWideCombinableDomainsSortOrder sort_order ) {
199 final StringBuilder sb = new StringBuilder();
200 final List<CombinableDomains> combinable_domains = new ArrayList<CombinableDomains>();
// Copy all entries into a list so that they can be sorted independently of the map.
201 for( final String key : getAllCombinableDomainsIds().keySet() ) {
202 final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
203 combinable_domains.add( cb );
// Pick the comparator matching the requested sort order; no visible branch
// re-sorts for other values, so the key order of the sorted map is kept.
205 if ( sort_order == GenomeWideCombinableDomainsSortOrder.KEY_DOMAIN_COUNT ) {
206 Collections.sort( combinable_domains, BasicGenomeWideCombinableDomains.DESCENDING_KEY_DOMAIN_COUNT_ORDER );
208 else if ( sort_order == GenomeWideCombinableDomainsSortOrder.KEY_DOMAIN_PROTEINS_COUNT ) {
209 Collections.sort( combinable_domains,
210 BasicGenomeWideCombinableDomains.DESCENDING_KEY_DOMAIN_PROTEINS_COUNT_ORDER );
212 else if ( sort_order == GenomeWideCombinableDomainsSortOrder.COMBINATIONS_COUNT ) {
213 Collections.sort( combinable_domains, BasicGenomeWideCombinableDomains.DESCENDING_COMBINATIONS_COUNT_ORDER );
// Emit one line per entry.
215 for( final CombinableDomains cb : combinable_domains ) {
216 sb.append( ForesterUtil.pad( new StringBuffer( cb.getKeyDomain().toString() ), 18, ' ', false ) );
217 sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getKeyDomainCount() ), 8, ' ', false ) );
218 sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getKeyDomainProteinsCount() ), 8, ' ', false ) );
219 sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getNumberOfCombinableDomains() ), 8, ' ', false ) );
220 sb.append( ForesterUtil.pad( new StringBuffer( ""
221 + FORMATTER.format( cb.getKeyDomainConfidenceDescriptiveStatistics()
226 sb.append( cb.getCombiningDomainIdsAsStringBuilder() );
227 sb.append( ForesterUtil.getLineSeparator() );
// Updates the per-domain tallies for one domain occurrence identified by id_i:
//   domain_counts          - occurrences of the domain id,
//   domain_protein_counts  - incremented only while id_i is not yet in saw_c,
//   stats                  - per-id statistics receiving 'support' as a value
//                            (the caller passes the domain's per-sequence E-value).
// NOTE(review): the declaration of the id_i parameter, the 'else' separating the
// "already known" branch from the first-occurrence initialisation, and any
// statements after the last visible line are not visible in this listing.
// saw_c is only read in the visible lines; where it is updated cannot be seen here.
232 private static void countDomains( final Map<String, Integer> domain_counts,
233 final Map<String, Integer> domain_protein_counts,
234 final Map<String, DescriptiveStatistics> stats,
235 final Set<String> saw_c,
237 final double support ) {
// Known id: bump the occurrence count.
238 if ( domain_counts.containsKey( id_i ) ) {
239 domain_counts.put( id_i, 1 + domain_counts.get( ( id_i ) ) );
// Bump the protein count only if saw_c does not contain this id yet.
240 if ( !saw_c.contains( id_i ) ) {
241 domain_protein_counts.put( id_i, 1 + domain_protein_counts.get( ( id_i ) ) );
// Initialisation for an id: fresh statistics and both counts set to 1
// (presumably the branch for an id not yet in domain_counts -- confirm).
245 stats.put( id_i, new BasicDescriptiveStatistics() );
246 domain_counts.put( id_i, 1 );
247 domain_protein_counts.put( id_i, 1 );
// Record the support value for this occurrence.
249 stats.get( id_i ).addValue( support );
// Convenience factory that delegates to another createInstance overload, passing
// DomainCombinationType.BASIC among the arguments.
// NOTE(review): several arguments of the delegating call are on lines not visible
// in this listing, so the exact target overload cannot be confirmed from here.
253 public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
254 final boolean ignore_combination_with_same_domain,
255 final Species species ) {
256 return createInstance( protein_list,
257 ignore_combination_with_same_domain,
260 DomainCombinationType.BASIC,
265 public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
266 final boolean ignore_combination_with_same_domain,
267 final Species species,
268 final DomainCombinationType dc_type ) {
269 return createInstance( protein_list, ignore_combination_with_same_domain, species, null, dc_type, null, null );
// Main factory: builds a BasicGenomeWideCombinableDomains for 'species' from a
// list of proteins.
//   protein_list                          - proteins to scan; each must report a species equal
//                                           to 'species', otherwise IllegalArgumentException
//   ignore_combination_with_same_domain   - tested before the inner domain loop (the branch
//                                           body is not visible in this listing)
//   domain_id_to_go_ids_map               - only referenced from commented-out code in the
//                                           visible lines
//   dc_type                               - selects the CombinableDomains implementation and
//                                           the pairing rules
//   protein_length_stats_by_dc            - optional (may be null); receives the protein
//                                           length per binary combination string
//   domain_number_stats_by_dc             - optional (may be null); receives the protein's
//                                           domain count per binary combination string
// NOTE(review): many short lines (closing braces, 'else', 'continue', set updates,
// the final return) are not visible in this listing; comments below describe only
// what the visible statements do.
272 public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
273 final boolean ignore_combination_with_same_domain,
274 final Species species,
275 final Map<String, List<GoId>> domain_id_to_go_ids_map,
276 final DomainCombinationType dc_type,
277 final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
278 final Map<String, DescriptiveStatistics> domain_number_stats_by_dc ) {
279 final BasicGenomeWideCombinableDomains instance = new BasicGenomeWideCombinableDomains( species, dc_type );
// Per-domain-id tallies filled by countDomains() and copied into the instance at the end.
280 final Map<String, Integer> domain_counts = new HashMap<String, Integer>();
281 final Map<String, Integer> domain_protein_counts = new HashMap<String, Integer>();
282 final Map<String, DescriptiveStatistics> stats = new HashMap<String, DescriptiveStatistics>();
283 for( final Protein protein : protein_list ) {
// Reject proteins whose species differs from the requested one.
284 if ( !protein.getSpecies().equals( species ) ) {
285 throw new IllegalArgumentException( "species (" + protein.getSpecies()
286 + ") does not match species of combinable domains collection (" + species + ")" );
// Per-protein bookkeeping sets, created anew for every protein.
288 final Set<String> saw_i = new HashSet<String>();
289 final Set<String> saw_c = new HashSet<String>();
290 for( int i = 0; i < protein.getProteinDomains().size(); ++i ) {
291 final Domain pd_i = protein.getProteinDomain( i );
292 final String id_i = pd_i.getDomainId();
293 final int current_start = pd_i.getFrom();
// Tally this domain occurrence; the per-sequence E-value is the support value.
294 BasicGenomeWideCombinableDomains.countDomains( domain_counts,
295 domain_protein_counts,
299 pd_i.getPerSequenceEvalue() );
300 if ( !saw_i.contains( id_i ) ) {
301 if ( dc_type == DomainCombinationType.BASIC ) {
// Reuse the entry already registered for this key domain, or create one whose
// implementation class depends on dc_type.
304 CombinableDomains domain_combination = null;
305 if ( instance.contains( id_i ) ) {
306 domain_combination = instance.get( id_i );
309 if ( dc_type == DomainCombinationType.DIRECTED_ADJACTANT ) {
310 domain_combination = new AdjactantDirectedCombinableDomains( pd_i.getDomainId(), species );
312 else if ( dc_type == DomainCombinationType.DIRECTED ) {
313 domain_combination = new DirectedCombinableDomains( pd_i.getDomainId(), species );
316 domain_combination = new BasicCombinableDomains( pd_i.getDomainId(), species );
// Disabled code: attaching GO ids to the key domain.
318 // ^^ if ( ( domain_id_to_go_ids_map != null )
319 // ^^ && domain_id_to_go_ids_map.containsKey( pd_i.getDomainId() ) ) {
320 // ^^ final List<GoId> go_ids = domain_id_to_go_ids_map.get( pd_i.getDomainId() );
321 // ^^ for( final GoId go_id : go_ids ) {
322 // ^^ domain_combination.getKeyDomain().addGoId( go_id );
325 instance.add( id_i, domain_combination );
// Ids of partner domains already handled for this key domain.
327 final Set<String> saw_j = new HashSet<String>();
328 if ( ignore_combination_with_same_domain ) {
// Candidate partner with the smallest start position seen so far; used for DIRECTED_ADJACTANT below.
331 Domain closest = null;
332 for( int j = 0; j < protein.getNumberOfProteinDomains(); ++j ) {
// For non-BASIC types, test whether domain j starts at or before the current
// domain (the action taken is on a line not visible here).
333 if ( ( dc_type != DomainCombinationType.BASIC )
334 && ( current_start >= protein.getProteinDomain( j ).getFrom() ) ) {
338 final String id = protein.getProteinDomain( j ).getDomainId();
339 if ( !saw_j.contains( id ) ) {
341 if ( dc_type != DomainCombinationType.DIRECTED_ADJACTANT ) {
343 .addCombinableDomain( protein.getProteinDomain( j ).getDomainId() );
// Track the candidate with the smallest start position.
346 if ( closest == null ) {
347 closest = protein.getProteinDomain( j );
350 if ( protein.getProteinDomain( j ).getFrom() < closest.getFrom() ) {
351 closest = protein.getProteinDomain( j );
// DIRECTED_ADJACTANT: only the tracked closest domain is added as a partner.
358 if ( ( dc_type == DomainCombinationType.DIRECTED_ADJACTANT ) && ( closest != null ) ) {
359 domain_combination.addCombinableDomain( closest.getDomainId() );
// Optional output: protein length per binary combination string.
361 if ( protein_length_stats_by_dc != null ) {
362 final List<BinaryDomainCombination> dcs = domain_combination.toBinaryDomainCombinations();
363 for( final BinaryDomainCombination dc : dcs ) {
364 final String dc_str = dc.toString();
365 if ( !protein_length_stats_by_dc.containsKey( dc_str ) ) {
366 protein_length_stats_by_dc.put( dc_str, new BasicDescriptiveStatistics() );
368 protein_length_stats_by_dc.get( dc_str ).addValue( protein.getLength() );
// Optional output: number of domains of the protein per binary combination string.
371 if ( domain_number_stats_by_dc != null ) {
372 final List<BinaryDomainCombination> dcs = domain_combination.toBinaryDomainCombinations();
373 for( final BinaryDomainCombination dc : dcs ) {
374 final String dc_str = dc.toString();
375 if ( !domain_number_stats_by_dc.containsKey( dc_str ) ) {
376 domain_number_stats_by_dc.put( dc_str, new BasicDescriptiveStatistics() );
378 domain_number_stats_by_dc.get( dc_str ).addValue( protein.getNumberOfProteinDomains() );
// Copy the collected tallies into the corresponding entries of the instance.
385 for( final String key_id : domain_counts.keySet() ) {
386 instance.get( key_id ).setKeyDomainCount( domain_counts.get( key_id ) );
387 instance.get( key_id ).setKeyDomainProteinsCount( domain_protein_counts.get( key_id ) );
388 instance.get( key_id ).setKeyDomainConfidenceDescriptiveStatistics( stats.get( key_id ) );