2 package org.forester.surfacing;
4 import java.text.DecimalFormat;
5 import java.text.NumberFormat;
6 import java.util.ArrayList;
7 import java.util.Collections;
8 import java.util.Comparator;
9 import java.util.HashMap;
10 import java.util.HashSet;
11 import java.util.List;
14 import java.util.SortedMap;
15 import java.util.SortedSet;
16 import java.util.TreeMap;
17 import java.util.TreeSet;
19 import org.forester.go.GoId;
20 import org.forester.surfacing.BinaryDomainCombination.DomainCombinationType;
21 import org.forester.util.BasicDescriptiveStatistics;
22 import org.forester.util.DescriptiveStatistics;
23 import org.forester.util.ForesterUtil;
/**
 * Genome-wide collection of combinable domains: a sorted map from each key {@link DomainId} to the
 * {@link CombinableDomains} recorded for it, tagged with one {@link Species} and one
 * {@link DomainCombinationType}.
 * <p>
 * The constructor is private; instances are obtained from the static {@code createInstance} factories.
 */
25 public class BasicGenomeWideCombinableDomains implements GenomeWideCombinableDomains {
// Number format with the scientific-notation pattern "0.0E0" (one fraction digit), matching values such as
// the "4.8E-67" column shown in the sample line above toStringBuilder.
// NOTE(review): java.text.DecimalFormat is documented as not synchronized, and this instance is shared
// statically -- confirm it is never used from several threads at once.
27 private final static NumberFormat FORMATTER = new DecimalFormat( "0.0E0" );
/**
 * Comparator over {@link CombinableDomains} that compares the values of {@code getKeyDomainCount()}.
 * Used by toStringBuilder for the KEY_DOMAIN_COUNT sort order.
 * <p>
 * NOTE(review): the values returned by each branch are not visible in this excerpt; going by the constant's
 * name, larger counts presumably sort first -- confirm against the complete source.
 */
28 private static final Comparator<CombinableDomains> DESCENDING_KEY_DOMAIN_COUNT_ORDER = new Comparator<CombinableDomains>() {
30 public int compare( final CombinableDomains d1,
31 final CombinableDomains d2 ) {
// First test: key domain count "less than" d2's count.
33 .getKeyDomainCount() < d2
34 .getKeyDomainCount() ) {
// Second test: key domain count "greater than" d2's count.
38 .getKeyDomainCount() > d2
39 .getKeyDomainCount() ) {
/**
 * Comparator over {@link CombinableDomains} that compares the values of {@code getKeyDomainProteinsCount()}.
 * Used by toStringBuilder for the KEY_DOMAIN_PROTEINS_COUNT sort order.
 * <p>
 * NOTE(review): the values returned by each branch are not visible in this excerpt; going by the constant's
 * name, larger counts presumably sort first -- confirm against the complete source.
 */
52 private static final Comparator<CombinableDomains> DESCENDING_KEY_DOMAIN_PROTEINS_COUNT_ORDER = new Comparator<CombinableDomains>() {
54 public int compare( final CombinableDomains d1,
55 final CombinableDomains d2 ) {
// First test: key domain proteins count "less than" d2's count.
57 .getKeyDomainProteinsCount() < d2
58 .getKeyDomainProteinsCount() ) {
// Second test: key domain proteins count "greater than" d2's count.
62 .getKeyDomainProteinsCount() > d2
63 .getKeyDomainProteinsCount() ) {
/**
 * Comparator over {@link CombinableDomains} that compares the values of {@code getNumberOfCombinableDomains()}.
 * Used by toStringBuilder for the COMBINATIONS_COUNT sort order.
 * <p>
 * NOTE(review): the values returned by each branch are not visible in this excerpt; going by the constant's
 * name, larger counts presumably sort first -- confirm against the complete source.
 */
76 private static final Comparator<CombinableDomains> DESCENDING_COMBINATIONS_COUNT_ORDER = new Comparator<CombinableDomains>() {
78 public int compare( final CombinableDomains d1,
79 final CombinableDomains d2 ) {
// First test: number of combinable domains "less than" d2's number.
81 .getNumberOfCombinableDomains() < d2
82 .getNumberOfCombinableDomains() ) {
// Second test: number of combinable domains "greater than" d2's number.
86 .getNumberOfCombinableDomains() > d2
87 .getNumberOfCombinableDomains() ) {
// Key domain id -> combinable domains of that key domain; sorted by the ordering of DomainId
// (the constructor creates it as a TreeMap).
100 final private SortedMap<DomainId, CombinableDomains> _combinable_domains_map;
// Species this collection was created for (constructor argument).
101 final private Species _species;
// Domain combination type this collection was created with (constructor argument).
102 final private DomainCombinationType _dc_type;
/**
 * Private constructor; starts with an empty, sorted map of combinable domains.
 * <p>
 * NOTE(review): the assignments of the final fields {@code _species} and {@code _dc_type} are not visible
 * in this excerpt; being final and without initializers, they have to be assigned here -- presumably from
 * the two arguments.
 *
 * @param species the species of the collection
 * @param dc_type the domain combination type of the collection
 */
104 private BasicGenomeWideCombinableDomains( final Species species, final DomainCombinationType dc_type ) {
105 _combinable_domains_map = new TreeMap<DomainId, CombinableDomains>();
/**
 * Stores {@code cdc} under {@code key} in the internal map, replacing any entry already stored for that key
 * ({@code Map.put} semantics).
 *
 * @param key the key domain id
 * @param cdc the combinable domains to associate with {@code key}
 */
110 private void add( final DomainId key, final CombinableDomains cdc ) {
111 _combinable_domains_map.put( key, cdc );
/**
 * Tells whether the internal map has an entry for the given key domain id.
 *
 * @param key_id the key domain id to look up
 * @return true if an entry exists for {@code key_id}
 */
114 public boolean contains( final DomainId key_id ) {
115 return _combinable_domains_map.containsKey( key_id );
/**
 * Looks up the combinable domains stored for the given key domain id.
 *
 * @param key_id the key domain id to look up
 * @return the stored {@link CombinableDomains}, or null when the map holds no entry for {@code key_id}
 *         ({@code Map.get} semantics)
 */
118 public CombinableDomains get( final DomainId key_id ) {
119 return _combinable_domains_map.get( key_id );
/**
 * Returns the internal map from key domain id to combinable domains.
 * <p>
 * The map itself is returned, not a copy, so changes made by the caller affect this object.
 *
 * @return the backing sorted map
 */
122 public SortedMap<DomainId, CombinableDomains> getAllCombinableDomainsIds() {
123 return _combinable_domains_map;
/**
 * Walks every entry of the map and iterates over the domain ids reported by
 * {@code CombinableDomains.getAllDomains()} for it.
 * <p>
 * NOTE(review): the body of the inner loop and the return statement are not visible in this excerpt;
 * presumably each id is added to the sorted set {@code domains}, which is then returned -- confirm.
 */
127 public SortedSet<DomainId> getAllDomainIds() {
128 final SortedSet<DomainId> domains = new TreeSet<DomainId>();
129 for( final DomainId key : getAllCombinableDomainsIds().keySet() ) {
130 final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
131 final List<DomainId> ds = cb.getAllDomains();
132 for( final DomainId d : ds ) {
/**
 * Accessor for the domain combination type of this collection.
 * <p>
 * NOTE(review): the method body is not visible in this excerpt; presumably it returns {@code _dc_type}.
 */
140 public DomainCombinationType getDomainCombinationType() {
/**
 * Finds the key domains whose number of combinable domains equals the genome-wide maximum.
 * <p>
 * The maximum is taken from {@code getPerGenomeDomainPromiscuityStatistics()}, which builds its statistics
 * anew on each call, and is truncated to int before the comparison.
 * <p>
 * NOTE(review): the body of the equality branch and the return statement are not visible in this excerpt;
 * presumably matching key ids are collected in {@code doms}, which is then returned. The result for an
 * empty map depends on what {@code getMax()} yields for empty statistics -- confirm.
 */
145 public SortedSet<DomainId> getMostPromiscuosDomain() {
146 final SortedSet<DomainId> doms = new TreeSet<DomainId>();
147 final int max = ( int ) getPerGenomeDomainPromiscuityStatistics().getMax();
148 for( final DomainId key : getAllCombinableDomainsIds().keySet() ) {
149 final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
150 if ( cb.getNumberOfCombinableDomains() == max ) {
/**
 * Builds fresh descriptive statistics with one value per key domain: its number of combinable domains.
 * <p>
 * NOTE(review): the return statement is not visible in this excerpt; presumably {@code stats} is returned.
 */
158 public DescriptiveStatistics getPerGenomeDomainPromiscuityStatistics() {
159 final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
160 for( final DomainId key : getAllCombinableDomainsIds().keySet() ) {
161 final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
162 stats.addValue( cb.getNumberOfCombinableDomains() );
/**
 * Returns the number of entries in the internal map, i.e. the number of key domains.
 *
 * @return the size of the map
 */
167 public int getSize() {
168 return _combinable_domains_map.size();
/**
 * Accessor for the species of this collection.
 * <p>
 * NOTE(review): the method body is not visible in this excerpt; presumably it returns {@code _species}.
 */
171 public Species getSpecies() {
/**
 * Collects the binary domain combinations of all key domains into one sorted set.
 * <p>
 * Every entry of the map contributes the elements of its own {@code toBinaryDomainCombinations()}; as the
 * result is a set, a combination reported by several entries appears only once.
 *
 * @return a new sorted set with the binary domain combinations of every entry
 */
176 public SortedSet<BinaryDomainCombination> toBinaryDomainCombinations() {
177 final SortedSet<BinaryDomainCombination> binary_combinations = new TreeSet<BinaryDomainCombination>();
178 for( final DomainId key : getAllCombinableDomainsIds().keySet() ) {
179 final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
180 for( final BinaryDomainCombination b : cb.toBinaryDomainCombinations() ) {
181 binary_combinations.add( b );
184 return binary_combinations;
/**
 * Returns the text produced by {@code toStringBuilder} for the ALPHABETICAL_KEY_ID sort order.
 *
 * @return the textual listing of this collection
 */
188 public String toString() {
189 return toStringBuilder( GenomeWideCombinableDomainsSortOrder.ALPHABETICAL_KEY_ID ).toString();
192 // Produces something like:
193 // 2-oxoacid_dh 5 5 2 4.8E-67 Biotin_lipoyl [4], E3_binding [3]
/**
 * Renders one line of text per key domain.
 * <p>
 * In append order, each line holds: the key domain, the key domain count, the key domain proteins count,
 * the number of combinable domains, the formatted median of the key domain confidence statistics, the
 * combining domain ids, and a line separator.
 * <p>
 * For KEY_DOMAIN_COUNT, KEY_DOMAIN_PROTEINS_COUNT and COMBINATIONS_COUNT the entries are sorted with the
 * matching comparator constant of this class; for any other sort order the list keeps the key order of
 * the map.
 * <p>
 * NOTE(review): parts of the statement that appends the median, and the return statement, are not visible
 * in this excerpt; presumably the median is formatted with {@code FORMATTER} and {@code sb} is returned.
 *
 * @param sort_order the order in which the key domains are listed
 */
194 public StringBuilder toStringBuilder( final GenomeWideCombinableDomainsSortOrder sort_order ) {
195 final StringBuilder sb = new StringBuilder();
196 final List<CombinableDomains> combinable_domains = new ArrayList<CombinableDomains>();
// Copy the map values into a list, in the key order of the map, so that they can be re-sorted.
197 for( final DomainId key : getAllCombinableDomainsIds().keySet() ) {
198 final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
199 combinable_domains.add( cb );
201 if ( sort_order == GenomeWideCombinableDomainsSortOrder.KEY_DOMAIN_COUNT ) {
202 Collections.sort( combinable_domains, BasicGenomeWideCombinableDomains.DESCENDING_KEY_DOMAIN_COUNT_ORDER );
204 else if ( sort_order == GenomeWideCombinableDomainsSortOrder.KEY_DOMAIN_PROTEINS_COUNT ) {
205 Collections.sort( combinable_domains,
206 BasicGenomeWideCombinableDomains.DESCENDING_KEY_DOMAIN_PROTEINS_COUNT_ORDER );
208 else if ( sort_order == GenomeWideCombinableDomainsSortOrder.COMBINATIONS_COUNT ) {
209 Collections.sort( combinable_domains, BasicGenomeWideCombinableDomains.DESCENDING_COMBINATIONS_COUNT_ORDER );
211 for( final CombinableDomains cb : combinable_domains ) {
// Columns go through ForesterUtil.pad with 18 for the key domain and 8 for each of the three counts;
// presumably these are column widths filled with spaces -- confirm against ForesterUtil.pad.
212 sb.append( ForesterUtil.pad( new StringBuffer( cb.getKeyDomain().toString() ), 18, ' ', false ) );
213 sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getKeyDomainCount() ), 8, ' ', false ) );
214 sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getKeyDomainProteinsCount() ), 8, ' ', false ) );
215 sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getNumberOfCombinableDomains() ), 8, ' ', false ) );
217 .append( ForesterUtil
218 .pad( new StringBuffer( ""
220 .format( cb.getKeyDomainConfidenceDescriptiveStatistics().median() ) ),
224 sb.append( cb.getCombiningDomainIdsAsStringBuilder() );
225 sb.append( ForesterUtil.getLineSeparator() );
/**
 * Updates the per-domain tallies for one occurrence of the domain {@code id_i}.
 * <p>
 * When {@code id_i} already has a count, the count is incremented, and the protein count is incremented
 * too unless {@code saw_c} already contains the id. The statements that create the statistics object and
 * set both counts to 1 presumably form the else branch for a domain seen for the first time. The support
 * value is added to the statistics of {@code id_i}.
 * <p>
 * NOTE(review): the declaration of the {@code id_i} parameter, the {@code else} line and any statement
 * after the final {@code addValue} call (for example one that records {@code id_i} in {@code saw_c}) are
 * not visible in this excerpt -- confirm against the complete source.
 *
 * @param domain_counts         number of occurrences per domain id
 * @param domain_protein_counts per-domain count that is incremented only while {@code saw_c} lacks the id
 * @param stats                 statistics over the support values, per domain id
 * @param saw_c                 ids for which the protein count is not to be incremented again
 * @param support               value added to the statistics; the visible caller passes the per-sequence
 *                              E-value of the domain
 */
230 private static void countDomains( final Map<DomainId, Integer> domain_counts,
231 final Map<DomainId, Integer> domain_protein_counts,
232 final Map<DomainId, DescriptiveStatistics> stats,
233 final Set<DomainId> saw_c,
235 final double support ) {
236 if ( domain_counts.containsKey( id_i ) ) {
237 domain_counts.put( id_i, 1 + domain_counts.get( ( id_i ) ) );
238 if ( !saw_c.contains( id_i ) ) {
239 domain_protein_counts.put( id_i, 1 + domain_protein_counts.get( ( id_i ) ) );
// Initial state of a domain: empty statistics and both counts at 1.
243 stats.put( id_i, new BasicDescriptiveStatistics() );
244 domain_counts.put( id_i, 1 );
245 domain_protein_counts.put( id_i, 1 );
247 stats.get( id_i ).addValue( support );
/**
 * Convenience factory that delegates to another {@code createInstance} overload with
 * {@code DomainCombinationType.BASIC} as the last argument.
 * <p>
 * NOTE(review): the arguments passed between {@code ignore_combination_with_same_domain} and the
 * combination type are not visible in this excerpt -- confirm which overload is called.
 *
 * @param protein_list                        the proteins to collect domains from
 * @param ignore_combination_with_same_domain passed through unchanged to the delegate
 * @param species                             the species of the collection
 */
251 public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
252 final boolean ignore_combination_with_same_domain,
253 final Species species ) {
254 return createInstance( protein_list,
255 ignore_combination_with_same_domain,
258 DomainCombinationType.BASIC );
/**
 * Convenience factory without GO annotation: delegates to the five-argument {@code createInstance},
 * passing null as the domain-id-to-GO-ids map.
 *
 * @param protein_list                        the proteins to collect domains from
 * @param ignore_combination_with_same_domain passed through unchanged to the delegate
 * @param species                             the species of the collection
 * @param dc_type                             the domain combination type of the collection
 * @return the instance created by the delegate
 */
261 public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
262 final boolean ignore_combination_with_same_domain,
263 final Species species,
264 final DomainCombinationType dc_type ) {
265 return createInstance( protein_list, ignore_combination_with_same_domain, species, null, dc_type );
/**
 * Builds a genome-wide collection of combinable domains from a list of proteins.
 * <p>
 * For every domain of every protein the per-domain tallies are updated through {@code countDomains}, a
 * {@link CombinableDomains} object for the domain is fetched from the new instance or created according
 * to {@code dc_type}, and domains of the same protein are registered as its combination partners. At the
 * end, the counts and statistics gathered per domain id are copied onto the stored objects.
 * <p>
 * NOTE(review): several short lines of this method (some {@code else} lines, single statements inside
 * branches, closing braces and the return statement) are not visible in this excerpt. Comments below that
 * depend on them are marked as assumptions.
 *
 * @param protein_list                        the proteins to process; each must belong to {@code species}
 * @param ignore_combination_with_same_domain guards a branch whose body is not visible here; going by the
 *                                            name, presumably excludes a domain combining with itself
 * @param species                             the species of the collection
 * @param domain_id_to_go_ids_map             GO ids per domain id, may be null; used to annotate the key
 *                                            domain of newly created combinable domains
 * @param dc_type                             selects the {@link CombinableDomains} implementation and
 *                                            how combination partners are picked
 * @throws IllegalArgumentException if the species of a protein does not equal {@code species}
 */
268 public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
269 final boolean ignore_combination_with_same_domain,
270 final Species species,
271 final Map<DomainId, List<GoId>> domain_id_to_go_ids_map,
272 final DomainCombinationType dc_type ) {
273 final BasicGenomeWideCombinableDomains instance = new BasicGenomeWideCombinableDomains( species, dc_type );
// Tallies per domain id, filled by countDomains and copied onto the instance at the end.
274 final Map<DomainId, Integer> domain_counts = new HashMap<DomainId, Integer>();
275 final Map<DomainId, Integer> domain_protein_counts = new HashMap<DomainId, Integer>();
276 final Map<DomainId, DescriptiveStatistics> stats = new HashMap<DomainId, DescriptiveStatistics>();
277 for( final Protein protein : protein_list ) {
278 if ( !protein.getSpecies().equals( species ) ) {
279 throw new IllegalArgumentException( "species (" + protein.getSpecies()
280 + ") does not match species of combinable domains collection (" + species + ")" );
// Both sets are created anew for every protein.
282 final Set<DomainId> saw_i = new HashSet<DomainId>();
283 final Set<DomainId> saw_c = new HashSet<DomainId>();
284 for( int i = 0; i < protein.getProteinDomains().size(); ++i ) {
285 final Domain pd_i = protein.getProteinDomain( i );
286 final DomainId id_i = pd_i.getDomainId();
287 final int current_start = pd_i.getFrom();
// The per-sequence E-value of the domain is the support value given to countDomains; the remaining
// arguments of this call are not all visible in this excerpt.
288 BasicGenomeWideCombinableDomains.countDomains( domain_counts,
289 domain_protein_counts,
293 pd_i.getPerSequenceEvalue() );
294 if ( !saw_i.contains( id_i ) ) {
// NOTE(review): the body of the BASIC branch is not visible; presumably it records id_i in saw_i.
295 if ( dc_type == DomainCombinationType.BASIC ) {
// Reuse the object stored for this key domain, or create one whose class depends on dc_type.
298 CombinableDomains domain_combination = null;
299 if ( instance.contains( id_i ) ) {
300 domain_combination = instance.get( id_i );
303 if ( dc_type == DomainCombinationType.DIRECTED_ADJACTANT ) {
304 domain_combination = new AdjactantDirectedCombinableDomains( pd_i.getDomainId(), species );
306 else if ( dc_type == DomainCombinationType.DIRECTED ) {
307 domain_combination = new DirectedCombinableDomains( pd_i.getDomainId(), species );
310 domain_combination = new BasicCombinableDomains( pd_i.getDomainId(), species );
// Attach the GO ids known for this domain, if a map was supplied and it has an entry.
312 if ( ( domain_id_to_go_ids_map != null )
313 && domain_id_to_go_ids_map.containsKey( pd_i.getDomainId() ) ) {
314 final List<GoId> go_ids = domain_id_to_go_ids_map.get( pd_i.getDomainId() );
315 for( final GoId go_id : go_ids ) {
316 domain_combination.getKeyDomain().addGoId( go_id );
319 instance.add( id_i, domain_combination );
// saw_j holds the ids that the partner loop below will not process.
321 final Set<DomainId> saw_j = new HashSet<DomainId>();
// NOTE(review): the body of this branch is not visible; presumably it puts id_i into saw_j.
322 if ( ignore_combination_with_same_domain ) {
325 Domain closest = null;
326 for( int j = 0; j < protein.getNumberOfProteinDomains(); ++j ) {
// For the non-BASIC types, domain j is tested for starting at or before the current domain.
// NOTE(review): the body of this branch is not visible; presumably such domains are skipped.
327 if ( ( dc_type != DomainCombinationType.BASIC )
328 && ( current_start >= protein.getProteinDomain( j ).getFrom() ) ) {
332 final DomainId id = protein.getProteinDomain( j ).getDomainId();
333 if ( !saw_j.contains( id ) ) {
// Every type except DIRECTED_ADJACTANT registers domain j as a combination partner right away.
335 if ( dc_type != DomainCombinationType.DIRECTED_ADJACTANT ) {
337 .addCombinableDomain( protein.getProteinDomain( j ).getDomainId() );
// Track the candidate with the smallest start position (presumably the else branch of the test above).
340 if ( closest == null ) {
341 closest = protein.getProteinDomain( j );
344 if ( protein.getProteinDomain( j ).getFrom() < closest.getFrom() ) {
345 closest = protein.getProteinDomain( j );
// DIRECTED_ADJACTANT registers only the closest candidate found, if there is one.
352 if ( ( dc_type == DomainCombinationType.DIRECTED_ADJACTANT ) && ( closest != null ) ) {
353 domain_combination.addCombinableDomain( closest.getDomainId() );
// Copy the gathered counts and statistics onto the object stored for each counted domain id.
358 for( final DomainId key_id : domain_counts.keySet() ) {
359 instance.get( key_id ).setKeyDomainCount( domain_counts.get( key_id ) );
360 instance.get( key_id ).setKeyDomainProteinsCount( domain_protein_counts.get( key_id ) );
361 instance.get( key_id ).setKeyDomainConfidenceDescriptiveStatistics( stats.get( key_id ) );