in progress
[jalview.git] / forester / java / src / org / forester / surfacing / BasicGenomeWideCombinableDomains.java
1
2 package org.forester.surfacing;
3
4 import java.text.DecimalFormat;
5 import java.text.NumberFormat;
6 import java.util.ArrayList;
7 import java.util.Collections;
8 import java.util.Comparator;
9 import java.util.HashMap;
10 import java.util.HashSet;
11 import java.util.List;
12 import java.util.Map;
13 import java.util.Set;
14 import java.util.SortedMap;
15 import java.util.SortedSet;
16 import java.util.TreeMap;
17 import java.util.TreeSet;
18
19 import org.forester.go.GoId;
20 import org.forester.protein.BinaryDomainCombination;
21 import org.forester.protein.BinaryDomainCombination.DomainCombinationType;
22 import org.forester.protein.Domain;
23 import org.forester.protein.DomainId;
24 import org.forester.protein.Protein;
25 import org.forester.species.Species;
26 import org.forester.util.BasicDescriptiveStatistics;
27 import org.forester.util.DescriptiveStatistics;
28 import org.forester.util.ForesterUtil;
29
30 public class BasicGenomeWideCombinableDomains implements GenomeWideCombinableDomains {
31
32     private final static NumberFormat                    FORMATTER                                  = new DecimalFormat( "0.0E0" );
33     private static final Comparator<CombinableDomains>   DESCENDING_KEY_DOMAIN_COUNT_ORDER          = new Comparator<CombinableDomains>() {
34
35                                                                                                         @Override
36                                                                                                         public int compare( final CombinableDomains d1,
37                                                                                                                             final CombinableDomains d2 ) {
38                                                                                                             if ( d1.getKeyDomainCount() < d2
39                                                                                                                     .getKeyDomainCount() ) {
40                                                                                                                 return 1;
41                                                                                                             }
42                                                                                                             else if ( d1
43                                                                                                                     .getKeyDomainCount() > d2
44                                                                                                                     .getKeyDomainCount() ) {
45                                                                                                                 return -1;
46                                                                                                             }
47                                                                                                             else {
48                                                                                                                 return d1
49                                                                                                                         .getKeyDomain()
50                                                                                                                         .getId()
51                                                                                                                         .compareTo( d2
52                                                                                                                                 .getKeyDomain()
53                                                                                                                                 .getId() );
54                                                                                                             }
55                                                                                                         }
56                                                                                                     };
57     private static final Comparator<CombinableDomains>   DESCENDING_KEY_DOMAIN_PROTEINS_COUNT_ORDER = new Comparator<CombinableDomains>() {
58
59                                                                                                         @Override
60                                                                                                         public int compare( final CombinableDomains d1,
61                                                                                                                             final CombinableDomains d2 ) {
62                                                                                                             if ( d1.getKeyDomainProteinsCount() < d2
63                                                                                                                     .getKeyDomainProteinsCount() ) {
64                                                                                                                 return 1;
65                                                                                                             }
66                                                                                                             else if ( d1
67                                                                                                                     .getKeyDomainProteinsCount() > d2
68                                                                                                                     .getKeyDomainProteinsCount() ) {
69                                                                                                                 return -1;
70                                                                                                             }
71                                                                                                             else {
72                                                                                                                 return d1
73                                                                                                                         .getKeyDomain()
74                                                                                                                         .getId()
75                                                                                                                         .compareTo( d2
76                                                                                                                                 .getKeyDomain()
77                                                                                                                                 .getId() );
78                                                                                                             }
79                                                                                                         }
80                                                                                                     };
81     private static final Comparator<CombinableDomains>   DESCENDING_COMBINATIONS_COUNT_ORDER        = new Comparator<CombinableDomains>() {
82
83                                                                                                         @Override
84                                                                                                         public int compare( final CombinableDomains d1,
85                                                                                                                             final CombinableDomains d2 ) {
86                                                                                                             if ( d1.getNumberOfCombinableDomains() < d2
87                                                                                                                     .getNumberOfCombinableDomains() ) {
88                                                                                                                 return 1;
89                                                                                                             }
90                                                                                                             else if ( d1
91                                                                                                                     .getNumberOfCombinableDomains() > d2
92                                                                                                                     .getNumberOfCombinableDomains() ) {
93                                                                                                                 return -1;
94                                                                                                             }
95                                                                                                             else {
96                                                                                                                 return d1
97                                                                                                                         .getKeyDomain()
98                                                                                                                         .getId()
99                                                                                                                         .compareTo( d2
100                                                                                                                                 .getKeyDomain()
101                                                                                                                                 .getId() );
102                                                                                                             }
103                                                                                                         }
104                                                                                                     };
105     final private SortedMap<DomainId, CombinableDomains> _combinable_domains_map;
106     final private Species                                _species;
107     final private DomainCombinationType                  _dc_type;
108
109     private BasicGenomeWideCombinableDomains( final Species species, final DomainCombinationType dc_type ) {
110         _combinable_domains_map = new TreeMap<DomainId, CombinableDomains>();
111         _species = species;
112         _dc_type = dc_type;
113     }
114
115     private void add( final DomainId key, final CombinableDomains cdc ) {
116         _combinable_domains_map.put( key, cdc );
117     }
118
119     @Override
120     public boolean contains( final DomainId key_id ) {
121         return _combinable_domains_map.containsKey( key_id );
122     }
123
124     @Override
125     public CombinableDomains get( final DomainId key_id ) {
126         return _combinable_domains_map.get( key_id );
127     }
128
129     @Override
130     public SortedMap<DomainId, CombinableDomains> getAllCombinableDomainsIds() {
131         return _combinable_domains_map;
132     }
133
134     @Override
135     public SortedSet<DomainId> getAllDomainIds() {
136         final SortedSet<DomainId> domains = new TreeSet<DomainId>();
137         for( final DomainId key : getAllCombinableDomainsIds().keySet() ) {
138             final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
139             final List<DomainId> ds = cb.getAllDomains();
140             for( final DomainId d : ds ) {
141                 domains.add( d );
142             }
143         }
144         return domains;
145     }
146
147     @Override
148     public DomainCombinationType getDomainCombinationType() {
149         return _dc_type;
150     }
151
152     @Override
153     public SortedSet<DomainId> getMostPromiscuosDomain() {
154         final SortedSet<DomainId> doms = new TreeSet<DomainId>();
155         final int max = ( int ) getPerGenomeDomainPromiscuityStatistics().getMax();
156         for( final DomainId key : getAllCombinableDomainsIds().keySet() ) {
157             final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
158             if ( cb.getNumberOfCombinableDomains() == max ) {
159                 doms.add( key );
160             }
161         }
162         return doms;
163     }
164
165     @Override
166     public DescriptiveStatistics getPerGenomeDomainPromiscuityStatistics() {
167         final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
168         for( final DomainId key : getAllCombinableDomainsIds().keySet() ) {
169             final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
170             stats.addValue( cb.getNumberOfCombinableDomains() );
171         }
172         return stats;
173     }
174
175     @Override
176     public int getSize() {
177         return _combinable_domains_map.size();
178     }
179
180     @Override
181     public Species getSpecies() {
182         return _species;
183     }
184
185     @Override
186     public SortedSet<BinaryDomainCombination> toBinaryDomainCombinations() {
187         final SortedSet<BinaryDomainCombination> binary_combinations = new TreeSet<BinaryDomainCombination>();
188         for( final DomainId key : getAllCombinableDomainsIds().keySet() ) {
189             final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
190             for( final BinaryDomainCombination b : cb.toBinaryDomainCombinations() ) {
191                 binary_combinations.add( b );
192             }
193         }
194         return binary_combinations;
195     }
196
197     @Override
198     public String toString() {
199         return toStringBuilder( GenomeWideCombinableDomainsSortOrder.ALPHABETICAL_KEY_ID ).toString();
200     }
201
202     // Produces something like: 
203     // 2-oxoacid_dh      5       5       2       4.8E-67   Biotin_lipoyl [4], E3_binding [3]
204     @Override
205     public StringBuilder toStringBuilder( final GenomeWideCombinableDomainsSortOrder sort_order ) {
206         final StringBuilder sb = new StringBuilder();
207         final List<CombinableDomains> combinable_domains = new ArrayList<CombinableDomains>();
208         for( final DomainId key : getAllCombinableDomainsIds().keySet() ) {
209             final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
210             combinable_domains.add( cb );
211         }
212         if ( sort_order == GenomeWideCombinableDomainsSortOrder.KEY_DOMAIN_COUNT ) {
213             Collections.sort( combinable_domains, BasicGenomeWideCombinableDomains.DESCENDING_KEY_DOMAIN_COUNT_ORDER );
214         }
215         else if ( sort_order == GenomeWideCombinableDomainsSortOrder.KEY_DOMAIN_PROTEINS_COUNT ) {
216             Collections.sort( combinable_domains,
217                               BasicGenomeWideCombinableDomains.DESCENDING_KEY_DOMAIN_PROTEINS_COUNT_ORDER );
218         }
219         else if ( sort_order == GenomeWideCombinableDomainsSortOrder.COMBINATIONS_COUNT ) {
220             Collections.sort( combinable_domains, BasicGenomeWideCombinableDomains.DESCENDING_COMBINATIONS_COUNT_ORDER );
221         }
222         for( final CombinableDomains cb : combinable_domains ) {
223             sb.append( ForesterUtil.pad( new StringBuffer( cb.getKeyDomain().toString() ), 18, ' ', false ) );
224             sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getKeyDomainCount() ), 8, ' ', false ) );
225             sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getKeyDomainProteinsCount() ), 8, ' ', false ) );
226             sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getNumberOfCombinableDomains() ), 8, ' ', false ) );
227             sb.append( ForesterUtil.pad( new StringBuffer( ""
228                                                  + FORMATTER.format( cb.getKeyDomainConfidenceDescriptiveStatistics()
229                                                          .median() ) ),
230                                          10,
231                                          ' ',
232                                          false ) );
233             sb.append( cb.getCombiningDomainIdsAsStringBuilder() );
234             sb.append( ForesterUtil.getLineSeparator() );
235         }
236         return sb;
237     }
238
239     private static void countDomains( final Map<DomainId, Integer> domain_counts,
240                                       final Map<DomainId, Integer> domain_protein_counts,
241                                       final Map<DomainId, DescriptiveStatistics> stats,
242                                       final Set<DomainId> saw_c,
243                                       final DomainId id_i,
244                                       final double support ) {
245         if ( domain_counts.containsKey( id_i ) ) {
246             domain_counts.put( id_i, 1 + domain_counts.get( ( id_i ) ) );
247             if ( !saw_c.contains( id_i ) ) {
248                 domain_protein_counts.put( id_i, 1 + domain_protein_counts.get( ( id_i ) ) );
249             }
250         }
251         else {
252             stats.put( id_i, new BasicDescriptiveStatistics() );
253             domain_counts.put( id_i, 1 );
254             domain_protein_counts.put( id_i, 1 );
255         }
256         stats.get( id_i ).addValue( support );
257         saw_c.add( id_i );
258     }
259
260     public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
261                                                                    final boolean ignore_combination_with_same_domain,
262                                                                    final Species species ) {
263         return createInstance( protein_list,
264                                ignore_combination_with_same_domain,
265                                species,
266                                null,
267                                DomainCombinationType.BASIC,
268                                null,
269                                null );
270     }
271
272     public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
273                                                                    final boolean ignore_combination_with_same_domain,
274                                                                    final Species species,
275                                                                    final DomainCombinationType dc_type ) {
276         return createInstance( protein_list, ignore_combination_with_same_domain, species, null, dc_type, null, null );
277     }
278
279     public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
280                                                                    final boolean ignore_combination_with_same_domain,
281                                                                    final Species species,
282                                                                    final Map<DomainId, List<GoId>> domain_id_to_go_ids_map,
283                                                                    final DomainCombinationType dc_type,
284                                                                    final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
285                                                                    final Map<String, DescriptiveStatistics> domain_number_stats_by_dc ) {
286         final BasicGenomeWideCombinableDomains instance = new BasicGenomeWideCombinableDomains( species, dc_type );
287         final Map<DomainId, Integer> domain_counts = new HashMap<DomainId, Integer>();
288         final Map<DomainId, Integer> domain_protein_counts = new HashMap<DomainId, Integer>();
289         final Map<DomainId, DescriptiveStatistics> stats = new HashMap<DomainId, DescriptiveStatistics>();
290         for( final Protein protein : protein_list ) {
291             if ( !protein.getSpecies().equals( species ) ) {
292                 throw new IllegalArgumentException( "species (" + protein.getSpecies()
293                         + ") does not match species of combinable domains collection (" + species + ")" );
294             }
295             final Set<DomainId> saw_i = new HashSet<DomainId>();
296             final Set<DomainId> saw_c = new HashSet<DomainId>();
297             for( int i = 0; i < protein.getProteinDomains().size(); ++i ) {
298                 final Domain pd_i = protein.getProteinDomain( i );
299                 final DomainId id_i = pd_i.getDomainId();
300                 final int current_start = pd_i.getFrom();
301                 BasicGenomeWideCombinableDomains.countDomains( domain_counts,
302                                                                domain_protein_counts,
303                                                                stats,
304                                                                saw_c,
305                                                                id_i,
306                                                                pd_i.getPerSequenceEvalue() );
307                 if ( !saw_i.contains( id_i ) ) {
308                     if ( dc_type == DomainCombinationType.BASIC ) {
309                         saw_i.add( id_i );
310                     }
311                     CombinableDomains domain_combination = null;
312                     if ( instance.contains( id_i ) ) {
313                         domain_combination = instance.get( id_i );
314                     }
315                     else {
316                         if ( dc_type == DomainCombinationType.DIRECTED_ADJACTANT ) {
317                             domain_combination = new AdjactantDirectedCombinableDomains( pd_i.getDomainId(), species );
318                         }
319                         else if ( dc_type == DomainCombinationType.DIRECTED ) {
320                             domain_combination = new DirectedCombinableDomains( pd_i.getDomainId(), species );
321                         }
322                         else {
323                             domain_combination = new BasicCombinableDomains( pd_i.getDomainId(), species );
324                         }
325                         if ( ( domain_id_to_go_ids_map != null )
326                                 && domain_id_to_go_ids_map.containsKey( pd_i.getDomainId() ) ) {
327                             final List<GoId> go_ids = domain_id_to_go_ids_map.get( pd_i.getDomainId() );
328                             for( final GoId go_id : go_ids ) {
329                                 domain_combination.getKeyDomain().addGoId( go_id );
330                             }
331                         }
332                         instance.add( id_i, domain_combination );
333                     }
334                     final Set<DomainId> saw_j = new HashSet<DomainId>();
335                     if ( ignore_combination_with_same_domain ) {
336                         saw_j.add( id_i );
337                     }
338                     Domain closest = null;
339                     for( int j = 0; j < protein.getNumberOfProteinDomains(); ++j ) {
340                         if ( ( dc_type != DomainCombinationType.BASIC )
341                                 && ( current_start >= protein.getProteinDomain( j ).getFrom() ) ) {
342                             continue;
343                         }
344                         if ( i != j ) {
345                             final DomainId id = protein.getProteinDomain( j ).getDomainId();
346                             if ( !saw_j.contains( id ) ) {
347                                 saw_j.add( id );
348                                 if ( dc_type != DomainCombinationType.DIRECTED_ADJACTANT ) {
349                                     domain_combination
350                                             .addCombinableDomain( protein.getProteinDomain( j ).getDomainId() );
351                                 }
352                                 else {
353                                     if ( closest == null ) {
354                                         closest = protein.getProteinDomain( j );
355                                     }
356                                     else {
357                                         if ( protein.getProteinDomain( j ).getFrom() < closest.getFrom() ) {
358                                             closest = protein.getProteinDomain( j );
359                                         }
360                                     }
361                                 }
362                             }
363                         }
364                     }
365                     if ( ( dc_type == DomainCombinationType.DIRECTED_ADJACTANT ) && ( closest != null ) ) {
366                         domain_combination.addCombinableDomain( closest.getDomainId() );
367                     }
368                     if ( protein_length_stats_by_dc != null ) {
369                         final List<BinaryDomainCombination> dcs = domain_combination.toBinaryDomainCombinations();
370                         for( final BinaryDomainCombination dc : dcs ) {
371                             final String dc_str = dc.toString();
372                             if ( !protein_length_stats_by_dc.containsKey( dc_str ) ) {
373                                 protein_length_stats_by_dc.put( dc_str, new BasicDescriptiveStatistics() );
374                             }
375                             protein_length_stats_by_dc.get( dc_str ).addValue( protein.getLength() );
376                         }
377                     }
378                     if ( domain_number_stats_by_dc != null ) {
379                         final List<BinaryDomainCombination> dcs = domain_combination.toBinaryDomainCombinations();
380                         for( final BinaryDomainCombination dc : dcs ) {
381                             final String dc_str = dc.toString();
382                             if ( !domain_number_stats_by_dc.containsKey( dc_str ) ) {
383                                 domain_number_stats_by_dc.put( dc_str, new BasicDescriptiveStatistics() );
384                             }
385                             domain_number_stats_by_dc.get( dc_str ).addValue( protein.getNumberOfProteinDomains() );
386                         }
387                     }
388                     //
389                 }
390             }
391         }
392         for( final DomainId key_id : domain_counts.keySet() ) {
393             instance.get( key_id ).setKeyDomainCount( domain_counts.get( key_id ) );
394             instance.get( key_id ).setKeyDomainProteinsCount( domain_protein_counts.get( key_id ) );
395             instance.get( key_id ).setKeyDomainConfidenceDescriptiveStatistics( stats.get( key_id ) );
396         }
397         return instance;
398     }
399 }