inprogress
[jalview.git] / forester / java / src / org / forester / surfacing / BasicGenomeWideCombinableDomains.java
1
2 package org.forester.surfacing;
3
4 import java.text.DecimalFormat;
5 import java.text.NumberFormat;
6 import java.util.ArrayList;
7 import java.util.Collections;
8 import java.util.Comparator;
9 import java.util.HashMap;
10 import java.util.HashSet;
11 import java.util.List;
12 import java.util.Map;
13 import java.util.Set;
14 import java.util.SortedMap;
15 import java.util.SortedSet;
16 import java.util.TreeMap;
17 import java.util.TreeSet;
18
19 import org.forester.go.GoId;
20 import org.forester.protein.BinaryDomainCombination;
21 import org.forester.protein.BinaryDomainCombination.DomainCombinationType;
22 import org.forester.protein.Domain;
23 import org.forester.protein.Protein;
24 import org.forester.species.Species;
25 import org.forester.util.BasicDescriptiveStatistics;
26 import org.forester.util.DescriptiveStatistics;
27 import org.forester.util.ForesterUtil;
28
29 public class BasicGenomeWideCombinableDomains implements GenomeWideCombinableDomains {
30
31     private final static NumberFormat                  FORMATTER                                  = new DecimalFormat( "0.0E0" );
32     private static final Comparator<CombinableDomains> DESCENDING_KEY_DOMAIN_COUNT_ORDER          = new Comparator<CombinableDomains>() {
33
34                                                                                                       @Override
35                                                                                                       public int compare( final CombinableDomains d1,
36                                                                                                                           final CombinableDomains d2 ) {
37                                                                                                           if ( d1.getKeyDomainCount() < d2
38                                                                                                                   .getKeyDomainCount() ) {
39                                                                                                               return 1;
40                                                                                                           }
41                                                                                                           else if ( d1
42                                                                                                                   .getKeyDomainCount() > d2
43                                                                                                                   .getKeyDomainCount() ) {
44                                                                                                               return -1;
45                                                                                                           }
46                                                                                                           else {
47                                                                                                               return d1
48                                                                                                                       .getKeyDomain()
49                                                                                                                       .compareTo( d2
50                                                                                                                               .getKeyDomain() );
51                                                                                                           }
52                                                                                                       }
53                                                                                                   };
54     private static final Comparator<CombinableDomains> DESCENDING_KEY_DOMAIN_PROTEINS_COUNT_ORDER = new Comparator<CombinableDomains>() {
55
56                                                                                                       @Override
57                                                                                                       public int compare( final CombinableDomains d1,
58                                                                                                                           final CombinableDomains d2 ) {
59                                                                                                           if ( d1.getKeyDomainProteinsCount() < d2
60                                                                                                                   .getKeyDomainProteinsCount() ) {
61                                                                                                               return 1;
62                                                                                                           }
63                                                                                                           else if ( d1
64                                                                                                                   .getKeyDomainProteinsCount() > d2
65                                                                                                                   .getKeyDomainProteinsCount() ) {
66                                                                                                               return -1;
67                                                                                                           }
68                                                                                                           else {
69                                                                                                               return d1
70                                                                                                                       .getKeyDomain()
71                                                                                                                       .compareTo( d2
72                                                                                                                               .getKeyDomain() );
73                                                                                                           }
74                                                                                                       }
75                                                                                                   };
76     private static final Comparator<CombinableDomains> DESCENDING_COMBINATIONS_COUNT_ORDER        = new Comparator<CombinableDomains>() {
77
78                                                                                                       @Override
79                                                                                                       public int compare( final CombinableDomains d1,
80                                                                                                                           final CombinableDomains d2 ) {
81                                                                                                           if ( d1.getNumberOfCombinableDomains() < d2
82                                                                                                                   .getNumberOfCombinableDomains() ) {
83                                                                                                               return 1;
84                                                                                                           }
85                                                                                                           else if ( d1
86                                                                                                                   .getNumberOfCombinableDomains() > d2
87                                                                                                                   .getNumberOfCombinableDomains() ) {
88                                                                                                               return -1;
89                                                                                                           }
90                                                                                                           else {
91                                                                                                               return d1
92                                                                                                                       .getKeyDomain()
93                                                                                                                       .compareTo( d2
94                                                                                                                               .getKeyDomain() );
95                                                                                                           }
96                                                                                                       }
97                                                                                                   };
98     final private SortedMap<String, CombinableDomains> _combinable_domains_map;
99     final private Species                              _species;
100     final private DomainCombinationType                _dc_type;
101
102     private BasicGenomeWideCombinableDomains( final Species species, final DomainCombinationType dc_type ) {
103         _combinable_domains_map = new TreeMap<String, CombinableDomains>();
104         _species = species;
105         _dc_type = dc_type;
106     }
107
108     private void add( final String key, final CombinableDomains cdc ) {
109         _combinable_domains_map.put( key, cdc );
110     }
111
112     @Override
113     public boolean contains( final String key_id ) {
114         return _combinable_domains_map.containsKey( key_id );
115     }
116
117     @Override
118     public CombinableDomains get( final String key_id ) {
119         return _combinable_domains_map.get( key_id );
120     }
121
122     @Override
123     public SortedMap<String, CombinableDomains> getAllCombinableDomainsIds() {
124         return _combinable_domains_map;
125     }
126
127     @Override
128     public SortedSet<String> getAllDomainIds() {
129         final SortedSet<String> domains = new TreeSet<String>();
130         for( final String key : getAllCombinableDomainsIds().keySet() ) {
131             final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
132             final List<String> ds = cb.getAllDomains();
133             for( final String d : ds ) {
134                 domains.add( d );
135             }
136         }
137         return domains;
138     }
139
140     @Override
141     public DomainCombinationType getDomainCombinationType() {
142         return _dc_type;
143     }
144
145     @Override
146     public SortedSet<String> getMostPromiscuosDomain() {
147         final SortedSet<String> doms = new TreeSet<String>();
148         final int max = ( int ) getPerGenomeDomainPromiscuityStatistics().getMax();
149         for( final String key : getAllCombinableDomainsIds().keySet() ) {
150             final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
151             if ( cb.getNumberOfCombinableDomains() == max ) {
152                 doms.add( key );
153             }
154         }
155         return doms;
156     }
157
158     @Override
159     public DescriptiveStatistics getPerGenomeDomainPromiscuityStatistics() {
160         final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
161         for( final String key : getAllCombinableDomainsIds().keySet() ) {
162             final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
163             stats.addValue( cb.getNumberOfCombinableDomains() );
164         }
165         return stats;
166     }
167
168     @Override
169     public int getSize() {
170         return _combinable_domains_map.size();
171     }
172
173     @Override
174     public Species getSpecies() {
175         return _species;
176     }
177
178     @Override
179     public SortedSet<BinaryDomainCombination> toBinaryDomainCombinations() {
180         final SortedSet<BinaryDomainCombination> binary_combinations = new TreeSet<BinaryDomainCombination>();
181         for( final String key : getAllCombinableDomainsIds().keySet() ) {
182             final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
183             for( final BinaryDomainCombination b : cb.toBinaryDomainCombinations() ) {
184                 binary_combinations.add( b );
185             }
186         }
187         return binary_combinations;
188     }
189
190     @Override
191     public String toString() {
192         return toStringBuilder( GenomeWideCombinableDomainsSortOrder.ALPHABETICAL_KEY_ID ).toString();
193     }
194
195     // Produces something like: 
196     // 2-oxoacid_dh      5       5       2       4.8E-67   Biotin_lipoyl [4], E3_binding [3]
197     @Override
198     public StringBuilder toStringBuilder( final GenomeWideCombinableDomainsSortOrder sort_order ) {
199         final StringBuilder sb = new StringBuilder();
200         final List<CombinableDomains> combinable_domains = new ArrayList<CombinableDomains>();
201         for( final String key : getAllCombinableDomainsIds().keySet() ) {
202             final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
203             combinable_domains.add( cb );
204         }
205         if ( sort_order == GenomeWideCombinableDomainsSortOrder.KEY_DOMAIN_COUNT ) {
206             Collections.sort( combinable_domains, BasicGenomeWideCombinableDomains.DESCENDING_KEY_DOMAIN_COUNT_ORDER );
207         }
208         else if ( sort_order == GenomeWideCombinableDomainsSortOrder.KEY_DOMAIN_PROTEINS_COUNT ) {
209             Collections.sort( combinable_domains,
210                               BasicGenomeWideCombinableDomains.DESCENDING_KEY_DOMAIN_PROTEINS_COUNT_ORDER );
211         }
212         else if ( sort_order == GenomeWideCombinableDomainsSortOrder.COMBINATIONS_COUNT ) {
213             Collections.sort( combinable_domains, BasicGenomeWideCombinableDomains.DESCENDING_COMBINATIONS_COUNT_ORDER );
214         }
215         for( final CombinableDomains cb : combinable_domains ) {
216             sb.append( ForesterUtil.pad( new StringBuffer( cb.getKeyDomain().toString() ), 18, ' ', false ) );
217             sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getKeyDomainCount() ), 8, ' ', false ) );
218             sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getKeyDomainProteinsCount() ), 8, ' ', false ) );
219             sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getNumberOfCombinableDomains() ), 8, ' ', false ) );
220             sb.append( ForesterUtil.pad( new StringBuffer( ""
221                                                  + FORMATTER.format( cb.getKeyDomainConfidenceDescriptiveStatistics()
222                                                          .median() ) ),
223                                          10,
224                                          ' ',
225                                          false ) );
226             sb.append( cb.getCombiningDomainIdsAsStringBuilder() );
227             sb.append( ForesterUtil.getLineSeparator() );
228         }
229         return sb;
230     }
231
232     private static void countDomains( final Map<String, Integer> domain_counts,
233                                       final Map<String, Integer> domain_protein_counts,
234                                       final Map<String, DescriptiveStatistics> stats,
235                                       final Set<String> saw_c,
236                                       final String id_i,
237                                       final double support ) {
238         if ( domain_counts.containsKey( id_i ) ) {
239             domain_counts.put( id_i, 1 + domain_counts.get( ( id_i ) ) );
240             if ( !saw_c.contains( id_i ) ) {
241                 domain_protein_counts.put( id_i, 1 + domain_protein_counts.get( ( id_i ) ) );
242             }
243         }
244         else {
245             stats.put( id_i, new BasicDescriptiveStatistics() );
246             domain_counts.put( id_i, 1 );
247             domain_protein_counts.put( id_i, 1 );
248         }
249         stats.get( id_i ).addValue( support );
250         saw_c.add( id_i );
251     }
252
253     public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
254                                                                    final boolean ignore_combination_with_same_domain,
255                                                                    final Species species ) {
256         return createInstance( protein_list,
257                                ignore_combination_with_same_domain,
258                                species,
259                                null,
260                                DomainCombinationType.BASIC,
261                                null,
262                                null );
263     }
264
265     public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
266                                                                    final boolean ignore_combination_with_same_domain,
267                                                                    final Species species,
268                                                                    final DomainCombinationType dc_type ) {
269         return createInstance( protein_list, ignore_combination_with_same_domain, species, null, dc_type, null, null );
270     }
271
272     public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
273                                                                    final boolean ignore_combination_with_same_domain,
274                                                                    final Species species,
275                                                                    final Map<String, List<GoId>> domain_id_to_go_ids_map,
276                                                                    final DomainCombinationType dc_type,
277                                                                    final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
278                                                                    final Map<String, DescriptiveStatistics> domain_number_stats_by_dc ) {
279         final BasicGenomeWideCombinableDomains instance = new BasicGenomeWideCombinableDomains( species, dc_type );
280         final Map<String, Integer> domain_counts = new HashMap<String, Integer>();
281         final Map<String, Integer> domain_protein_counts = new HashMap<String, Integer>();
282         final Map<String, DescriptiveStatistics> stats = new HashMap<String, DescriptiveStatistics>();
283         for( final Protein protein : protein_list ) {
284             if ( !protein.getSpecies().equals( species ) ) {
285                 throw new IllegalArgumentException( "species (" + protein.getSpecies()
286                         + ") does not match species of combinable domains collection (" + species + ")" );
287             }
288             final Set<String> saw_i = new HashSet<String>();
289             final Set<String> saw_c = new HashSet<String>();
290             for( int i = 0; i < protein.getProteinDomains().size(); ++i ) {
291                 final Domain pd_i = protein.getProteinDomain( i );
292                 final String id_i = pd_i.getDomainId();
293                 final int current_start = pd_i.getFrom();
294                 BasicGenomeWideCombinableDomains.countDomains( domain_counts,
295                                                                domain_protein_counts,
296                                                                stats,
297                                                                saw_c,
298                                                                id_i,
299                                                                pd_i.getPerSequenceEvalue() );
300                 if ( !saw_i.contains( id_i ) ) {
301                     if ( dc_type == DomainCombinationType.BASIC ) {
302                         saw_i.add( id_i );
303                     }
304                     CombinableDomains domain_combination = null;
305                     if ( instance.contains( id_i ) ) {
306                         domain_combination = instance.get( id_i );
307                     }
308                     else {
309                         if ( dc_type == DomainCombinationType.DIRECTED_ADJACTANT ) {
310                             domain_combination = new AdjactantDirectedCombinableDomains( pd_i.getDomainId(), species );
311                         }
312                         else if ( dc_type == DomainCombinationType.DIRECTED ) {
313                             domain_combination = new DirectedCombinableDomains( pd_i.getDomainId(), species );
314                         }
315                         else {
316                             domain_combination = new BasicCombinableDomains( pd_i.getDomainId(), species );
317                         }
318                         // ^^       if ( ( domain_id_to_go_ids_map != null )
319                         // ^^             && domain_id_to_go_ids_map.containsKey( pd_i.getDomainId() ) ) {
320                         // ^^        final List<GoId> go_ids = domain_id_to_go_ids_map.get( pd_i.getDomainId() );
321                         // ^^        for( final GoId go_id : go_ids ) {
322                         // ^^           domain_combination.getKeyDomain().addGoId( go_id );
323                         // ^^       }
324                         // ^^  }
325                         instance.add( id_i, domain_combination );
326                     }
327                     final Set<String> saw_j = new HashSet<String>();
328                     if ( ignore_combination_with_same_domain ) {
329                         saw_j.add( id_i );
330                     }
331                     Domain closest = null;
332                     for( int j = 0; j < protein.getNumberOfProteinDomains(); ++j ) {
333                         if ( ( dc_type != DomainCombinationType.BASIC )
334                                 && ( current_start >= protein.getProteinDomain( j ).getFrom() ) ) {
335                             continue;
336                         }
337                         if ( i != j ) {
338                             final String id = protein.getProteinDomain( j ).getDomainId();
339                             if ( !saw_j.contains( id ) ) {
340                                 saw_j.add( id );
341                                 if ( dc_type != DomainCombinationType.DIRECTED_ADJACTANT ) {
342                                     domain_combination
343                                             .addCombinableDomain( protein.getProteinDomain( j ).getDomainId() );
344                                 }
345                                 else {
346                                     if ( closest == null ) {
347                                         closest = protein.getProteinDomain( j );
348                                     }
349                                     else {
350                                         if ( protein.getProteinDomain( j ).getFrom() < closest.getFrom() ) {
351                                             closest = protein.getProteinDomain( j );
352                                         }
353                                     }
354                                 }
355                             }
356                         }
357                     }
358                     if ( ( dc_type == DomainCombinationType.DIRECTED_ADJACTANT ) && ( closest != null ) ) {
359                         domain_combination.addCombinableDomain( closest.getDomainId() );
360                     }
361                     if ( protein_length_stats_by_dc != null ) {
362                         final List<BinaryDomainCombination> dcs = domain_combination.toBinaryDomainCombinations();
363                         for( final BinaryDomainCombination dc : dcs ) {
364                             final String dc_str = dc.toString();
365                             if ( !protein_length_stats_by_dc.containsKey( dc_str ) ) {
366                                 protein_length_stats_by_dc.put( dc_str, new BasicDescriptiveStatistics() );
367                             }
368                             protein_length_stats_by_dc.get( dc_str ).addValue( protein.getLength() );
369                         }
370                     }
371                     if ( domain_number_stats_by_dc != null ) {
372                         final List<BinaryDomainCombination> dcs = domain_combination.toBinaryDomainCombinations();
373                         for( final BinaryDomainCombination dc : dcs ) {
374                             final String dc_str = dc.toString();
375                             if ( !domain_number_stats_by_dc.containsKey( dc_str ) ) {
376                                 domain_number_stats_by_dc.put( dc_str, new BasicDescriptiveStatistics() );
377                             }
378                             domain_number_stats_by_dc.get( dc_str ).addValue( protein.getNumberOfProteinDomains() );
379                         }
380                     }
381                     //
382                 }
383             }
384         }
385         for( final String key_id : domain_counts.keySet() ) {
386             instance.get( key_id ).setKeyDomainCount( domain_counts.get( key_id ) );
387             instance.get( key_id ).setKeyDomainProteinsCount( domain_protein_counts.get( key_id ) );
388             instance.get( key_id ).setKeyDomainConfidenceDescriptiveStatistics( stats.get( key_id ) );
389         }
390         return instance;
391     }
392 }