in progress
[jalview.git] / forester / java / src / org / forester / surfacing / BasicGenomeWideCombinableDomains.java
1
2 package org.forester.surfacing;
3
4 import java.text.DecimalFormat;
5 import java.text.NumberFormat;
6 import java.util.ArrayList;
7 import java.util.Collections;
8 import java.util.Comparator;
9 import java.util.HashMap;
10 import java.util.HashSet;
11 import java.util.List;
12 import java.util.Map;
13 import java.util.Set;
14 import java.util.SortedMap;
15 import java.util.SortedSet;
16 import java.util.TreeMap;
17 import java.util.TreeSet;
18
19 import org.forester.go.GoId;
20 import org.forester.surfacing.BinaryDomainCombination.DomainCombinationType;
21 import org.forester.util.BasicDescriptiveStatistics;
22 import org.forester.util.DescriptiveStatistics;
23 import org.forester.util.ForesterUtil;
24
25 public class BasicGenomeWideCombinableDomains implements GenomeWideCombinableDomains {
26
27     private final static NumberFormat                    FORMATTER                                  = new DecimalFormat( "0.0E0" );
28     private static final Comparator<CombinableDomains>   DESCENDING_KEY_DOMAIN_COUNT_ORDER          = new Comparator<CombinableDomains>() {
29
30                                                                                                         @Override
31                                                                                                         public int compare( final CombinableDomains d1,
32                                                                                                                             final CombinableDomains d2 ) {
33                                                                                                             if ( d1.getKeyDomainCount() < d2
34                                                                                                                     .getKeyDomainCount() ) {
35                                                                                                                 return 1;
36                                                                                                             }
37                                                                                                             else if ( d1
38                                                                                                                     .getKeyDomainCount() > d2
39                                                                                                                     .getKeyDomainCount() ) {
40                                                                                                                 return -1;
41                                                                                                             }
42                                                                                                             else {
43                                                                                                                 return d1
44                                                                                                                         .getKeyDomain()
45                                                                                                                         .getId()
46                                                                                                                         .compareTo( d2
47                                                                                                                                 .getKeyDomain()
48                                                                                                                                 .getId() );
49                                                                                                             }
50                                                                                                         }
51                                                                                                     };
52     private static final Comparator<CombinableDomains>   DESCENDING_KEY_DOMAIN_PROTEINS_COUNT_ORDER = new Comparator<CombinableDomains>() {
53
54                                                                                                         @Override
55                                                                                                         public int compare( final CombinableDomains d1,
56                                                                                                                             final CombinableDomains d2 ) {
57                                                                                                             if ( d1.getKeyDomainProteinsCount() < d2
58                                                                                                                     .getKeyDomainProteinsCount() ) {
59                                                                                                                 return 1;
60                                                                                                             }
61                                                                                                             else if ( d1
62                                                                                                                     .getKeyDomainProteinsCount() > d2
63                                                                                                                     .getKeyDomainProteinsCount() ) {
64                                                                                                                 return -1;
65                                                                                                             }
66                                                                                                             else {
67                                                                                                                 return d1
68                                                                                                                         .getKeyDomain()
69                                                                                                                         .getId()
70                                                                                                                         .compareTo( d2
71                                                                                                                                 .getKeyDomain()
72                                                                                                                                 .getId() );
73                                                                                                             }
74                                                                                                         }
75                                                                                                     };
76     private static final Comparator<CombinableDomains>   DESCENDING_COMBINATIONS_COUNT_ORDER        = new Comparator<CombinableDomains>() {
77
78                                                                                                         @Override
79                                                                                                         public int compare( final CombinableDomains d1,
80                                                                                                                             final CombinableDomains d2 ) {
81                                                                                                             if ( d1.getNumberOfCombinableDomains() < d2
82                                                                                                                     .getNumberOfCombinableDomains() ) {
83                                                                                                                 return 1;
84                                                                                                             }
85                                                                                                             else if ( d1
86                                                                                                                     .getNumberOfCombinableDomains() > d2
87                                                                                                                     .getNumberOfCombinableDomains() ) {
88                                                                                                                 return -1;
89                                                                                                             }
90                                                                                                             else {
91                                                                                                                 return d1
92                                                                                                                         .getKeyDomain()
93                                                                                                                         .getId()
94                                                                                                                         .compareTo( d2
95                                                                                                                                 .getKeyDomain()
96                                                                                                                                 .getId() );
97                                                                                                             }
98                                                                                                         }
99                                                                                                     };
100     final private SortedMap<DomainId, CombinableDomains> _combinable_domains_map;
101     final private Species                                _species;
102     final private DomainCombinationType                  _dc_type;
103
104     private BasicGenomeWideCombinableDomains( final Species species, final DomainCombinationType dc_type ) {
105         _combinable_domains_map = new TreeMap<DomainId, CombinableDomains>();
106         _species = species;
107         _dc_type = dc_type;
108     }
109
110     private void add( final DomainId key, final CombinableDomains cdc ) {
111         _combinable_domains_map.put( key, cdc );
112     }
113
114     @Override
115     public boolean contains( final DomainId key_id ) {
116         return _combinable_domains_map.containsKey( key_id );
117     }
118
119     @Override
120     public CombinableDomains get( final DomainId key_id ) {
121         return _combinable_domains_map.get( key_id );
122     }
123
124     @Override
125     public SortedMap<DomainId, CombinableDomains> getAllCombinableDomainsIds() {
126         return _combinable_domains_map;
127     }
128
129     @Override
130     public SortedSet<DomainId> getAllDomainIds() {
131         final SortedSet<DomainId> domains = new TreeSet<DomainId>();
132         for( final DomainId key : getAllCombinableDomainsIds().keySet() ) {
133             final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
134             final List<DomainId> ds = cb.getAllDomains();
135             for( final DomainId d : ds ) {
136                 domains.add( d );
137             }
138         }
139         return domains;
140     }
141
142     @Override
143     public DomainCombinationType getDomainCombinationType() {
144         return _dc_type;
145     }
146
147     @Override
148     public SortedSet<DomainId> getMostPromiscuosDomain() {
149         final SortedSet<DomainId> doms = new TreeSet<DomainId>();
150         final int max = ( int ) getPerGenomeDomainPromiscuityStatistics().getMax();
151         for( final DomainId key : getAllCombinableDomainsIds().keySet() ) {
152             final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
153             if ( cb.getNumberOfCombinableDomains() == max ) {
154                 doms.add( key );
155             }
156         }
157         return doms;
158     }
159
160     @Override
161     public DescriptiveStatistics getPerGenomeDomainPromiscuityStatistics() {
162         final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
163         for( final DomainId key : getAllCombinableDomainsIds().keySet() ) {
164             final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
165             stats.addValue( cb.getNumberOfCombinableDomains() );
166         }
167         return stats;
168     }
169
170     @Override
171     public int getSize() {
172         return _combinable_domains_map.size();
173     }
174
175     @Override
176     public Species getSpecies() {
177         return _species;
178     }
179
180     @Override
181     public SortedSet<BinaryDomainCombination> toBinaryDomainCombinations() {
182         final SortedSet<BinaryDomainCombination> binary_combinations = new TreeSet<BinaryDomainCombination>();
183         for( final DomainId key : getAllCombinableDomainsIds().keySet() ) {
184             final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
185             for( final BinaryDomainCombination b : cb.toBinaryDomainCombinations() ) {
186                 binary_combinations.add( b );
187             }
188         }
189         return binary_combinations;
190     }
191
192     @Override
193     public String toString() {
194         return toStringBuilder( GenomeWideCombinableDomainsSortOrder.ALPHABETICAL_KEY_ID ).toString();
195     }
196
197     // Produces something like: 
198     // 2-oxoacid_dh      5       5       2       4.8E-67   Biotin_lipoyl [4], E3_binding [3]
199     @Override
200     public StringBuilder toStringBuilder( final GenomeWideCombinableDomainsSortOrder sort_order ) {
201         final StringBuilder sb = new StringBuilder();
202         final List<CombinableDomains> combinable_domains = new ArrayList<CombinableDomains>();
203         for( final DomainId key : getAllCombinableDomainsIds().keySet() ) {
204             final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
205             combinable_domains.add( cb );
206         }
207         if ( sort_order == GenomeWideCombinableDomainsSortOrder.KEY_DOMAIN_COUNT ) {
208             Collections.sort( combinable_domains, BasicGenomeWideCombinableDomains.DESCENDING_KEY_DOMAIN_COUNT_ORDER );
209         }
210         else if ( sort_order == GenomeWideCombinableDomainsSortOrder.KEY_DOMAIN_PROTEINS_COUNT ) {
211             Collections.sort( combinable_domains,
212                               BasicGenomeWideCombinableDomains.DESCENDING_KEY_DOMAIN_PROTEINS_COUNT_ORDER );
213         }
214         else if ( sort_order == GenomeWideCombinableDomainsSortOrder.COMBINATIONS_COUNT ) {
215             Collections.sort( combinable_domains, BasicGenomeWideCombinableDomains.DESCENDING_COMBINATIONS_COUNT_ORDER );
216         }
217         for( final CombinableDomains cb : combinable_domains ) {
218             sb.append( ForesterUtil.pad( new StringBuffer( cb.getKeyDomain().toString() ), 18, ' ', false ) );
219             sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getKeyDomainCount() ), 8, ' ', false ) );
220             sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getKeyDomainProteinsCount() ), 8, ' ', false ) );
221             sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getNumberOfCombinableDomains() ), 8, ' ', false ) );
222             sb.append( ForesterUtil.pad( new StringBuffer( ""
223                                                  + FORMATTER.format( cb.getKeyDomainConfidenceDescriptiveStatistics()
224                                                          .median() ) ),
225                                          10,
226                                          ' ',
227                                          false ) );
228             sb.append( cb.getCombiningDomainIdsAsStringBuilder() );
229             sb.append( ForesterUtil.getLineSeparator() );
230         }
231         return sb;
232     }
233
234     private static void countDomains( final Map<DomainId, Integer> domain_counts,
235                                       final Map<DomainId, Integer> domain_protein_counts,
236                                       final Map<DomainId, DescriptiveStatistics> stats,
237                                       final Set<DomainId> saw_c,
238                                       final DomainId id_i,
239                                       final double support ) {
240         if ( domain_counts.containsKey( id_i ) ) {
241             domain_counts.put( id_i, 1 + domain_counts.get( ( id_i ) ) );
242             if ( !saw_c.contains( id_i ) ) {
243                 domain_protein_counts.put( id_i, 1 + domain_protein_counts.get( ( id_i ) ) );
244             }
245         }
246         else {
247             stats.put( id_i, new BasicDescriptiveStatistics() );
248             domain_counts.put( id_i, 1 );
249             domain_protein_counts.put( id_i, 1 );
250         }
251         stats.get( id_i ).addValue( support );
252         saw_c.add( id_i );
253     }
254
255     public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
256                                                                    final boolean ignore_combination_with_same_domain,
257                                                                    final Species species ) {
258         return createInstance( protein_list,
259                                ignore_combination_with_same_domain,
260                                species,
261                                null,
262                                DomainCombinationType.BASIC );
263     }
264
265     public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
266                                                                    final boolean ignore_combination_with_same_domain,
267                                                                    final Species species,
268                                                                    final DomainCombinationType dc_type ) {
269         return createInstance( protein_list, ignore_combination_with_same_domain, species, null, dc_type );
270     }
271
272     public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
273                                                                    final boolean ignore_combination_with_same_domain,
274                                                                    final Species species,
275                                                                    final Map<DomainId, List<GoId>> domain_id_to_go_ids_map,
276                                                                    final DomainCombinationType dc_type ) {
277         final BasicGenomeWideCombinableDomains instance = new BasicGenomeWideCombinableDomains( species, dc_type );
278         final Map<DomainId, Integer> domain_counts = new HashMap<DomainId, Integer>();
279         final Map<DomainId, Integer> domain_protein_counts = new HashMap<DomainId, Integer>();
280         final Map<DomainId, DescriptiveStatistics> stats = new HashMap<DomainId, DescriptiveStatistics>();
281         for( final Protein protein : protein_list ) {
282             if ( !protein.getSpecies().equals( species ) ) {
283                 throw new IllegalArgumentException( "species (" + protein.getSpecies()
284                         + ") does not match species of combinable domains collection (" + species + ")" );
285             }
286             final Set<DomainId> saw_i = new HashSet<DomainId>();
287             final Set<DomainId> saw_c = new HashSet<DomainId>();
288             for( int i = 0; i < protein.getProteinDomains().size(); ++i ) {
289                 final Domain pd_i = protein.getProteinDomain( i );
290                 final DomainId id_i = pd_i.getDomainId();
291                 final int current_start = pd_i.getFrom();
292                 BasicGenomeWideCombinableDomains.countDomains( domain_counts,
293                                                                domain_protein_counts,
294                                                                stats,
295                                                                saw_c,
296                                                                id_i,
297                                                                pd_i.getPerSequenceEvalue() );
298                 if ( !saw_i.contains( id_i ) ) {
299                     if ( dc_type == DomainCombinationType.BASIC ) {
300                         saw_i.add( id_i );
301                     }
302                     CombinableDomains domain_combination = null;
303                     if ( instance.contains( id_i ) ) {
304                         domain_combination = instance.get( id_i );
305                     }
306                     else {
307                         if ( dc_type == DomainCombinationType.DIRECTED_ADJACTANT ) {
308                             domain_combination = new AdjactantDirectedCombinableDomains( pd_i.getDomainId(), species );
309                         }
310                         else if ( dc_type == DomainCombinationType.DIRECTED ) {
311                             domain_combination = new DirectedCombinableDomains( pd_i.getDomainId(), species );
312                         }
313                         else {
314                             domain_combination = new BasicCombinableDomains( pd_i.getDomainId(), species );
315                         }
316                         if ( ( domain_id_to_go_ids_map != null )
317                                 && domain_id_to_go_ids_map.containsKey( pd_i.getDomainId() ) ) {
318                             final List<GoId> go_ids = domain_id_to_go_ids_map.get( pd_i.getDomainId() );
319                             for( final GoId go_id : go_ids ) {
320                                 domain_combination.getKeyDomain().addGoId( go_id );
321                             }
322                         }
323                         instance.add( id_i, domain_combination );
324                     }
325                     final Set<DomainId> saw_j = new HashSet<DomainId>();
326                     if ( ignore_combination_with_same_domain ) {
327                         saw_j.add( id_i );
328                     }
329                     Domain closest = null;
330                     for( int j = 0; j < protein.getNumberOfProteinDomains(); ++j ) {
331                         if ( ( dc_type != DomainCombinationType.BASIC )
332                                 && ( current_start >= protein.getProteinDomain( j ).getFrom() ) ) {
333                             continue;
334                         }
335                         if ( i != j ) {
336                             final DomainId id = protein.getProteinDomain( j ).getDomainId();
337                             if ( !saw_j.contains( id ) ) {
338                                 saw_j.add( id );
339                                 if ( dc_type != DomainCombinationType.DIRECTED_ADJACTANT ) {
340                                     domain_combination
341                                             .addCombinableDomain( protein.getProteinDomain( j ).getDomainId() );
342                                 }
343                                 else {
344                                     if ( closest == null ) {
345                                         closest = protein.getProteinDomain( j );
346                                     }
347                                     else {
348                                         if ( protein.getProteinDomain( j ).getFrom() < closest.getFrom() ) {
349                                             closest = protein.getProteinDomain( j );
350                                         }
351                                     }
352                                 }
353                             }
354                         }
355                     }
356                     if ( ( dc_type == DomainCombinationType.DIRECTED_ADJACTANT ) && ( closest != null ) ) {
357                         domain_combination.addCombinableDomain( closest.getDomainId() );
358                     }
359                 }
360             }
361         }
362         for( final DomainId key_id : domain_counts.keySet() ) {
363             instance.get( key_id ).setKeyDomainCount( domain_counts.get( key_id ) );
364             instance.get( key_id ).setKeyDomainProteinsCount( domain_protein_counts.get( key_id ) );
365             instance.get( key_id ).setKeyDomainConfidenceDescriptiveStatistics( stats.get( key_id ) );
366         }
367         return instance;
368     }
369 }