9abb02a37f91cfcbadbb243004e37cc8ef8f239e
[jalview.git] / forester / java / src / org / forester / surfacing / MinimalDomainomeCalculator.java
1
2 package org.forester.surfacing;
3
4 import java.io.BufferedWriter;
5 import java.io.File;
6 import java.io.FileWriter;
7 import java.io.IOException;
8 import java.io.Writer;
9 import java.util.ArrayList;
10 import java.util.Arrays;
11 import java.util.HashSet;
12 import java.util.List;
13 import java.util.Map.Entry;
14 import java.util.Set;
15 import java.util.SortedMap;
16 import java.util.SortedSet;
17 import java.util.TreeMap;
18 import java.util.TreeSet;
19
20 import org.forester.application.surfacing;
21 import org.forester.phylogeny.Phylogeny;
22 import org.forester.phylogeny.PhylogenyNode;
23 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
24 import org.forester.protein.Domain;
25 import org.forester.protein.Protein;
26 import org.forester.species.BasicSpecies;
27 import org.forester.species.Species;
28 import org.forester.surfacing.SurfacingUtil.DomainComparator;
29 import org.forester.util.ForesterUtil;
30
31 public final class MinimalDomainomeCalculator {
32
33     static final public void calcOme( final boolean use_domain_architectures,
34                                       final Phylogeny tre,
35                                       final SortedMap<Species, List<Protein>> protein_lists_per_species,
36                                       final String separator,
37                                       final double ie_cutoff,
38                                       final String outfile_base )
39             throws IOException {
40         final SortedMap<String, SortedSet<String>> species_to_features_map = new TreeMap<String, SortedSet<String>>();
41         if ( protein_lists_per_species == null || tre == null ) {
42             throw new IllegalArgumentException( "argument is null" );
43         }
44         if ( protein_lists_per_species.size() < 2 ) {
45             throw new IllegalArgumentException( "not enough genomes" );
46         }
47         final String x;
48         if ( use_domain_architectures ) {
49             x = "DA";
50         }
51         else {
52             x = "domain";
53         }
54         final File outfile = new File( outfile_base + "_minimal_" + x + "ome.tsv" );
55         final File outfile_table = new File( outfile_base + "_minimal_" + x + "ome_matrix.tsv" );
56         SurfacingUtil.checkForOutputFileWriteability( outfile );
57         SurfacingUtil.checkForOutputFileWriteability( outfile_table );
58         final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
59         final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) );
60         out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#" + x + "\t" + x + "" );
61         out.write( ForesterUtil.LINE_SEPARATOR );
62         for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
63             final PhylogenyNode node = iter.next();
64             final String species_name = node.getNodeData().isHasTaxonomy()
65                     ? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
66             final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName()
67                     : "";
68             final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode()
69                     : "";
70             final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : "";
71             out.write( species_name );
72             if ( !ForesterUtil.isEmpty( common ) ) {
73                 out.write( "\t" + common );
74             }
75             else {
76                 out.write( "\t" );
77             }
78             if ( !ForesterUtil.isEmpty( tcode ) ) {
79                 out.write( "\t" + tcode );
80             }
81             else {
82                 out.write( "\t" );
83             }
84             if ( !ForesterUtil.isEmpty( rank ) ) {
85                 out.write( "\t" + rank );
86             }
87             else {
88                 out.write( "\t" );
89             }
90             final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
91             if ( node.isInternal() ) {
92                 out.write( "\t" + external_descs.size() + "\t" );
93             }
94             else {
95                 out.write( "\t\t" );
96             }
97             final List<Set<String>> features_per_genome_list = new ArrayList<Set<String>>();
98             boolean first = true;
99             for( final PhylogenyNode external_desc : external_descs ) {
100                 final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
101                 if ( node.isInternal() ) {
102                     if ( first ) {
103                         first = false;
104                     }
105                     else {
106                         out.write( ", " );
107                     }
108                     out.write( code );
109                 }
110                 final List<Protein> proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) );
111                 if ( proteins_per_species != null ) {
112                     final SortedSet<String> features_per_genome = new TreeSet<String>();
113                     for( final Protein protein : proteins_per_species ) {
114                         if ( use_domain_architectures ) {
115                             final String da = protein.toDomainArchitectureString( separator, ie_cutoff );
116                             features_per_genome.add( da );
117                         }
118                         else {
119                             List<Domain> domains = protein.getProteinDomains();
120                             for( final Domain domain : domains ) {
121                                 if ( ( ie_cutoff <= -1 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
122                                     features_per_genome.add( domain.getDomainId() );
123                                 }
124                             }
125                         }
126                     }
127                     if ( features_per_genome.size() > 0 ) {
128                         features_per_genome_list.add( features_per_genome );
129                     }
130                 }
131             }
132             if ( features_per_genome_list.size() > 0 ) {
133                 SortedSet<String> intersection = calcIntersection( features_per_genome_list );
134                 out.write( "\t" + intersection.size() + "\t" );
135                 first = true;
136                 for( final String s : intersection ) {
137                     if ( first ) {
138                         first = false;
139                     }
140                     else {
141                         out.write( ", " );
142                     }
143                     out.write( s );
144                 }
145                 out.write( ForesterUtil.LINE_SEPARATOR );
146                 species_to_features_map.put( species_name, intersection );
147             }
148         }
149         final SortedSet<String> all_species_names = new TreeSet<String>();
150         final SortedSet<String> all_features = new TreeSet<String>();
151         for( final Entry<String, SortedSet<String>> e : species_to_features_map.entrySet() ) {
152             all_species_names.add( e.getKey() );
153             for( final String f : e.getValue() ) {
154                 all_features.add( f );
155             }
156         }
157         out_table.write( '\t' );
158         boolean first = true;
159         for( final String species_name : all_species_names ) {
160             if ( first ) {
161                 first = false;
162             }
163             else {
164                 out_table.write( '\t' );
165             }
166             out_table.write( species_name );
167         }
168         out_table.write( ForesterUtil.LINE_SEPARATOR );
169         for( final String das : all_features ) {
170             out_table.write( das );
171             out_table.write( '\t' );
172             first = true;
173             for( final String species_name : all_species_names ) {
174                 if ( first ) {
175                     first = false;
176                 }
177                 else {
178                     out_table.write( '\t' );
179                 }
180                 if ( species_to_features_map.get( species_name ).contains( das ) ) {
181                     out_table.write( '1' );
182                 }
183                 else {
184                     out_table.write( '0' );
185                 }
186             }
187             out_table.write( ForesterUtil.LINE_SEPARATOR );
188         }
189         out.flush();
190         out.close();
191         out_table.flush();
192         out_table.close();
193         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to           : " + outfile );
194         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table );
195         for( String f : all_features ) {
196             final String a;
197             if ( use_domain_architectures ) {
198                 a = "DA_";
199             }
200             else {
201                 a = "domain_";
202             }
203             final File prot_dir = new File( outfile_base + "_prot" );
204             prot_dir.mkdir();
205             final File outt = new File( outfile_base + "_prot/" + a + f + surfacing.SEQ_EXTRACT_SUFFIX );
206             final Writer proteins_file_writer = new BufferedWriter( new FileWriter( outt ) );
207             extractProteinFeatures( use_domain_architectures,
208                                     protein_lists_per_species,
209                                     f,
210                                     proteins_file_writer,
211                                     ie_cutoff,
212                                     separator );
213             proteins_file_writer.close();
214         }
215     }
216
217     private final static SortedSet<String> calcIntersection( final List<Set<String>> features_per_genome_list ) {
218         final Set<String> first = features_per_genome_list.get( 0 );
219         final SortedSet<String> my_first = new TreeSet<String>();
220         for( final String s : first ) {
221             my_first.add( s );
222         }
223         for( int i = 1; i < features_per_genome_list.size(); ++i ) {
224             my_first.retainAll( features_per_genome_list.get( i ) );
225         }
226         return my_first;
227     }
228
229     public static void extractProteinFeatures( final boolean use_domain_architectures,
230                                                final SortedMap<Species, List<Protein>> protein_lists_per_species,
231                                                final String domain_id,
232                                                final Writer out,
233                                                final double ie_cutoff,
234                                                final String domain_separator )
235             throws IOException {
236         final String separator_for_output = "\t";
237         for( final Species species : protein_lists_per_species.keySet() ) {
238             final List<Protein> proteins_per_species = protein_lists_per_species.get( species );
239             for( final Protein protein : proteins_per_species ) {
240                 if ( use_domain_architectures ) {
241                     if ( domain_id.equals( protein.toDomainArchitectureString( domain_separator, ie_cutoff ) ) ) {
242                         int from = Integer.MAX_VALUE;
243                         int to = -1;
244                         for( final Domain d : protein.getProteinDomains() ) {
245                             if ( ( ie_cutoff <= -1 ) || ( d.getPerDomainEvalue() <= ie_cutoff ) ) {
246                                 if ( d.getFrom() < from ) {
247                                     from = d.getFrom();
248                                 }
249                                 if ( d.getTo() > to ) {
250                                     to = d.getTo();
251                                 }
252                             }
253                         }
254                         out.write( protein.getSpecies().getSpeciesId() );
255                         out.write( separator_for_output );
256                         out.write( protein.getProteinId().getId() );
257                         out.write( separator_for_output );
258                         out.write( domain_id );
259                         out.write( separator_for_output );
260                         out.write( "/" );
261                         out.write( from + "-" + to );
262                         out.write( "/" );
263                         out.write( SurfacingConstants.NL );
264                     }
265                 }
266                 else {
267                     final List<Domain> domains = protein.getProteinDomains( domain_id );
268                     if ( domains.size() > 0 ) {
269                         out.write( protein.getSpecies().getSpeciesId() );
270                         out.write( separator_for_output );
271                         out.write( protein.getProteinId().getId() );
272                         out.write( separator_for_output );
273                         out.write( domain_id );
274                         out.write( separator_for_output );
275                         for( final Domain domain : domains ) {
276                             if ( ( ie_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
277                                 out.write( "/" );
278                                 out.write( domain.getFrom() + "-" + domain.getTo() );
279                             }
280                         }
281                         out.write( "/" );
282                         out.write( separator_for_output );
283                         final List<Domain> domain_list = new ArrayList<Domain>();
284                         for( final Domain domain : protein.getProteinDomains() ) {
285                             if ( ( ie_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
286                                 domain_list.add( domain );
287                             }
288                         }
289                         final Domain domain_ary[] = new Domain[ domain_list.size() ];
290                         for( int i = 0; i < domain_list.size(); ++i ) {
291                             domain_ary[ i ] = domain_list.get( i );
292                         }
293                         Arrays.sort( domain_ary, new DomainComparator( true ) );
294                         out.write( "{" );
295                         boolean first = true;
296                         for( final Domain domain : domain_ary ) {
297                             if ( first ) {
298                                 first = false;
299                             }
300                             else {
301                                 out.write( "," );
302                             }
303                             out.write( domain.getDomainId().toString() );
304                             out.write( ":" + domain.getFrom() + "-" + domain.getTo() );
305                             out.write( ":" + domain.getPerDomainEvalue() );
306                         }
307                         out.write( "}" );
308                         if ( !( ForesterUtil.isEmpty( protein.getDescription() )
309                                 || protein.getDescription().equals( SurfacingConstants.NONE ) ) ) {
310                             out.write( protein.getDescription() );
311                         }
312                         out.write( separator_for_output );
313                         if ( !( ForesterUtil.isEmpty( protein.getAccession() )
314                                 || protein.getAccession().equals( SurfacingConstants.NONE ) ) ) {
315                             out.write( protein.getAccession() );
316                         }
317                         out.write( SurfacingConstants.NL );
318                     }
319                 }
320             }
321         }
322         out.flush();
323     }
324
325     public static void main( final String[] args ) {
326         Set<String> a = new HashSet<String>();
327         Set<String> b = new HashSet<String>();
328         Set<String> c = new HashSet<String>();
329         Set<String> d = new HashSet<String>();
330         a.add( "x" );
331         a.add( "b" );
332         a.add( "c" );
333         b.add( "a" );
334         b.add( "b" );
335         b.add( "c" );
336         c.add( "a" );
337         c.add( "b" );
338         c.add( "c" );
339         c.add( "c" );
340         c.add( "f" );
341         d.add( "a" );
342         d.add( "c" );
343         d.add( "d" );
344         List<Set<String>> domains_per_genome_list = new ArrayList<Set<String>>();
345         domains_per_genome_list.add( a );
346         domains_per_genome_list.add( b );
347         domains_per_genome_list.add( c );
348         domains_per_genome_list.add( d );
349         Set<String> x = calcIntersection( domains_per_genome_list );
350         System.out.println( x );
351     }
352 }