d48cc2b2efa9df6b69b4e79294e5ece5326e4575
[jalview.git] / forester / java / src / org / forester / surfacing / MinimalDomainomeCalculator.java
1
2 package org.forester.surfacing;
3
4 import java.io.BufferedWriter;
5 import java.io.File;
6 import java.io.FileWriter;
7 import java.io.IOException;
8 import java.io.Writer;
9 import java.util.ArrayList;
10 import java.util.Arrays;
11 import java.util.HashSet;
12 import java.util.List;
13 import java.util.Map.Entry;
14 import java.util.Set;
15 import java.util.SortedMap;
16 import java.util.SortedSet;
17 import java.util.TreeMap;
18 import java.util.TreeSet;
19
20 import org.forester.application.surfacing;
21 import org.forester.phylogeny.Phylogeny;
22 import org.forester.phylogeny.PhylogenyMethods;
23 import org.forester.phylogeny.PhylogenyNode;
24 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
25 import org.forester.protein.Domain;
26 import org.forester.protein.Protein;
27 import org.forester.species.BasicSpecies;
28 import org.forester.species.Species;
29 import org.forester.surfacing.SurfacingUtil.DomainComparator;
30 import org.forester.util.ForesterUtil;
31
32 public final class MinimalDomainomeCalculator {
33
34     public final static void calc( final boolean use_domain_architectures,
35                                    final Phylogeny tre,
36                                    final int target_level,
37                                    final SortedMap<Species, List<Protein>> protein_lists_per_species,
38                                    final String separator,
39                                    final double ie_cutoff,
40                                    final String outfile_base,
41                                    final boolean write_protein_files )
42             throws IOException {
43         final SortedMap<String, SortedSet<String>> species_to_features_map = new TreeMap<String, SortedSet<String>>();
44         if ( protein_lists_per_species == null || tre == null ) {
45             throw new IllegalArgumentException( "argument is null" );
46         }
47         if ( protein_lists_per_species.size() < 2 ) {
48             throw new IllegalArgumentException( "not enough genomes" );
49         }
50         final String x;
51         if ( use_domain_architectures ) {
52             x = "DA";
53         }
54         else {
55             x = "domain";
56         }
57         final File outfile = new File( outfile_base + "_minimal_" + x + "ome.tsv" );
58         final File outfile_table = new File( outfile_base + "_minimal_" + x + "ome_matrix.tsv" );
59         SurfacingUtil.checkForOutputFileWriteability( outfile );
60         SurfacingUtil.checkForOutputFileWriteability( outfile_table );
61         final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
62         final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) );
63         out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#" + x + "\t" + x + "" );
64         out.write( ForesterUtil.LINE_SEPARATOR );
65         ///////////
66         //////////
67         SortedMap<String, List<Protein>> protein_lists_per_quasi_species = null;
68         if ( target_level >= 1 ) {
69             protein_lists_per_quasi_species = makeProteinListsPerQuasiSpecies( tre,
70                                                                                target_level,
71                                                                                protein_lists_per_species );
72            
73         }
74         /////////
75         ///////////
76         for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
77             final PhylogenyNode node = iter.next();
78             final int node_level = PhylogenyMethods.calculateLevel( node );
79             final String species_name = node.getNodeData().isHasTaxonomy()
80                     ? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
81             final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName()
82                     : "";
83             final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode()
84                     : "";
85             final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : "";
86             final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
87             if ( ( target_level < 1 ) || ( node_level >= target_level ) ) {
88                 out.write( species_name + " " + node_level);
89                 if ( !ForesterUtil.isEmpty( common ) ) {
90                     out.write( "\t" + common );
91                 }
92                 else {
93                     out.write( "\t" );
94                 }
95                 if ( !ForesterUtil.isEmpty( tcode ) ) {
96                     out.write( "\t" + tcode );
97                 }
98                 else {
99                     out.write( "\t" );
100                 }
101                 if ( !ForesterUtil.isEmpty( rank ) ) {
102                     out.write( "\t" + rank );
103                 }
104                 else {
105                     out.write( "\t" );
106                 }
107                 if ( node.isInternal() ) {
108                     out.write( "\t" + external_descs.size() + "\t" );
109                 }
110                 else {
111                     out.write( "\t\t" );
112                 }
113             }
114             final List<Set<String>> features_per_genome_list = new ArrayList<Set<String>>();
115             boolean first = true;
116             if ( target_level >= 1 ) {
117                 ////////////
118                 ////////////
119                 if ( node_level >= target_level ) {
120                     final List<PhylogenyNode> given_level_descs = PhylogenyMethods
121                             .getAllDescendantsOfGivenLevel( node, target_level );
122                     for( final PhylogenyNode given_level_desc : given_level_descs ) {
123                         final String spec_name = given_level_desc.getNodeData().isHasTaxonomy()
124                                 ? given_level_desc.getNodeData().getTaxonomy().getScientificName()
125                                 : given_level_desc.getName();
126                         if ( node.isInternal() ) {
127                             if ( first ) {
128                                 first = false;
129                             }
130                             else {
131                                 out.write( ", " );
132                             }
133                             out.write( "sp_n=" + spec_name );
134                         }
135                         final List<Protein> proteins_per_species = protein_lists_per_quasi_species.get( spec_name );
136                         if ( proteins_per_species != null ) {
137                             final SortedSet<String> features_per_genome = new TreeSet<String>();
138                             for( final Protein protein : proteins_per_species ) {
139                                 if ( use_domain_architectures ) {
140                                     final String da = protein.toDomainArchitectureString( separator, ie_cutoff );
141                                     features_per_genome.add( da );
142                                 }
143                                 else {
144                                     List<Domain> domains = protein.getProteinDomains();
145                                     for( final Domain domain : domains ) {
146                                         if ( ( ie_cutoff <= -1 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
147                                             features_per_genome.add( domain.getDomainId() );
148                                         }
149                                     }
150                                 }
151                             }
152                             System.out.println( ">>>>>>>>>>>>>> features_per_genome.size()=" + features_per_genome.size() );
153                             if ( features_per_genome.size() > 0 ) {
154                                 features_per_genome_list.add( features_per_genome );
155                             }
156                             else {
157                                 System.out.println( "error!" );
158                                 System.exit( -1 );
159                             }
160                         }
161                         else {
162                             System.out.println( "error!" );
163                             System.exit( -1 );
164                         }
165                     }
166                 }
167                 ///////////
168                 ///////////
169             }
170             else {
171                 for( final PhylogenyNode external_desc : external_descs ) {
172                     final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
173                     if ( node.isInternal() ) {
174                         if ( first ) {
175                             first = false;
176                         }
177                         else {
178                             out.write( ", " );
179                         }
180                         out.write( code );
181                     }
182                     final List<Protein> proteins_per_species = protein_lists_per_species
183                             .get( new BasicSpecies( code ) );
184                     if ( proteins_per_species != null ) {
185                         final SortedSet<String> features_per_genome = new TreeSet<String>();
186                         for( final Protein protein : proteins_per_species ) {
187                             if ( use_domain_architectures ) {
188                                 final String da = protein.toDomainArchitectureString( separator, ie_cutoff );
189                                 features_per_genome.add( da );
190                             }
191                             else {
192                                 List<Domain> domains = protein.getProteinDomains();
193                                 for( final Domain domain : domains ) {
194                                     if ( ( ie_cutoff <= -1 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
195                                         features_per_genome.add( domain.getDomainId() );
196                                     }
197                                 }
198                             }
199                         }
200                         if ( features_per_genome.size() > 0 ) {
201                             features_per_genome_list.add( features_per_genome );
202                         }
203                     }
204                 } // for( final PhylogenyNode external_desc : external_descs )
205             } // else
206             if ( features_per_genome_list.size() > 0 ) {
207                 SortedSet<String> intersection = calcIntersection( features_per_genome_list );
208                 out.write( "\t" + intersection.size() + "\t" );
209                 first = true;
210                 for( final String s : intersection ) {
211                     if ( first ) {
212                         first = false;
213                     }
214                     else {
215                         out.write( ", " );
216                     }
217                     out.write( s );
218                 }
219                 out.write( ForesterUtil.LINE_SEPARATOR );
220                 species_to_features_map.put( species_name, intersection );
221             }
222         }
223         final SortedSet<String> all_species_names = new TreeSet<String>();
224         final SortedSet<String> all_features = new TreeSet<String>();
225         for( final Entry<String, SortedSet<String>> e : species_to_features_map.entrySet() ) {
226             all_species_names.add( e.getKey() );
227             for( final String f : e.getValue() ) {
228                 all_features.add( f );
229             }
230         }
231         out_table.write( '\t' );
232         boolean first = true;
233         for( final String species_name : all_species_names ) {
234             if ( first ) {
235                 first = false;
236             }
237             else {
238                 out_table.write( '\t' );
239             }
240             out_table.write( species_name );
241         }
242         out_table.write( ForesterUtil.LINE_SEPARATOR );
243         for( final String das : all_features ) {
244             out_table.write( das );
245             out_table.write( '\t' );
246             first = true;
247             for( final String species_name : all_species_names ) {
248                 if ( first ) {
249                     first = false;
250                 }
251                 else {
252                     out_table.write( '\t' );
253                 }
254                 if ( species_to_features_map.get( species_name ).contains( das ) ) {
255                     out_table.write( '1' );
256                 }
257                 else {
258                     out_table.write( '0' );
259                 }
260             }
261             out_table.write( ForesterUtil.LINE_SEPARATOR );
262         }
263         out.flush();
264         out.close();
265         out_table.flush();
266         out_table.close();
267         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to           : " + outfile );
268         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table );
269         if ( write_protein_files ) {
270             final String protdirname;
271             final String a;
272             final String b;
273             if ( use_domain_architectures ) {
274                 a = "_DA";
275                 b = "domain architectures (DAs)";
276                 protdirname = "_DAS";
277             }
278             else {
279                 a = "_domain";
280                 b = "domains";
281                 protdirname = "_DOMAINS";
282             }
283             final File prot_dir = new File( outfile_base + protdirname );
284             final boolean success = prot_dir.mkdir();
285             if ( !success ) {
286                 throw new IOException( "failed to create dir " + prot_dir );
287             }
288             int total = 0;
289             final String dir = outfile_base + protdirname + "/";
290             for( final String feat : all_features ) {
291                 final File extract_outfile = new File( dir + feat + a + surfacing.SEQ_EXTRACT_SUFFIX );
292                 SurfacingUtil.checkForOutputFileWriteability( extract_outfile );
293                 final Writer proteins_file_writer = new BufferedWriter( new FileWriter( extract_outfile ) );
294                 final int counter = extractProteinFeatures( use_domain_architectures,
295                                                             protein_lists_per_species,
296                                                             feat,
297                                                             proteins_file_writer,
298                                                             ie_cutoff,
299                                                             separator );
300                 if ( counter < 1 ) {
301                     ForesterUtil.printWarningMessage( "surfacing", feat + " not present (in " + b + " extraction)" );
302                 }
303                 total += counter;
304                 proteins_file_writer.close();
305             }
306             ForesterUtil.programMessage( "surfacing",
307                                          "Wrote " + total + " individual " + b + " from a total of "
308                                                  + all_features.size() + " into: " + dir );
309         }
310     }
311
312     private final static SortedMap<String, List<Protein>> makeProteinListsPerQuasiSpecies( final Phylogeny tre,
313                                                                                            final int level,
314                                                                                            final SortedMap<Species, List<Protein>> protein_lists_per_species ) {
315         final SortedMap<String, List<Protein>> protein_lists_per_quasi_species = new TreeMap<String, List<Protein>>();
316         System.out.println( "---------------------------------" );
317         System.out.println( "level=" + level );
318         for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
319             final PhylogenyNode node = iter.next();
320             final int node_level = PhylogenyMethods.calculateLevel( node );
321             if ( node_level == level ) {
322                 System.out.println( "level=" + level );
323                 final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
324                 final List<Protein> protein_list_per_quasi_species = new ArrayList<Protein>();
325                 for( final PhylogenyNode external_desc : external_descs ) {
326                     final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
327                     final List<Protein> proteins_per_species = protein_lists_per_species
328                             .get( new BasicSpecies( code ) );
329                     //System.out.println( code );
330                     for( Protein protein : proteins_per_species ) {
331                         protein_list_per_quasi_species.add( protein );
332                     }
333                 }
334                 final String species_name = node.getNodeData().isHasTaxonomy()
335                         ? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
336                 System.out.println( "species_name=" + species_name );
337                 protein_lists_per_quasi_species.put( species_name, protein_list_per_quasi_species );
338                 System.out.println( ">>>>" + protein_list_per_quasi_species.size() );
339             }
340         }
341       
342         return protein_lists_per_quasi_species;
343     }
344
345     private final static SortedSet<String> calcIntersection( final List<Set<String>> features_per_genome_list ) {
346         final Set<String> first = features_per_genome_list.get( 0 );
347         final SortedSet<String> my_first = new TreeSet<String>();
348         for( final String s : first ) {
349             my_first.add( s );
350         }
351         for( int i = 1; i < features_per_genome_list.size(); ++i ) {
352             my_first.retainAll( features_per_genome_list.get( i ) );
353         }
354         return my_first;
355     }
356
357     private final static int extractProteinFeatures( final boolean use_domain_architectures,
358                                                      final SortedMap<Species, List<Protein>> protein_lists_per_species,
359                                                      final String domain_id,
360                                                      final Writer out,
361                                                      final double ie_cutoff,
362                                                      final String domain_separator )
363             throws IOException {
364         int counter = 0;
365         final String separator_for_output = "\t";
366         for( final Species species : protein_lists_per_species.keySet() ) {
367             final List<Protein> proteins_per_species = protein_lists_per_species.get( species );
368             for( final Protein protein : proteins_per_species ) {
369                 if ( use_domain_architectures ) {
370                     if ( domain_id.equals( protein.toDomainArchitectureString( domain_separator, ie_cutoff ) ) ) {
371                         int from = Integer.MAX_VALUE;
372                         int to = -1;
373                         for( final Domain d : protein.getProteinDomains() ) {
374                             if ( ( ie_cutoff <= -1 ) || ( d.getPerDomainEvalue() <= ie_cutoff ) ) {
375                                 if ( d.getFrom() < from ) {
376                                     from = d.getFrom();
377                                 }
378                                 if ( d.getTo() > to ) {
379                                     to = d.getTo();
380                                 }
381                             }
382                         }
383                         out.write( protein.getSpecies().getSpeciesId() );
384                         out.write( separator_for_output );
385                         out.write( protein.getProteinId().getId() );
386                         out.write( separator_for_output );
387                         out.write( domain_id );
388                         out.write( separator_for_output );
389                         out.write( "/" );
390                         out.write( from + "-" + to );
391                         out.write( "/" );
392                         out.write( SurfacingConstants.NL );
393                         ++counter;
394                     }
395                 }
396                 else {
397                     final List<Domain> domains = protein.getProteinDomains( domain_id );
398                     if ( domains.size() > 0 ) {
399                         out.write( protein.getSpecies().getSpeciesId() );
400                         out.write( separator_for_output );
401                         out.write( protein.getProteinId().getId() );
402                         out.write( separator_for_output );
403                         out.write( domain_id );
404                         out.write( separator_for_output );
405                         for( final Domain domain : domains ) {
406                             if ( ( ie_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
407                                 out.write( "/" );
408                                 out.write( domain.getFrom() + "-" + domain.getTo() );
409                             }
410                         }
411                         out.write( "/" );
412                         out.write( separator_for_output );
413                         final List<Domain> domain_list = new ArrayList<Domain>();
414                         for( final Domain domain : protein.getProteinDomains() ) {
415                             if ( ( ie_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
416                                 domain_list.add( domain );
417                             }
418                         }
419                         final Domain domain_ary[] = new Domain[ domain_list.size() ];
420                         for( int i = 0; i < domain_list.size(); ++i ) {
421                             domain_ary[ i ] = domain_list.get( i );
422                         }
423                         Arrays.sort( domain_ary, new DomainComparator( true ) );
424                         out.write( "{" );
425                         boolean first = true;
426                         for( final Domain domain : domain_ary ) {
427                             if ( first ) {
428                                 first = false;
429                             }
430                             else {
431                                 out.write( "," );
432                             }
433                             out.write( domain.getDomainId().toString() );
434                             out.write( ":" + domain.getFrom() + "-" + domain.getTo() );
435                             out.write( ":" + domain.getPerDomainEvalue() );
436                         }
437                         out.write( "}" );
438                         if ( !( ForesterUtil.isEmpty( protein.getDescription() )
439                                 || protein.getDescription().equals( SurfacingConstants.NONE ) ) ) {
440                             out.write( protein.getDescription() );
441                         }
442                         out.write( separator_for_output );
443                         if ( !( ForesterUtil.isEmpty( protein.getAccession() )
444                                 || protein.getAccession().equals( SurfacingConstants.NONE ) ) ) {
445                             out.write( protein.getAccession() );
446                         }
447                         out.write( SurfacingConstants.NL );
448                         ++counter;
449                     }
450                 }
451             }
452         }
453         out.flush();
454         return counter;
455     }
456
457     public static void main( final String[] args ) {
458         Set<String> a = new HashSet<String>();
459         Set<String> b = new HashSet<String>();
460         Set<String> c = new HashSet<String>();
461         Set<String> d = new HashSet<String>();
462         a.add( "x" );
463         a.add( "b" );
464         a.add( "c" );
465         b.add( "a" );
466         b.add( "b" );
467         b.add( "c" );
468         c.add( "a" );
469         c.add( "b" );
470         c.add( "c" );
471         c.add( "c" );
472         c.add( "f" );
473         d.add( "a" );
474         d.add( "c" );
475         d.add( "d" );
476         List<Set<String>> domains_per_genome_list = new ArrayList<Set<String>>();
477         domains_per_genome_list.add( a );
478         domains_per_genome_list.add( b );
479         domains_per_genome_list.add( c );
480         domains_per_genome_list.add( d );
481         Set<String> x = calcIntersection( domains_per_genome_list );
482         System.out.println( x );
483     }
484 }