in progress...
[jalview.git] / forester / java / src / org / forester / surfacing / MinimalDomainomeCalculator.java
1
2 package org.forester.surfacing;
3
4 import java.io.BufferedWriter;
5 import java.io.File;
6 import java.io.FileWriter;
7 import java.io.IOException;
8 import java.io.Writer;
9 import java.util.ArrayList;
10 import java.util.Arrays;
11 import java.util.HashSet;
12 import java.util.List;
13 import java.util.Map.Entry;
14 import java.util.Set;
15 import java.util.SortedMap;
16 import java.util.SortedSet;
17 import java.util.TreeMap;
18 import java.util.TreeSet;
19
20 import org.forester.application.surfacing;
21 import org.forester.phylogeny.Phylogeny;
22 import org.forester.phylogeny.PhylogenyNode;
23 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
24 import org.forester.protein.Domain;
25 import org.forester.protein.Protein;
26 import org.forester.species.BasicSpecies;
27 import org.forester.species.Species;
28 import org.forester.surfacing.SurfacingUtil.DomainComparator;
29 import org.forester.util.ForesterUtil;
30
31 public final class MinimalDomainomeCalculator {
32
33     public final static void calc( final boolean use_domain_architectures,
34                                    final Phylogeny tre,
35                                    final SortedMap<Species, List<Protein>> protein_lists_per_species,
36                                    final String separator,
37                                    final double ie_cutoff,
38                                    final String outfile_base,
39                                    final boolean write_protein_files )
40             throws IOException {
41         final SortedMap<String, SortedSet<String>> species_to_features_map = new TreeMap<String, SortedSet<String>>();
42         if ( protein_lists_per_species == null || tre == null ) {
43             throw new IllegalArgumentException( "argument is null" );
44         }
45         if ( protein_lists_per_species.size() < 2 ) {
46             throw new IllegalArgumentException( "not enough genomes" );
47         }
48         final String x;
49         if ( use_domain_architectures ) {
50             x = "DA";
51         }
52         else {
53             x = "domain";
54         }
55         final File outfile = new File( outfile_base + "_minimal_" + x + "ome.tsv" );
56         final File outfile_table = new File( outfile_base + "_minimal_" + x + "ome_matrix.tsv" );
57         SurfacingUtil.checkForOutputFileWriteability( outfile );
58         SurfacingUtil.checkForOutputFileWriteability( outfile_table );
59         final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
60         final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) );
61         out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#" + x + "\t" + x + "" );
62         out.write( ForesterUtil.LINE_SEPARATOR );
63         for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
64             final PhylogenyNode node = iter.next();
65             final String species_name = node.getNodeData().isHasTaxonomy()
66                     ? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
67             final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName()
68                     : "";
69             final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode()
70                     : "";
71             final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : "";
72             out.write( species_name );
73             if ( !ForesterUtil.isEmpty( common ) ) {
74                 out.write( "\t" + common );
75             }
76             else {
77                 out.write( "\t" );
78             }
79             if ( !ForesterUtil.isEmpty( tcode ) ) {
80                 out.write( "\t" + tcode );
81             }
82             else {
83                 out.write( "\t" );
84             }
85             if ( !ForesterUtil.isEmpty( rank ) ) {
86                 out.write( "\t" + rank );
87             }
88             else {
89                 out.write( "\t" );
90             }
91             final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
92             if ( node.isInternal() ) {
93                 out.write( "\t" + external_descs.size() + "\t" );
94             }
95             else {
96                 out.write( "\t\t" );
97             }
98             final List<Set<String>> features_per_genome_list = new ArrayList<Set<String>>();
99             boolean first = true;
100             for( final PhylogenyNode external_desc : external_descs ) {
101                 final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
102                 if ( node.isInternal() ) {
103                     if ( first ) {
104                         first = false;
105                     }
106                     else {
107                         out.write( ", " );
108                     }
109                     out.write( code );
110                 }
111                 final List<Protein> proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) );
112                 if ( proteins_per_species != null ) {
113                     final SortedSet<String> features_per_genome = new TreeSet<String>();
114                     for( final Protein protein : proteins_per_species ) {
115                         if ( use_domain_architectures ) {
116                             final String da = protein.toDomainArchitectureString( separator, ie_cutoff );
117                             features_per_genome.add( da );
118                         }
119                         else {
120                             List<Domain> domains = protein.getProteinDomains();
121                             for( final Domain domain : domains ) {
122                                 if ( ( ie_cutoff <= -1 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
123                                     features_per_genome.add( domain.getDomainId() );
124                                 }
125                             }
126                         }
127                     }
128                     if ( features_per_genome.size() > 0 ) {
129                         features_per_genome_list.add( features_per_genome );
130                     }
131                 }
132             }
133             if ( features_per_genome_list.size() > 0 ) {
134                 SortedSet<String> intersection = calcIntersection( features_per_genome_list );
135                 out.write( "\t" + intersection.size() + "\t" );
136                 first = true;
137                 for( final String s : intersection ) {
138                     if ( first ) {
139                         first = false;
140                     }
141                     else {
142                         out.write( ", " );
143                     }
144                     out.write( s );
145                 }
146                 out.write( ForesterUtil.LINE_SEPARATOR );
147                 species_to_features_map.put( species_name, intersection );
148             }
149         }
150         final SortedSet<String> all_species_names = new TreeSet<String>();
151         final SortedSet<String> all_features = new TreeSet<String>();
152         for( final Entry<String, SortedSet<String>> e : species_to_features_map.entrySet() ) {
153             all_species_names.add( e.getKey() );
154             for( final String f : e.getValue() ) {
155                 all_features.add( f );
156             }
157         }
158         out_table.write( '\t' );
159         boolean first = true;
160         for( final String species_name : all_species_names ) {
161             if ( first ) {
162                 first = false;
163             }
164             else {
165                 out_table.write( '\t' );
166             }
167             out_table.write( species_name );
168         }
169         out_table.write( ForesterUtil.LINE_SEPARATOR );
170         for( final String das : all_features ) {
171             out_table.write( das );
172             out_table.write( '\t' );
173             first = true;
174             for( final String species_name : all_species_names ) {
175                 if ( first ) {
176                     first = false;
177                 }
178                 else {
179                     out_table.write( '\t' );
180                 }
181                 if ( species_to_features_map.get( species_name ).contains( das ) ) {
182                     out_table.write( '1' );
183                 }
184                 else {
185                     out_table.write( '0' );
186                 }
187             }
188             out_table.write( ForesterUtil.LINE_SEPARATOR );
189         }
190         out.flush();
191         out.close();
192         out_table.flush();
193         out_table.close();
194         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to           : " + outfile );
195         ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table );
196         if ( write_protein_files ) {
197             final String protdirname;
198             final String a;
199             final String b;
200             if ( use_domain_architectures ) {
201                 a = "_DA";
202                 b = "domain architectures (DAs)";
203                 protdirname = "_DAS";
204             }
205             else {
206                 a = "_domain";
207                 b = "domains";
208                 protdirname = "_DOMAINS";
209             }
210             final File prot_dir = new File( outfile_base + protdirname );
211             final boolean success = prot_dir.mkdir();
212             if ( !success ) {
213                 throw new IOException( "failed to create dir " + prot_dir );
214             }
215             int total = 0;
216             final String dir = outfile_base + protdirname + "/";
217             for( final String feat : all_features ) {
218                 final File extract_outfile = new File( dir + feat + a + surfacing.SEQ_EXTRACT_SUFFIX );
219                 SurfacingUtil.checkForOutputFileWriteability( extract_outfile );
220                 final Writer proteins_file_writer = new BufferedWriter( new FileWriter( extract_outfile ) );
221                 final int counter = extractProteinFeatures( use_domain_architectures,
222                                                             protein_lists_per_species,
223                                                             feat,
224                                                             proteins_file_writer,
225                                                             ie_cutoff,
226                                                             separator );
227                 if ( counter < 1 ) {
228                     ForesterUtil.printWarningMessage( "surfacing", feat + " not present (in " + b + " extraction)" );
229                 }
230                 total += counter;
231                 proteins_file_writer.close();
232             }
233             ForesterUtil.programMessage( "surfacing",
234                                          "Wrote " + total + " individual " + b + " from a total of "
235                                                  + all_features.size() + " into: " + dir );
236         }
237     }
238
239     private final static SortedSet<String> calcIntersection( final List<Set<String>> features_per_genome_list ) {
240         final Set<String> first = features_per_genome_list.get( 0 );
241         final SortedSet<String> my_first = new TreeSet<String>();
242         for( final String s : first ) {
243             my_first.add( s );
244         }
245         for( int i = 1; i < features_per_genome_list.size(); ++i ) {
246             my_first.retainAll( features_per_genome_list.get( i ) );
247         }
248         return my_first;
249     }
250
251     private final static int extractProteinFeatures( final boolean use_domain_architectures,
252                                                      final SortedMap<Species, List<Protein>> protein_lists_per_species,
253                                                      final String domain_id,
254                                                      final Writer out,
255                                                      final double ie_cutoff,
256                                                      final String domain_separator )
257             throws IOException {
258         int counter = 0;
259         final String separator_for_output = "\t";
260         for( final Species species : protein_lists_per_species.keySet() ) {
261             final List<Protein> proteins_per_species = protein_lists_per_species.get( species );
262             for( final Protein protein : proteins_per_species ) {
263                 if ( use_domain_architectures ) {
264                     if ( domain_id.equals( protein.toDomainArchitectureString( domain_separator, ie_cutoff ) ) ) {
265                         int from = Integer.MAX_VALUE;
266                         int to = -1;
267                         for( final Domain d : protein.getProteinDomains() ) {
268                             if ( ( ie_cutoff <= -1 ) || ( d.getPerDomainEvalue() <= ie_cutoff ) ) {
269                                 if ( d.getFrom() < from ) {
270                                     from = d.getFrom();
271                                 }
272                                 if ( d.getTo() > to ) {
273                                     to = d.getTo();
274                                 }
275                             }
276                         }
277                         out.write( protein.getSpecies().getSpeciesId() );
278                         out.write( separator_for_output );
279                         out.write( protein.getProteinId().getId() );
280                         out.write( separator_for_output );
281                         out.write( domain_id );
282                         out.write( separator_for_output );
283                         out.write( "/" );
284                         out.write( from + "-" + to );
285                         out.write( "/" );
286                         out.write( SurfacingConstants.NL );
287                         ++counter;
288                     }
289                 }
290                 else {
291                     final List<Domain> domains = protein.getProteinDomains( domain_id );
292                     if ( domains.size() > 0 ) {
293                         out.write( protein.getSpecies().getSpeciesId() );
294                         out.write( separator_for_output );
295                         out.write( protein.getProteinId().getId() );
296                         out.write( separator_for_output );
297                         out.write( domain_id );
298                         out.write( separator_for_output );
299                         for( final Domain domain : domains ) {
300                             if ( ( ie_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
301                                 out.write( "/" );
302                                 out.write( domain.getFrom() + "-" + domain.getTo() );
303                             }
304                         }
305                         out.write( "/" );
306                         out.write( separator_for_output );
307                         final List<Domain> domain_list = new ArrayList<Domain>();
308                         for( final Domain domain : protein.getProteinDomains() ) {
309                             if ( ( ie_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
310                                 domain_list.add( domain );
311                             }
312                         }
313                         final Domain domain_ary[] = new Domain[ domain_list.size() ];
314                         for( int i = 0; i < domain_list.size(); ++i ) {
315                             domain_ary[ i ] = domain_list.get( i );
316                         }
317                         Arrays.sort( domain_ary, new DomainComparator( true ) );
318                         out.write( "{" );
319                         boolean first = true;
320                         for( final Domain domain : domain_ary ) {
321                             if ( first ) {
322                                 first = false;
323                             }
324                             else {
325                                 out.write( "," );
326                             }
327                             out.write( domain.getDomainId().toString() );
328                             out.write( ":" + domain.getFrom() + "-" + domain.getTo() );
329                             out.write( ":" + domain.getPerDomainEvalue() );
330                         }
331                         out.write( "}" );
332                         if ( !( ForesterUtil.isEmpty( protein.getDescription() )
333                                 || protein.getDescription().equals( SurfacingConstants.NONE ) ) ) {
334                             out.write( protein.getDescription() );
335                         }
336                         out.write( separator_for_output );
337                         if ( !( ForesterUtil.isEmpty( protein.getAccession() )
338                                 || protein.getAccession().equals( SurfacingConstants.NONE ) ) ) {
339                             out.write( protein.getAccession() );
340                         }
341                         out.write( SurfacingConstants.NL );
342                         ++counter;
343                     }
344                 }
345             }
346         }
347         out.flush();
348         return counter;
349     }
350
351     public static void main( final String[] args ) {
352         Set<String> a = new HashSet<String>();
353         Set<String> b = new HashSet<String>();
354         Set<String> c = new HashSet<String>();
355         Set<String> d = new HashSet<String>();
356         a.add( "x" );
357         a.add( "b" );
358         a.add( "c" );
359         b.add( "a" );
360         b.add( "b" );
361         b.add( "c" );
362         c.add( "a" );
363         c.add( "b" );
364         c.add( "c" );
365         c.add( "c" );
366         c.add( "f" );
367         d.add( "a" );
368         d.add( "c" );
369         d.add( "d" );
370         List<Set<String>> domains_per_genome_list = new ArrayList<Set<String>>();
371         domains_per_genome_list.add( a );
372         domains_per_genome_list.add( b );
373         domains_per_genome_list.add( c );
374         domains_per_genome_list.add( d );
375         Set<String> x = calcIntersection( domains_per_genome_list );
376         System.out.println( x );
377     }
378 }