2 package org.forester.surfacing;
4 import java.io.BufferedWriter;
6 import java.io.FileWriter;
7 import java.io.IOException;
9 import java.util.ArrayList;
10 import java.util.Arrays;
11 import java.util.HashSet;
12 import java.util.List;
13 import java.util.Map.Entry;
15 import java.util.SortedMap;
16 import java.util.SortedSet;
17 import java.util.TreeMap;
18 import java.util.TreeSet;
20 import org.forester.application.surfacing;
21 import org.forester.phylogeny.Phylogeny;
22 import org.forester.phylogeny.PhylogenyMethods;
23 import org.forester.phylogeny.PhylogenyNode;
24 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
25 import org.forester.protein.Domain;
26 import org.forester.protein.Protein;
27 import org.forester.species.BasicSpecies;
28 import org.forester.species.Species;
29 import org.forester.surfacing.SurfacingUtil.DomainComparator;
30 import org.forester.util.ForesterUtil;
32 public final class MinimalDomainomeCalculator {
34 public final static void calc( final boolean use_domain_architectures,
36 final SortedMap<Species, List<Protein>> protein_lists_per_species,
37 final String separator,
38 final double ie_cutoff,
39 final String outfile_base,
40 final boolean write_protein_files )
42 final SortedMap<String, SortedSet<String>> species_to_features_map = new TreeMap<String, SortedSet<String>>();
43 if ( protein_lists_per_species == null || tre == null ) {
44 throw new IllegalArgumentException( "argument is null" );
46 if ( protein_lists_per_species.size() < 2 ) {
47 throw new IllegalArgumentException( "not enough genomes" );
50 if ( use_domain_architectures ) {
56 final File outfile = new File( outfile_base + "_minimal_" + x + "ome.tsv" );
57 final File outfile_table = new File( outfile_base + "_minimal_" + x + "ome_matrix.tsv" );
58 SurfacingUtil.checkForOutputFileWriteability( outfile );
59 SurfacingUtil.checkForOutputFileWriteability( outfile_table );
60 final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
61 final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) );
62 out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#" + x + "\t" + x + "" );
63 out.write( ForesterUtil.LINE_SEPARATOR );
64 for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
65 final PhylogenyNode node = iter.next();
66 final String species_name = node.getNodeData().isHasTaxonomy()
67 ? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
68 final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName()
70 final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode()
72 final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : "";
73 out.write( species_name );
74 if ( !ForesterUtil.isEmpty( common ) ) {
75 out.write( "\t" + common );
80 if ( !ForesterUtil.isEmpty( tcode ) ) {
81 out.write( "\t" + tcode );
86 if ( !ForesterUtil.isEmpty( rank ) ) {
87 out.write( "\t" + rank );
92 final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
93 if ( node.isInternal() ) {
94 out.write( "\t" + external_descs.size() + "\t" );
99 final List<Set<String>> features_per_genome_list = new ArrayList<Set<String>>();
100 boolean first = true;
101 for( final PhylogenyNode external_desc : external_descs ) {
102 final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
103 if ( node.isInternal() ) {
112 final List<Protein> proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) );
113 if ( proteins_per_species != null ) {
114 final SortedSet<String> features_per_genome = new TreeSet<String>();
115 for( final Protein protein : proteins_per_species ) {
116 if ( use_domain_architectures ) {
117 final String da = protein.toDomainArchitectureString( separator, ie_cutoff );
118 features_per_genome.add( da );
121 List<Domain> domains = protein.getProteinDomains();
122 for( final Domain domain : domains ) {
123 if ( ( ie_cutoff <= -1 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
124 features_per_genome.add( domain.getDomainId() );
129 if ( features_per_genome.size() > 0 ) {
130 features_per_genome_list.add( features_per_genome );
134 if ( features_per_genome_list.size() > 0 ) {
135 SortedSet<String> intersection = calcIntersection( features_per_genome_list );
136 out.write( "\t" + intersection.size() + "\t" );
138 for( final String s : intersection ) {
147 out.write( ForesterUtil.LINE_SEPARATOR );
148 species_to_features_map.put( species_name, intersection );
151 final SortedSet<String> all_species_names = new TreeSet<String>();
152 final SortedSet<String> all_features = new TreeSet<String>();
153 for( final Entry<String, SortedSet<String>> e : species_to_features_map.entrySet() ) {
154 all_species_names.add( e.getKey() );
155 for( final String f : e.getValue() ) {
156 all_features.add( f );
159 out_table.write( '\t' );
160 boolean first = true;
161 for( final String species_name : all_species_names ) {
166 out_table.write( '\t' );
168 out_table.write( species_name );
170 out_table.write( ForesterUtil.LINE_SEPARATOR );
171 for( final String das : all_features ) {
172 out_table.write( das );
173 out_table.write( '\t' );
175 for( final String species_name : all_species_names ) {
180 out_table.write( '\t' );
182 if ( species_to_features_map.get( species_name ).contains( das ) ) {
183 out_table.write( '1' );
186 out_table.write( '0' );
189 out_table.write( ForesterUtil.LINE_SEPARATOR );
195 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to : " + outfile );
196 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table );
197 if ( write_protein_files ) {
198 final String protdirname;
201 if ( use_domain_architectures ) {
203 b = "domain architectures (DAs)";
204 protdirname = "_DAS";
209 protdirname = "_DOMAINS";
211 final File prot_dir = new File( outfile_base + protdirname );
212 final boolean success = prot_dir.mkdir();
214 throw new IOException( "failed to create dir " + prot_dir );
217 final String dir = outfile_base + protdirname + "/";
218 for( final String feat : all_features ) {
219 final File extract_outfile = new File( dir + feat + a + surfacing.SEQ_EXTRACT_SUFFIX );
220 SurfacingUtil.checkForOutputFileWriteability( extract_outfile );
221 final Writer proteins_file_writer = new BufferedWriter( new FileWriter( extract_outfile ) );
222 final int counter = extractProteinFeatures( use_domain_architectures,
223 protein_lists_per_species,
225 proteins_file_writer,
229 ForesterUtil.printWarningMessage( "surfacing", feat + " not present (in " + b + " extraction)" );
232 proteins_file_writer.close();
234 ForesterUtil.programMessage( "surfacing",
235 "Wrote " + total + " individual " + b + " from a total of "
236 + all_features.size() + " into: " + dir );
240 public final static void calcNEW( final boolean use_domain_architectures,
243 final SortedMap<Species, List<Protein>> protein_lists_per_species,
244 final String separator,
245 final double ie_cutoff,
246 final String outfile_base,
247 final boolean write_protein_files )
249 final SortedMap<String, SortedSet<String>> species_to_features_map = new TreeMap<String, SortedSet<String>>();
250 if ( protein_lists_per_species == null || tre == null ) {
251 throw new IllegalArgumentException( "argument is null" );
253 if ( protein_lists_per_species.size() < 2 ) {
254 throw new IllegalArgumentException( "not enough genomes" );
257 if ( use_domain_architectures ) {
263 final File outfile = new File( outfile_base + "_minimal_" + x + "ome.tsv" );
264 final File outfile_table = new File( outfile_base + "_minimal_" + x + "ome_matrix.tsv" );
265 SurfacingUtil.checkForOutputFileWriteability( outfile );
266 SurfacingUtil.checkForOutputFileWriteability( outfile_table );
267 final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
268 final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) );
269 out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#" + x + "\t" + x + "" );
270 out.write( ForesterUtil.LINE_SEPARATOR );
273 SortedMap<String, List<Protein>> protein_lists_per_quasi_species = null;
275 protein_lists_per_quasi_species = makeProteinListsPerQuasiSpecies( tre, level, protein_lists_per_species );
279 for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
280 final PhylogenyNode node = iter.next();
281 final String species_name = node.getNodeData().isHasTaxonomy()
282 ? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
283 final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName()
285 final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode()
287 final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : "";
288 out.write( species_name );
289 if ( !ForesterUtil.isEmpty( common ) ) {
290 out.write( "\t" + common );
295 if ( !ForesterUtil.isEmpty( tcode ) ) {
296 out.write( "\t" + tcode );
301 if ( !ForesterUtil.isEmpty( rank ) ) {
302 out.write( "\t" + rank );
307 final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
308 if ( node.isInternal() ) {
309 out.write( "\t" + external_descs.size() + "\t" );
314 final List<Set<String>> features_per_genome_list = new ArrayList<Set<String>>();
315 boolean first = true;
316 for( final PhylogenyNode external_desc : external_descs ) {
317 final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
318 if ( node.isInternal() ) {
327 final List<Protein> proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) );
328 final int node_level = PhylogenyMethods.calculateLevel( node );
329 final List<Protein> proteins_per_quasi_species = protein_lists_per_species
330 .get( new BasicSpecies( code ) );
331 if ( proteins_per_species != null ) {
332 final SortedSet<String> features_per_genome = new TreeSet<String>();
333 for( final Protein protein : proteins_per_species ) {
334 if ( use_domain_architectures ) {
335 final String da = protein.toDomainArchitectureString( separator, ie_cutoff );
336 features_per_genome.add( da );
339 List<Domain> domains = protein.getProteinDomains();
340 for( final Domain domain : domains ) {
341 if ( ( ie_cutoff <= -1 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
342 features_per_genome.add( domain.getDomainId() );
347 if ( features_per_genome.size() > 0 ) {
348 features_per_genome_list.add( features_per_genome );
351 } // for( final PhylogenyNode external_desc : external_descs )
352 if ( features_per_genome_list.size() > 0 ) {
353 SortedSet<String> intersection = calcIntersection( features_per_genome_list );
354 out.write( "\t" + intersection.size() + "\t" );
356 for( final String s : intersection ) {
365 out.write( ForesterUtil.LINE_SEPARATOR );
366 species_to_features_map.put( species_name, intersection );
369 final SortedSet<String> all_species_names = new TreeSet<String>();
370 final SortedSet<String> all_features = new TreeSet<String>();
371 for( final Entry<String, SortedSet<String>> e : species_to_features_map.entrySet() ) {
372 all_species_names.add( e.getKey() );
373 for( final String f : e.getValue() ) {
374 all_features.add( f );
377 out_table.write( '\t' );
378 boolean first = true;
379 for( final String species_name : all_species_names ) {
384 out_table.write( '\t' );
386 out_table.write( species_name );
388 out_table.write( ForesterUtil.LINE_SEPARATOR );
389 for( final String das : all_features ) {
390 out_table.write( das );
391 out_table.write( '\t' );
393 for( final String species_name : all_species_names ) {
398 out_table.write( '\t' );
400 if ( species_to_features_map.get( species_name ).contains( das ) ) {
401 out_table.write( '1' );
404 out_table.write( '0' );
407 out_table.write( ForesterUtil.LINE_SEPARATOR );
413 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to : " + outfile );
414 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table );
415 if ( write_protein_files ) {
416 final String protdirname;
419 if ( use_domain_architectures ) {
421 b = "domain architectures (DAs)";
422 protdirname = "_DAS";
427 protdirname = "_DOMAINS";
429 final File prot_dir = new File( outfile_base + protdirname );
430 final boolean success = prot_dir.mkdir();
432 throw new IOException( "failed to create dir " + prot_dir );
435 final String dir = outfile_base + protdirname + "/";
436 for( final String feat : all_features ) {
437 final File extract_outfile = new File( dir + feat + a + surfacing.SEQ_EXTRACT_SUFFIX );
438 SurfacingUtil.checkForOutputFileWriteability( extract_outfile );
439 final Writer proteins_file_writer = new BufferedWriter( new FileWriter( extract_outfile ) );
440 final int counter = extractProteinFeatures( use_domain_architectures,
441 protein_lists_per_species,
443 proteins_file_writer,
447 ForesterUtil.printWarningMessage( "surfacing", feat + " not present (in " + b + " extraction)" );
450 proteins_file_writer.close();
452 ForesterUtil.programMessage( "surfacing",
453 "Wrote " + total + " individual " + b + " from a total of "
454 + all_features.size() + " into: " + dir );
458 private final static SortedMap<String, List<Protein>> makeProteinListsPerQuasiSpecies( final Phylogeny tre,
460 final SortedMap<Species, List<Protein>> protein_lists_per_species ) {
461 final SortedMap<String, List<Protein>> protein_lists_per_quasi_species = new TreeMap<String, List<Protein>>();
462 for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
463 final PhylogenyNode node = iter.next();
464 final int node_level = PhylogenyMethods.calculateLevel( node );
465 if ( node_level == level ) {
466 final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
467 final List<Protein> protein_list_per_quasi_species = new ArrayList<Protein>();
468 for( final PhylogenyNode external_desc : external_descs ) {
469 final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
470 final List<Protein> proteins_per_species = protein_lists_per_species
471 .get( new BasicSpecies( code ) );
472 for( Protein protein : proteins_per_species ) {
473 protein_list_per_quasi_species.add( protein );
476 final String species_name = node.getNodeData().isHasTaxonomy()
477 ? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
478 protein_lists_per_quasi_species.put( species_name, protein_list_per_quasi_species );
481 return protein_lists_per_quasi_species;
484 private final static SortedSet<String> calcIntersection( final List<Set<String>> features_per_genome_list ) {
485 final Set<String> first = features_per_genome_list.get( 0 );
486 final SortedSet<String> my_first = new TreeSet<String>();
487 for( final String s : first ) {
490 for( int i = 1; i < features_per_genome_list.size(); ++i ) {
491 my_first.retainAll( features_per_genome_list.get( i ) );
496 private final static int extractProteinFeatures( final boolean use_domain_architectures,
497 final SortedMap<Species, List<Protein>> protein_lists_per_species,
498 final String domain_id,
500 final double ie_cutoff,
501 final String domain_separator )
504 final String separator_for_output = "\t";
505 for( final Species species : protein_lists_per_species.keySet() ) {
506 final List<Protein> proteins_per_species = protein_lists_per_species.get( species );
507 for( final Protein protein : proteins_per_species ) {
508 if ( use_domain_architectures ) {
509 if ( domain_id.equals( protein.toDomainArchitectureString( domain_separator, ie_cutoff ) ) ) {
510 int from = Integer.MAX_VALUE;
512 for( final Domain d : protein.getProteinDomains() ) {
513 if ( ( ie_cutoff <= -1 ) || ( d.getPerDomainEvalue() <= ie_cutoff ) ) {
514 if ( d.getFrom() < from ) {
517 if ( d.getTo() > to ) {
522 out.write( protein.getSpecies().getSpeciesId() );
523 out.write( separator_for_output );
524 out.write( protein.getProteinId().getId() );
525 out.write( separator_for_output );
526 out.write( domain_id );
527 out.write( separator_for_output );
529 out.write( from + "-" + to );
531 out.write( SurfacingConstants.NL );
536 final List<Domain> domains = protein.getProteinDomains( domain_id );
537 if ( domains.size() > 0 ) {
538 out.write( protein.getSpecies().getSpeciesId() );
539 out.write( separator_for_output );
540 out.write( protein.getProteinId().getId() );
541 out.write( separator_for_output );
542 out.write( domain_id );
543 out.write( separator_for_output );
544 for( final Domain domain : domains ) {
545 if ( ( ie_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
547 out.write( domain.getFrom() + "-" + domain.getTo() );
551 out.write( separator_for_output );
552 final List<Domain> domain_list = new ArrayList<Domain>();
553 for( final Domain domain : protein.getProteinDomains() ) {
554 if ( ( ie_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
555 domain_list.add( domain );
558 final Domain domain_ary[] = new Domain[ domain_list.size() ];
559 for( int i = 0; i < domain_list.size(); ++i ) {
560 domain_ary[ i ] = domain_list.get( i );
562 Arrays.sort( domain_ary, new DomainComparator( true ) );
564 boolean first = true;
565 for( final Domain domain : domain_ary ) {
572 out.write( domain.getDomainId().toString() );
573 out.write( ":" + domain.getFrom() + "-" + domain.getTo() );
574 out.write( ":" + domain.getPerDomainEvalue() );
577 if ( !( ForesterUtil.isEmpty( protein.getDescription() )
578 || protein.getDescription().equals( SurfacingConstants.NONE ) ) ) {
579 out.write( protein.getDescription() );
581 out.write( separator_for_output );
582 if ( !( ForesterUtil.isEmpty( protein.getAccession() )
583 || protein.getAccession().equals( SurfacingConstants.NONE ) ) ) {
584 out.write( protein.getAccession() );
586 out.write( SurfacingConstants.NL );
596 public static void main( final String[] args ) {
597 Set<String> a = new HashSet<String>();
598 Set<String> b = new HashSet<String>();
599 Set<String> c = new HashSet<String>();
600 Set<String> d = new HashSet<String>();
615 List<Set<String>> domains_per_genome_list = new ArrayList<Set<String>>();
616 domains_per_genome_list.add( a );
617 domains_per_genome_list.add( b );
618 domains_per_genome_list.add( c );
619 domains_per_genome_list.add( d );
620 Set<String> x = calcIntersection( domains_per_genome_list );
621 System.out.println( x );