package org.forester.surfacing;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import org.forester.application.surfacing;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
import org.forester.protein.Domain;
import org.forester.protein.Protein;
import org.forester.species.BasicSpecies;
import org.forester.species.Species;
import org.forester.util.ForesterUtil;

30 public final class MinimalDomainomeCalculator {
32 static final public void calcDomainome( final Phylogeny tre,
33 final SortedMap<Species, List<Protein>> protein_lists_per_species,
34 final double ie_cutoff ) {
35 if ( protein_lists_per_species == null || tre == null ) {
36 throw new IllegalArgumentException( "argument is null" );
38 if ( protein_lists_per_species.size() < 2 ) {
39 throw new IllegalArgumentException( "not enough genomes" );
41 for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
42 final PhylogenyNode node = iter.next();
43 if ( node.isInternal() ) {
45 if ( node.getNodeData().isHasTaxonomy() ) {
46 System.out.println( node.getNodeData().getTaxonomy().getScientificName() + ":" );
49 System.out.println( node.getName() + ":" );
51 final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
52 final List<Set<String>> domains_per_genome_list = new ArrayList<Set<String>>();
53 for( final PhylogenyNode external_desc : external_descs ) {
54 final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
55 System.out.print( code + " " );
56 final List<Protein> proteins_per_species = protein_lists_per_species
57 .get( new BasicSpecies( code ) );
58 if ( proteins_per_species != null ) {
59 final SortedSet<String> domains_per_genome = new TreeSet<String>();
60 for( final Protein protein : proteins_per_species ) {
61 List<Domain> domains = protein.getProteinDomains();
62 for( final Domain domain : domains ) {
63 if ( ( domain.getPerDomainEvalue() <= ie_cutoff ) || ( ie_cutoff <= -1 ) ) {
64 domains_per_genome.add( domain.getDomainId() );
68 if ( domains_per_genome.size() > 0 ) {
69 domains_per_genome_list.add( domains_per_genome );
74 if ( domains_per_genome_list.size() > 0 ) {
75 Set<String> intersection = calcIntersection( domains_per_genome_list );
76 System.out.println( intersection );
82 static final public void calcOme( final boolean use_domain_architectures,
84 final SortedMap<Species, List<Protein>> protein_lists_per_species,
85 final String separator,
86 final double ie_cutoff,
87 final String outfile_base )
89 final SortedMap<String, SortedSet<String>> species_to_das_map = new TreeMap<String, SortedSet<String>>();
90 if ( protein_lists_per_species == null || tre == null ) {
91 throw new IllegalArgumentException( "argument is null" );
93 if ( protein_lists_per_species.size() < 2 ) {
94 throw new IllegalArgumentException( "not enough genomes" );
97 if ( use_domain_architectures ) {
103 final File outfile = new File( outfile_base + "_minimal_" + x + "ome.tsv" );
104 final File outfile_table = new File( outfile_base + "_minimal_" + x + "ome_matrix.tsv" );
105 SurfacingUtil.checkForOutputFileWriteability( outfile );
106 SurfacingUtil.checkForOutputFileWriteability( outfile_table );
107 final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
108 final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) );
109 out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#DA\tDA" );
110 out.write( ForesterUtil.LINE_SEPARATOR );
111 for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
112 final PhylogenyNode node = iter.next();
113 final String species_name = node.getNodeData().isHasTaxonomy()
114 ? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
115 final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName()
117 final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode()
119 final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : "";
120 out.write( species_name );
121 if ( !ForesterUtil.isEmpty( common ) ) {
122 out.write( "\t" + common );
127 if ( !ForesterUtil.isEmpty( tcode ) ) {
128 out.write( "\t" + tcode );
133 if ( !ForesterUtil.isEmpty( rank ) ) {
134 out.write( "\t" + rank );
139 final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
140 if ( node.isInternal() ) {
141 out.write( "\t" + external_descs.size() + "\t" );
146 final List<Set<String>> das_per_genome_list = new ArrayList<Set<String>>();
147 boolean first = true;
148 for( final PhylogenyNode external_desc : external_descs ) {
149 final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
150 if ( node.isInternal() ) {
159 final List<Protein> proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) );
160 if ( proteins_per_species != null ) {
161 final SortedSet<String> das_per_genome = new TreeSet<String>();
162 for( final Protein protein : proteins_per_species ) {
163 if ( use_domain_architectures ) {
164 final String da = protein.toDomainArchitectureString( separator, ie_cutoff );
165 das_per_genome.add( da );
168 List<Domain> domains = protein.getProteinDomains();
169 for( final Domain domain : domains ) {
170 if ( ( ie_cutoff <= -1 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
171 das_per_genome.add( domain.getDomainId() );
176 if ( das_per_genome.size() > 0 ) {
177 das_per_genome_list.add( das_per_genome );
181 if ( das_per_genome_list.size() > 0 ) {
182 SortedSet<String> intersection = calcIntersection( das_per_genome_list );
183 out.write( "\t" + intersection.size() + "\t" );
185 for( final String s : intersection ) {
194 out.write( ForesterUtil.LINE_SEPARATOR );
195 species_to_das_map.put( species_name, intersection );
198 final SortedSet<String> all_species_names = new TreeSet<String>();
199 final SortedSet<String> all_das = new TreeSet<String>();
200 for( final Entry<String, SortedSet<String>> e : species_to_das_map.entrySet() ) {
201 all_species_names.add( e.getKey() );
202 for( final String das : e.getValue() ) {
206 out_table.write( '\t' );
207 boolean first = true;
208 for( final String species_name : all_species_names ) {
213 out_table.write( '\t' );
215 out_table.write( species_name );
217 out_table.write( ForesterUtil.LINE_SEPARATOR );
218 for( final String das : all_das ) {
219 out_table.write( das );
220 out_table.write( '\t' );
222 for( final String species_name : all_species_names ) {
227 out_table.write( '\t' );
229 if ( species_to_das_map.get( species_name ).contains( das ) ) {
230 out_table.write( '1' );
233 out_table.write( '0' );
236 out_table.write( ForesterUtil.LINE_SEPARATOR );
242 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to : " + outfile );
243 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table );
246 static final public void calcDAome( final Phylogeny tre,
247 final SortedMap<Species, List<Protein>> protein_lists_per_species,
248 final String separator,
249 final double ie_cutoff,
250 final String outfile_base )
252 final SortedMap<String, SortedSet<String>> species_to_das_map = new TreeMap<String, SortedSet<String>>();
253 if ( protein_lists_per_species == null || tre == null ) {
254 throw new IllegalArgumentException( "argument is null" );
256 if ( protein_lists_per_species.size() < 2 ) {
257 throw new IllegalArgumentException( "not enough genomes" );
259 final File outfile = new File( outfile_base + "_minimal_daome.txt" );
260 final File outfile_table = new File( outfile_base + "_minimal_daome.tsv" );
261 SurfacingUtil.checkForOutputFileWriteability( outfile );
262 SurfacingUtil.checkForOutputFileWriteability( outfile_table );
263 final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
264 final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) );
265 out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#DA\tDA" );
266 out.write( ForesterUtil.LINE_SEPARATOR );
267 for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
268 final PhylogenyNode node = iter.next();
269 final String species_name = node.getNodeData().isHasTaxonomy()
270 ? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
271 final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName()
273 final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode()
275 final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : "";
276 out.write( species_name );
277 if ( !ForesterUtil.isEmpty( common ) ) {
278 out.write( "\t" + common );
283 if ( !ForesterUtil.isEmpty( tcode ) ) {
284 out.write( "\t" + tcode );
289 if ( !ForesterUtil.isEmpty( rank ) ) {
290 out.write( "\t" + rank );
295 final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
296 if ( node.isInternal() ) {
297 out.write( "\t" + external_descs.size() + "\t" );
302 final List<Set<String>> das_per_genome_list = new ArrayList<Set<String>>();
303 boolean first = true;
304 for( final PhylogenyNode external_desc : external_descs ) {
305 final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
306 if ( node.isInternal() ) {
315 final List<Protein> proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) );
316 if ( proteins_per_species != null ) {
317 final SortedSet<String> das_per_genome = new TreeSet<String>();
318 for( final Protein protein : proteins_per_species ) {
319 final String da = protein.toDomainArchitectureString( separator, ie_cutoff );
320 das_per_genome.add( da );
322 if ( das_per_genome.size() > 0 ) {
323 das_per_genome_list.add( das_per_genome );
327 if ( das_per_genome_list.size() > 0 ) {
328 SortedSet<String> intersection = calcIntersection( das_per_genome_list );
329 out.write( "\t" + intersection.size() + "\t" );
331 for( final String s : intersection ) {
340 out.write( ForesterUtil.LINE_SEPARATOR );
341 species_to_das_map.put( species_name, intersection );
344 final SortedSet<String> all_species_names = new TreeSet<String>();
345 final SortedSet<String> all_das = new TreeSet<String>();
346 for( final Entry<String, SortedSet<String>> e : species_to_das_map.entrySet() ) {
347 all_species_names.add( e.getKey() );
348 for( final String das : e.getValue() ) {
352 out_table.write( '\t' );
353 boolean first = true;
354 for( final String species_name : all_species_names ) {
359 out_table.write( '\t' );
361 out_table.write( species_name );
363 out_table.write( ForesterUtil.LINE_SEPARATOR );
364 for( final String das : all_das ) {
365 out_table.write( das );
366 out_table.write( '\t' );
368 for( final String species_name : all_species_names ) {
373 out_table.write( '\t' );
375 if ( species_to_das_map.get( species_name ).contains( das ) ) {
376 out_table.write( '1' );
379 out_table.write( '0' );
382 out_table.write( ForesterUtil.LINE_SEPARATOR );
388 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to : " + outfile );
389 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table );
392 private final static SortedSet<String> calcIntersection( final List<Set<String>> features_per_genome_list ) {
393 final Set<String> first = features_per_genome_list.get( 0 );
394 final SortedSet<String> my_first = new TreeSet<String>();
395 for( final String s : first ) {
398 for( int i = 1; i < features_per_genome_list.size(); ++i ) {
399 my_first.retainAll( features_per_genome_list.get( i ) );
404 public static void main( final String[] args ) {
405 Set<String> a = new HashSet<String>();
406 Set<String> b = new HashSet<String>();
407 Set<String> c = new HashSet<String>();
408 Set<String> d = new HashSet<String>();
423 List<Set<String>> domains_per_genome_list = new ArrayList<Set<String>>();
424 domains_per_genome_list.add( a );
425 domains_per_genome_list.add( b );
426 domains_per_genome_list.add( c );
427 domains_per_genome_list.add( d );
428 Set<String> x = calcIntersection( domains_per_genome_list );
429 System.out.println( x );