2 package org.forester.surfacing;
4 import java.io.BufferedWriter;
6 import java.io.FileWriter;
7 import java.io.IOException;
9 import java.util.ArrayList;
10 import java.util.Arrays;
11 import java.util.HashSet;
12 import java.util.List;
13 import java.util.Map.Entry;
15 import java.util.SortedMap;
16 import java.util.SortedSet;
17 import java.util.TreeMap;
18 import java.util.TreeSet;
20 import org.forester.application.surfacing;
21 import org.forester.phylogeny.Phylogeny;
22 import org.forester.phylogeny.PhylogenyMethods;
23 import org.forester.phylogeny.PhylogenyNode;
24 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
25 import org.forester.protein.Domain;
26 import org.forester.protein.Protein;
27 import org.forester.species.BasicSpecies;
28 import org.forester.species.Species;
29 import org.forester.surfacing.SurfacingUtil.DomainComparator;
30 import org.forester.util.ForesterUtil;
32 public final class MinimalDomainomeCalculator {
34 public final static void calc( final boolean use_domain_architectures,
36 final int target_level,
37 final SortedMap<Species, List<Protein>> protein_lists_per_species,
38 final String separator,
39 final double ie_cutoff,
40 final String outfile_base,
41 final boolean write_protein_files )
43 final SortedMap<String, SortedSet<String>> species_to_features_map = new TreeMap<String, SortedSet<String>>();
44 if ( protein_lists_per_species == null || tre == null ) {
45 throw new IllegalArgumentException( "argument is null" );
47 if ( protein_lists_per_species.size() < 2 ) {
48 throw new IllegalArgumentException( "not enough genomes" );
51 if ( use_domain_architectures ) {
57 final File outfile = new File( outfile_base + "_minimal_" + x + "ome.tsv" );
58 final File outfile_table = new File( outfile_base + "_minimal_" + x + "ome_matrix.tsv" );
59 SurfacingUtil.checkForOutputFileWriteability( outfile );
60 SurfacingUtil.checkForOutputFileWriteability( outfile_table );
61 final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
62 final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) );
63 out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#" + x + "\t" + x + "" );
64 out.write( ForesterUtil.LINE_SEPARATOR );
67 SortedMap<String, List<Protein>> protein_lists_per_quasi_species = null;
68 if ( target_level >= 1 ) {
69 protein_lists_per_quasi_species = makeProteinListsPerQuasiSpecies( tre,
71 protein_lists_per_species );
76 for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
77 final PhylogenyNode node = iter.next();
78 final int node_level = PhylogenyMethods.calculateLevel( node );
79 final String species_name = node.getNodeData().isHasTaxonomy()
80 ? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
81 final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName()
83 final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode()
85 final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : "";
86 final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
87 if ( ( target_level < 1 ) || ( node_level >= target_level ) ) {
88 out.write( species_name + " " + node_level);
89 if ( !ForesterUtil.isEmpty( common ) ) {
90 out.write( "\t" + common );
95 if ( !ForesterUtil.isEmpty( tcode ) ) {
96 out.write( "\t" + tcode );
101 if ( !ForesterUtil.isEmpty( rank ) ) {
102 out.write( "\t" + rank );
107 if ( node.isInternal() ) {
108 out.write( "\t" + external_descs.size() + "\t" );
114 final List<Set<String>> features_per_genome_list = new ArrayList<Set<String>>();
115 boolean first = true;
116 if ( target_level >= 1 ) {
119 if ( node_level >= target_level ) {
120 final List<PhylogenyNode> given_level_descs = PhylogenyMethods
121 .getAllDescendantsOfGivenLevel( node, target_level );
122 for( final PhylogenyNode given_level_desc : given_level_descs ) {
123 final String spec_name = given_level_desc.getNodeData().isHasTaxonomy()
124 ? given_level_desc.getNodeData().getTaxonomy().getScientificName()
125 : given_level_desc.getName();
126 if ( node.isInternal() ) {
133 out.write( "sp_n=" + spec_name );
135 final List<Protein> proteins_per_species = protein_lists_per_quasi_species.get( spec_name );
136 if ( proteins_per_species != null ) {
137 final SortedSet<String> features_per_genome = new TreeSet<String>();
138 for( final Protein protein : proteins_per_species ) {
139 if ( use_domain_architectures ) {
140 final String da = protein.toDomainArchitectureString( separator, ie_cutoff );
141 features_per_genome.add( da );
144 List<Domain> domains = protein.getProteinDomains();
145 for( final Domain domain : domains ) {
146 if ( ( ie_cutoff <= -1 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
147 features_per_genome.add( domain.getDomainId() );
152 System.out.println( ">>>>>>>>>>>>>> features_per_genome.size()=" + features_per_genome.size() );
153 if ( features_per_genome.size() > 0 ) {
154 features_per_genome_list.add( features_per_genome );
157 System.out.println( "error!" );
162 System.out.println( "error!" );
171 for( final PhylogenyNode external_desc : external_descs ) {
172 final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
173 if ( node.isInternal() ) {
182 final List<Protein> proteins_per_species = protein_lists_per_species
183 .get( new BasicSpecies( code ) );
184 if ( proteins_per_species != null ) {
185 final SortedSet<String> features_per_genome = new TreeSet<String>();
186 for( final Protein protein : proteins_per_species ) {
187 if ( use_domain_architectures ) {
188 final String da = protein.toDomainArchitectureString( separator, ie_cutoff );
189 features_per_genome.add( da );
192 List<Domain> domains = protein.getProteinDomains();
193 for( final Domain domain : domains ) {
194 if ( ( ie_cutoff <= -1 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
195 features_per_genome.add( domain.getDomainId() );
200 if ( features_per_genome.size() > 0 ) {
201 features_per_genome_list.add( features_per_genome );
204 } // for( final PhylogenyNode external_desc : external_descs )
206 if ( features_per_genome_list.size() > 0 ) {
207 SortedSet<String> intersection = calcIntersection( features_per_genome_list );
208 out.write( "\t" + intersection.size() + "\t" );
210 for( final String s : intersection ) {
219 out.write( ForesterUtil.LINE_SEPARATOR );
220 species_to_features_map.put( species_name, intersection );
223 final SortedSet<String> all_species_names = new TreeSet<String>();
224 final SortedSet<String> all_features = new TreeSet<String>();
225 for( final Entry<String, SortedSet<String>> e : species_to_features_map.entrySet() ) {
226 all_species_names.add( e.getKey() );
227 for( final String f : e.getValue() ) {
228 all_features.add( f );
231 out_table.write( '\t' );
232 boolean first = true;
233 for( final String species_name : all_species_names ) {
238 out_table.write( '\t' );
240 out_table.write( species_name );
242 out_table.write( ForesterUtil.LINE_SEPARATOR );
243 for( final String das : all_features ) {
244 out_table.write( das );
245 out_table.write( '\t' );
247 for( final String species_name : all_species_names ) {
252 out_table.write( '\t' );
254 if ( species_to_features_map.get( species_name ).contains( das ) ) {
255 out_table.write( '1' );
258 out_table.write( '0' );
261 out_table.write( ForesterUtil.LINE_SEPARATOR );
267 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to : " + outfile );
268 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table );
269 if ( write_protein_files ) {
270 final String protdirname;
273 if ( use_domain_architectures ) {
275 b = "domain architectures (DAs)";
276 protdirname = "_DAS";
281 protdirname = "_DOMAINS";
283 final File prot_dir = new File( outfile_base + protdirname );
284 final boolean success = prot_dir.mkdir();
286 throw new IOException( "failed to create dir " + prot_dir );
289 final String dir = outfile_base + protdirname + "/";
290 for( final String feat : all_features ) {
291 final File extract_outfile = new File( dir + feat + a + surfacing.SEQ_EXTRACT_SUFFIX );
292 SurfacingUtil.checkForOutputFileWriteability( extract_outfile );
293 final Writer proteins_file_writer = new BufferedWriter( new FileWriter( extract_outfile ) );
294 final int counter = extractProteinFeatures( use_domain_architectures,
295 protein_lists_per_species,
297 proteins_file_writer,
301 ForesterUtil.printWarningMessage( "surfacing", feat + " not present (in " + b + " extraction)" );
304 proteins_file_writer.close();
306 ForesterUtil.programMessage( "surfacing",
307 "Wrote " + total + " individual " + b + " from a total of "
308 + all_features.size() + " into: " + dir );
312 private final static SortedMap<String, List<Protein>> makeProteinListsPerQuasiSpecies( final Phylogeny tre,
314 final SortedMap<Species, List<Protein>> protein_lists_per_species ) {
315 final SortedMap<String, List<Protein>> protein_lists_per_quasi_species = new TreeMap<String, List<Protein>>();
316 System.out.println( "---------------------------------" );
317 System.out.println( "level=" + level );
318 for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
319 final PhylogenyNode node = iter.next();
320 final int node_level = PhylogenyMethods.calculateLevel( node );
321 if ( node_level == level ) {
322 System.out.println( "level=" + level );
323 final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
324 final List<Protein> protein_list_per_quasi_species = new ArrayList<Protein>();
325 for( final PhylogenyNode external_desc : external_descs ) {
326 final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
327 final List<Protein> proteins_per_species = protein_lists_per_species
328 .get( new BasicSpecies( code ) );
329 //System.out.println( code );
330 for( Protein protein : proteins_per_species ) {
331 protein_list_per_quasi_species.add( protein );
334 final String species_name = node.getNodeData().isHasTaxonomy()
335 ? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
336 System.out.println( "species_name=" + species_name );
337 protein_lists_per_quasi_species.put( species_name, protein_list_per_quasi_species );
338 System.out.println( ">>>>" + protein_list_per_quasi_species.size() );
342 return protein_lists_per_quasi_species;
345 private final static SortedSet<String> calcIntersection( final List<Set<String>> features_per_genome_list ) {
346 final Set<String> first = features_per_genome_list.get( 0 );
347 final SortedSet<String> my_first = new TreeSet<String>();
348 for( final String s : first ) {
351 for( int i = 1; i < features_per_genome_list.size(); ++i ) {
352 my_first.retainAll( features_per_genome_list.get( i ) );
357 private final static int extractProteinFeatures( final boolean use_domain_architectures,
358 final SortedMap<Species, List<Protein>> protein_lists_per_species,
359 final String domain_id,
361 final double ie_cutoff,
362 final String domain_separator )
365 final String separator_for_output = "\t";
366 for( final Species species : protein_lists_per_species.keySet() ) {
367 final List<Protein> proteins_per_species = protein_lists_per_species.get( species );
368 for( final Protein protein : proteins_per_species ) {
369 if ( use_domain_architectures ) {
370 if ( domain_id.equals( protein.toDomainArchitectureString( domain_separator, ie_cutoff ) ) ) {
371 int from = Integer.MAX_VALUE;
373 for( final Domain d : protein.getProteinDomains() ) {
374 if ( ( ie_cutoff <= -1 ) || ( d.getPerDomainEvalue() <= ie_cutoff ) ) {
375 if ( d.getFrom() < from ) {
378 if ( d.getTo() > to ) {
383 out.write( protein.getSpecies().getSpeciesId() );
384 out.write( separator_for_output );
385 out.write( protein.getProteinId().getId() );
386 out.write( separator_for_output );
387 out.write( domain_id );
388 out.write( separator_for_output );
390 out.write( from + "-" + to );
392 out.write( SurfacingConstants.NL );
397 final List<Domain> domains = protein.getProteinDomains( domain_id );
398 if ( domains.size() > 0 ) {
399 out.write( protein.getSpecies().getSpeciesId() );
400 out.write( separator_for_output );
401 out.write( protein.getProteinId().getId() );
402 out.write( separator_for_output );
403 out.write( domain_id );
404 out.write( separator_for_output );
405 for( final Domain domain : domains ) {
406 if ( ( ie_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
408 out.write( domain.getFrom() + "-" + domain.getTo() );
412 out.write( separator_for_output );
413 final List<Domain> domain_list = new ArrayList<Domain>();
414 for( final Domain domain : protein.getProteinDomains() ) {
415 if ( ( ie_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
416 domain_list.add( domain );
419 final Domain domain_ary[] = new Domain[ domain_list.size() ];
420 for( int i = 0; i < domain_list.size(); ++i ) {
421 domain_ary[ i ] = domain_list.get( i );
423 Arrays.sort( domain_ary, new DomainComparator( true ) );
425 boolean first = true;
426 for( final Domain domain : domain_ary ) {
433 out.write( domain.getDomainId().toString() );
434 out.write( ":" + domain.getFrom() + "-" + domain.getTo() );
435 out.write( ":" + domain.getPerDomainEvalue() );
438 if ( !( ForesterUtil.isEmpty( protein.getDescription() )
439 || protein.getDescription().equals( SurfacingConstants.NONE ) ) ) {
440 out.write( protein.getDescription() );
442 out.write( separator_for_output );
443 if ( !( ForesterUtil.isEmpty( protein.getAccession() )
444 || protein.getAccession().equals( SurfacingConstants.NONE ) ) ) {
445 out.write( protein.getAccession() );
447 out.write( SurfacingConstants.NL );
457 public static void main( final String[] args ) {
458 Set<String> a = new HashSet<String>();
459 Set<String> b = new HashSet<String>();
460 Set<String> c = new HashSet<String>();
461 Set<String> d = new HashSet<String>();
476 List<Set<String>> domains_per_genome_list = new ArrayList<Set<String>>();
477 domains_per_genome_list.add( a );
478 domains_per_genome_list.add( b );
479 domains_per_genome_list.add( c );
480 domains_per_genome_list.add( d );
481 Set<String> x = calcIntersection( domains_per_genome_list );
482 System.out.println( x );