2 package org.forester.surfacing;
4 import java.io.BufferedWriter;
6 import java.io.FileWriter;
7 import java.io.IOException;
9 import java.util.ArrayList;
10 import java.util.Arrays;
11 import java.util.HashSet;
12 import java.util.List;
13 import java.util.Map.Entry;
15 import java.util.SortedMap;
16 import java.util.SortedSet;
17 import java.util.TreeMap;
18 import java.util.TreeSet;
20 import org.forester.application.surfacing;
21 import org.forester.phylogeny.Phylogeny;
22 import org.forester.phylogeny.PhylogenyNode;
23 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
24 import org.forester.protein.Domain;
25 import org.forester.protein.Protein;
26 import org.forester.species.BasicSpecies;
27 import org.forester.species.Species;
28 import org.forester.surfacing.SurfacingUtil.DomainComparator;
29 import org.forester.util.ForesterUtil;
31 public final class MinimalDomainomeCalculator {
33 public final static void calc( final boolean use_domain_architectures,
35 final SortedMap<Species, List<Protein>> protein_lists_per_species,
36 final String separator,
37 final double ie_cutoff,
38 final String outfile_base,
39 final boolean write_protein_files )
41 final SortedMap<String, SortedSet<String>> species_to_features_map = new TreeMap<String, SortedSet<String>>();
42 if ( protein_lists_per_species == null || tre == null ) {
43 throw new IllegalArgumentException( "argument is null" );
45 if ( protein_lists_per_species.size() < 2 ) {
46 throw new IllegalArgumentException( "not enough genomes" );
49 if ( use_domain_architectures ) {
55 final File outfile = new File( outfile_base + "_minimal_" + x + "ome.tsv" );
56 final File outfile_table = new File( outfile_base + "_minimal_" + x + "ome_matrix.tsv" );
57 SurfacingUtil.checkForOutputFileWriteability( outfile );
58 SurfacingUtil.checkForOutputFileWriteability( outfile_table );
59 final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
60 final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) );
61 out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#" + x + "\t" + x + "" );
62 out.write( ForesterUtil.LINE_SEPARATOR );
63 for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
64 final PhylogenyNode node = iter.next();
65 final String species_name = node.getNodeData().isHasTaxonomy()
66 ? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
67 final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName()
69 final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode()
71 final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : "";
72 out.write( species_name );
73 if ( !ForesterUtil.isEmpty( common ) ) {
74 out.write( "\t" + common );
79 if ( !ForesterUtil.isEmpty( tcode ) ) {
80 out.write( "\t" + tcode );
85 if ( !ForesterUtil.isEmpty( rank ) ) {
86 out.write( "\t" + rank );
91 final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
92 if ( node.isInternal() ) {
93 out.write( "\t" + external_descs.size() + "\t" );
98 final List<Set<String>> features_per_genome_list = new ArrayList<Set<String>>();
100 for( final PhylogenyNode external_desc : external_descs ) {
101 final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
102 if ( node.isInternal() ) {
111 final List<Protein> proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) );
112 if ( proteins_per_species != null ) {
113 final SortedSet<String> features_per_genome = new TreeSet<String>();
114 for( final Protein protein : proteins_per_species ) {
115 if ( use_domain_architectures ) {
116 final String da = protein.toDomainArchitectureString( separator, ie_cutoff );
117 features_per_genome.add( da );
120 List<Domain> domains = protein.getProteinDomains();
121 for( final Domain domain : domains ) {
122 if ( ( ie_cutoff <= -1 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
123 features_per_genome.add( domain.getDomainId() );
128 if ( features_per_genome.size() > 0 ) {
129 features_per_genome_list.add( features_per_genome );
133 if ( features_per_genome_list.size() > 0 ) {
134 SortedSet<String> intersection = calcIntersection( features_per_genome_list );
135 out.write( "\t" + intersection.size() + "\t" );
137 for( final String s : intersection ) {
146 out.write( ForesterUtil.LINE_SEPARATOR );
147 species_to_features_map.put( species_name, intersection );
150 final SortedSet<String> all_species_names = new TreeSet<String>();
151 final SortedSet<String> all_features = new TreeSet<String>();
152 for( final Entry<String, SortedSet<String>> e : species_to_features_map.entrySet() ) {
153 all_species_names.add( e.getKey() );
154 for( final String f : e.getValue() ) {
155 all_features.add( f );
158 out_table.write( '\t' );
159 boolean first = true;
160 for( final String species_name : all_species_names ) {
165 out_table.write( '\t' );
167 out_table.write( species_name );
169 out_table.write( ForesterUtil.LINE_SEPARATOR );
170 for( final String das : all_features ) {
171 out_table.write( das );
172 out_table.write( '\t' );
174 for( final String species_name : all_species_names ) {
179 out_table.write( '\t' );
181 if ( species_to_features_map.get( species_name ).contains( das ) ) {
182 out_table.write( '1' );
185 out_table.write( '0' );
188 out_table.write( ForesterUtil.LINE_SEPARATOR );
194 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to : " + outfile );
195 ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table );
196 if ( write_protein_files ) {
197 final String protdirname;
200 if ( use_domain_architectures ) {
202 b = "domain architectures (DAs)";
203 protdirname = "_DAS";
208 protdirname = "_DOMAINS";
210 final File prot_dir = new File( outfile_base + protdirname );
211 final boolean success = prot_dir.mkdir();
213 throw new IOException( "failed to create dir " + prot_dir );
216 final String dir = outfile_base + protdirname + "/";
217 for( final String feat : all_features ) {
218 final File extract_outfile = new File( dir + feat + a + surfacing.SEQ_EXTRACT_SUFFIX );
219 SurfacingUtil.checkForOutputFileWriteability( extract_outfile );
220 final Writer proteins_file_writer = new BufferedWriter( new FileWriter( extract_outfile ) );
221 final int counter = extractProteinFeatures( use_domain_architectures,
222 protein_lists_per_species,
224 proteins_file_writer,
228 ForesterUtil.printWarningMessage( "surfacing", feat + " not present (in " + b + " extraction)" );
231 proteins_file_writer.close();
233 ForesterUtil.programMessage( "surfacing",
234 "Wrote " + total + " individual " + b + " from a total of "
235 + all_features.size() + " into: " + dir );
239 private final static SortedSet<String> calcIntersection( final List<Set<String>> features_per_genome_list ) {
240 final Set<String> first = features_per_genome_list.get( 0 );
241 final SortedSet<String> my_first = new TreeSet<String>();
242 for( final String s : first ) {
245 for( int i = 1; i < features_per_genome_list.size(); ++i ) {
246 my_first.retainAll( features_per_genome_list.get( i ) );
251 private final static int extractProteinFeatures( final boolean use_domain_architectures,
252 final SortedMap<Species, List<Protein>> protein_lists_per_species,
253 final String domain_id,
255 final double ie_cutoff,
256 final String domain_separator )
259 final String separator_for_output = "\t";
260 for( final Species species : protein_lists_per_species.keySet() ) {
261 final List<Protein> proteins_per_species = protein_lists_per_species.get( species );
262 for( final Protein protein : proteins_per_species ) {
263 if ( use_domain_architectures ) {
264 if ( domain_id.equals( protein.toDomainArchitectureString( domain_separator, ie_cutoff ) ) ) {
265 int from = Integer.MAX_VALUE;
267 for( final Domain d : protein.getProteinDomains() ) {
268 if ( ( ie_cutoff <= -1 ) || ( d.getPerDomainEvalue() <= ie_cutoff ) ) {
269 if ( d.getFrom() < from ) {
272 if ( d.getTo() > to ) {
277 out.write( protein.getSpecies().getSpeciesId() );
278 out.write( separator_for_output );
279 out.write( protein.getProteinId().getId() );
280 out.write( separator_for_output );
281 out.write( domain_id );
282 out.write( separator_for_output );
284 out.write( from + "-" + to );
286 out.write( SurfacingConstants.NL );
291 final List<Domain> domains = protein.getProteinDomains( domain_id );
292 if ( domains.size() > 0 ) {
293 out.write( protein.getSpecies().getSpeciesId() );
294 out.write( separator_for_output );
295 out.write( protein.getProteinId().getId() );
296 out.write( separator_for_output );
297 out.write( domain_id );
298 out.write( separator_for_output );
299 for( final Domain domain : domains ) {
300 if ( ( ie_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
302 out.write( domain.getFrom() + "-" + domain.getTo() );
306 out.write( separator_for_output );
307 final List<Domain> domain_list = new ArrayList<Domain>();
308 for( final Domain domain : protein.getProteinDomains() ) {
309 if ( ( ie_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
310 domain_list.add( domain );
313 final Domain domain_ary[] = new Domain[ domain_list.size() ];
314 for( int i = 0; i < domain_list.size(); ++i ) {
315 domain_ary[ i ] = domain_list.get( i );
317 Arrays.sort( domain_ary, new DomainComparator( true ) );
319 boolean first = true;
320 for( final Domain domain : domain_ary ) {
327 out.write( domain.getDomainId().toString() );
328 out.write( ":" + domain.getFrom() + "-" + domain.getTo() );
329 out.write( ":" + domain.getPerDomainEvalue() );
332 if ( !( ForesterUtil.isEmpty( protein.getDescription() )
333 || protein.getDescription().equals( SurfacingConstants.NONE ) ) ) {
334 out.write( protein.getDescription() );
336 out.write( separator_for_output );
337 if ( !( ForesterUtil.isEmpty( protein.getAccession() )
338 || protein.getAccession().equals( SurfacingConstants.NONE ) ) ) {
339 out.write( protein.getAccession() );
341 out.write( SurfacingConstants.NL );
351 public static void main( final String[] args ) {
352 Set<String> a = new HashSet<String>();
353 Set<String> b = new HashSet<String>();
354 Set<String> c = new HashSet<String>();
355 Set<String> d = new HashSet<String>();
370 List<Set<String>> domains_per_genome_list = new ArrayList<Set<String>>();
371 domains_per_genome_list.add( a );
372 domains_per_genome_list.add( b );
373 domains_per_genome_list.add( c );
374 domains_per_genome_list.add( d );
375 Set<String> x = calcIntersection( domains_per_genome_list );
376 System.out.println( x );