import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
+import java.io.Writer;
import java.util.ArrayList;
-import java.util.HashMap;
+import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
-import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import org.forester.protein.Protein;
import org.forester.species.BasicSpecies;
import org.forester.species.Species;
+import org.forester.surfacing.SurfacingUtil.DomainComparator;
import org.forester.util.ForesterUtil;
public final class MinimalDomainomeCalculator {
- static final public void calcDomainome( final Phylogeny tre,
- final SortedMap<Species, List<Protein>> protein_lists_per_species,
- final double ie_cutoff ) {
- if ( protein_lists_per_species == null || tre == null ) {
- throw new IllegalArgumentException( "argument is null" );
- }
- if ( protein_lists_per_species.size() < 2 ) {
- throw new IllegalArgumentException( "not enough genomes" );
- }
- for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
- final PhylogenyNode node = iter.next();
- if ( node.isInternal() ) {
- System.out.println();
- if ( node.getNodeData().isHasTaxonomy() ) {
- System.out.println( node.getNodeData().getTaxonomy().getScientificName() + ":" );
- }
- else {
- System.out.println( node.getName() + ":" );
- }
- final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
- final List<Set<String>> domains_per_genome_list = new ArrayList<Set<String>>();
- for( final PhylogenyNode external_desc : external_descs ) {
- final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
- System.out.print( code + " " );
- final List<Protein> proteins_per_species = protein_lists_per_species
- .get( new BasicSpecies( code ) );
- if ( proteins_per_species != null ) {
- final SortedSet<String> domains_per_genome = new TreeSet<String>();
- for( final Protein protein : proteins_per_species ) {
- List<Domain> domains = protein.getProteinDomains();
- for( final Domain domain : domains ) {
- if ( ( domain.getPerDomainEvalue() <= ie_cutoff ) || ( ie_cutoff <= -1 ) ) {
- domains_per_genome.add( domain.getDomainId() );
- }
- }
- }
- if ( domains_per_genome.size() > 0 ) {
- domains_per_genome_list.add( domains_per_genome );
- }
- }
- }
- System.out.println();
- if ( domains_per_genome_list.size() > 0 ) {
- Set<String> intersection = calcIntersection( domains_per_genome_list );
- System.out.println( intersection );
- }
- }
- }
- }
-
static final public void calcOme( final boolean use_domain_architectures,
final Phylogeny tre,
final SortedMap<Species, List<Protein>> protein_lists_per_species,
final double ie_cutoff,
final String outfile_base )
throws IOException {
- final SortedMap<String, SortedSet<String>> species_to_das_map = new TreeMap<String, SortedSet<String>>();
+ final SortedMap<String, SortedSet<String>> species_to_features_map = new TreeMap<String, SortedSet<String>>();
if ( protein_lists_per_species == null || tre == null ) {
throw new IllegalArgumentException( "argument is null" );
}
SurfacingUtil.checkForOutputFileWriteability( outfile_table );
final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) );
- out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#DA\tDA" );
+ out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#" + x + "\t" + x + "" );
out.write( ForesterUtil.LINE_SEPARATOR );
for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
final PhylogenyNode node = iter.next();
else {
out.write( "\t\t" );
}
- final List<Set<String>> das_per_genome_list = new ArrayList<Set<String>>();
+ final List<Set<String>> features_per_genome_list = new ArrayList<Set<String>>();
boolean first = true;
for( final PhylogenyNode external_desc : external_descs ) {
final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
}
final List<Protein> proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) );
if ( proteins_per_species != null ) {
- final SortedSet<String> das_per_genome = new TreeSet<String>();
+ final SortedSet<String> features_per_genome = new TreeSet<String>();
for( final Protein protein : proteins_per_species ) {
if ( use_domain_architectures ) {
final String da = protein.toDomainArchitectureString( separator, ie_cutoff );
- das_per_genome.add( da );
+ features_per_genome.add( da );
}
else {
List<Domain> domains = protein.getProteinDomains();
for( final Domain domain : domains ) {
if ( ( ie_cutoff <= -1 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
- das_per_genome.add( domain.getDomainId() );
+ features_per_genome.add( domain.getDomainId() );
}
}
}
}
- if ( das_per_genome.size() > 0 ) {
- das_per_genome_list.add( das_per_genome );
+ if ( features_per_genome.size() > 0 ) {
+ features_per_genome_list.add( features_per_genome );
}
}
}
- if ( das_per_genome_list.size() > 0 ) {
- SortedSet<String> intersection = calcIntersection( das_per_genome_list );
+ if ( features_per_genome_list.size() > 0 ) {
+ SortedSet<String> intersection = calcIntersection( features_per_genome_list );
out.write( "\t" + intersection.size() + "\t" );
first = true;
for( final String s : intersection ) {
out.write( s );
}
out.write( ForesterUtil.LINE_SEPARATOR );
- species_to_das_map.put( species_name, intersection );
+ species_to_features_map.put( species_name, intersection );
}
}
final SortedSet<String> all_species_names = new TreeSet<String>();
- final SortedSet<String> all_das = new TreeSet<String>();
- for( final Entry<String, SortedSet<String>> e : species_to_das_map.entrySet() ) {
+ final SortedSet<String> all_features = new TreeSet<String>();
+ for( final Entry<String, SortedSet<String>> e : species_to_features_map.entrySet() ) {
all_species_names.add( e.getKey() );
- for( final String das : e.getValue() ) {
- all_das.add( das );
+ for( final String f : e.getValue() ) {
+ all_features.add( f );
}
}
out_table.write( '\t' );
out_table.write( species_name );
}
out_table.write( ForesterUtil.LINE_SEPARATOR );
- for( final String das : all_das ) {
+ for( final String das : all_features ) {
out_table.write( das );
out_table.write( '\t' );
first = true;
else {
out_table.write( '\t' );
}
- if ( species_to_das_map.get( species_name ).contains( das ) ) {
+ if ( species_to_features_map.get( species_name ).contains( das ) ) {
out_table.write( '1' );
}
else {
out_table.close();
ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to : " + outfile );
ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table );
- }
-
- static final public void calcDAome( final Phylogeny tre,
- final SortedMap<Species, List<Protein>> protein_lists_per_species,
- final String separator,
- final double ie_cutoff,
- final String outfile_base )
- throws IOException {
- final SortedMap<String, SortedSet<String>> species_to_das_map = new TreeMap<String, SortedSet<String>>();
- if ( protein_lists_per_species == null || tre == null ) {
- throw new IllegalArgumentException( "argument is null" );
- }
- if ( protein_lists_per_species.size() < 2 ) {
- throw new IllegalArgumentException( "not enough genomes" );
- }
- final File outfile = new File( outfile_base + "_minimal_daome.txt" );
- final File outfile_table = new File( outfile_base + "_minimal_daome.tsv" );
- SurfacingUtil.checkForOutputFileWriteability( outfile );
- SurfacingUtil.checkForOutputFileWriteability( outfile_table );
- final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
- final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) );
- out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#DA\tDA" );
- out.write( ForesterUtil.LINE_SEPARATOR );
- for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
- final PhylogenyNode node = iter.next();
- final String species_name = node.getNodeData().isHasTaxonomy()
- ? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
- final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName()
- : "";
- final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode()
- : "";
- final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : "";
- out.write( species_name );
- if ( !ForesterUtil.isEmpty( common ) ) {
- out.write( "\t" + common );
- }
- else {
- out.write( "\t" );
- }
- if ( !ForesterUtil.isEmpty( tcode ) ) {
- out.write( "\t" + tcode );
- }
- else {
- out.write( "\t" );
- }
- if ( !ForesterUtil.isEmpty( rank ) ) {
- out.write( "\t" + rank );
- }
- else {
- out.write( "\t" );
- }
- final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
- if ( node.isInternal() ) {
- out.write( "\t" + external_descs.size() + "\t" );
- }
- else {
- out.write( "\t\t" );
- }
- final List<Set<String>> das_per_genome_list = new ArrayList<Set<String>>();
- boolean first = true;
- for( final PhylogenyNode external_desc : external_descs ) {
- final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
- if ( node.isInternal() ) {
- if ( first ) {
- first = false;
- }
- else {
- out.write( ", " );
- }
- out.write( code );
- }
- final List<Protein> proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) );
- if ( proteins_per_species != null ) {
- final SortedSet<String> das_per_genome = new TreeSet<String>();
- for( final Protein protein : proteins_per_species ) {
- final String da = protein.toDomainArchitectureString( separator, ie_cutoff );
- das_per_genome.add( da );
- }
- if ( das_per_genome.size() > 0 ) {
- das_per_genome_list.add( das_per_genome );
- }
- }
- }
- if ( das_per_genome_list.size() > 0 ) {
- SortedSet<String> intersection = calcIntersection( das_per_genome_list );
- out.write( "\t" + intersection.size() + "\t" );
- first = true;
- for( final String s : intersection ) {
- if ( first ) {
- first = false;
- }
- else {
- out.write( ", " );
- }
- out.write( s );
- }
- out.write( ForesterUtil.LINE_SEPARATOR );
- species_to_das_map.put( species_name, intersection );
- }
- }
- final SortedSet<String> all_species_names = new TreeSet<String>();
- final SortedSet<String> all_das = new TreeSet<String>();
- for( final Entry<String, SortedSet<String>> e : species_to_das_map.entrySet() ) {
- all_species_names.add( e.getKey() );
- for( final String das : e.getValue() ) {
- all_das.add( das );
- }
- }
- out_table.write( '\t' );
- boolean first = true;
- for( final String species_name : all_species_names ) {
- if ( first ) {
- first = false;
+ for( String f : all_features ) {
+ final String a;
+ if ( use_domain_architectures ) {
+ a = "DA_";
}
else {
- out_table.write( '\t' );
- }
- out_table.write( species_name );
- }
- out_table.write( ForesterUtil.LINE_SEPARATOR );
- for( final String das : all_das ) {
- out_table.write( das );
- out_table.write( '\t' );
- first = true;
- for( final String species_name : all_species_names ) {
- if ( first ) {
- first = false;
- }
- else {
- out_table.write( '\t' );
- }
- if ( species_to_das_map.get( species_name ).contains( das ) ) {
- out_table.write( '1' );
- }
- else {
- out_table.write( '0' );
- }
+ a = "domain_";
}
- out_table.write( ForesterUtil.LINE_SEPARATOR );
+ final File prot_dir = new File( outfile_base + "_prot" );
+ prot_dir.mkdir();
+ final File outt = new File( outfile_base + "_prot/" + a + f + surfacing.SEQ_EXTRACT_SUFFIX );
+ final Writer proteins_file_writer = new BufferedWriter( new FileWriter( outt ) );
+ extractProteinFeatures( use_domain_architectures,
+ protein_lists_per_species,
+ f,
+ proteins_file_writer,
+ ie_cutoff,
+ separator );
+ proteins_file_writer.close();
}
- out.flush();
- out.close();
- out_table.flush();
- out_table.close();
- ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to : " + outfile );
- ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table );
}
private final static SortedSet<String> calcIntersection( final List<Set<String>> features_per_genome_list ) {
return my_first;
}
+ public static void extractProteinFeatures( final boolean use_domain_architectures,
+ final SortedMap<Species, List<Protein>> protein_lists_per_species,
+ final String domain_id,
+ final Writer out,
+ final double ie_cutoff,
+ final String domain_separator )
+ throws IOException {
+ final String separator_for_output = "\t";
+ for( final Species species : protein_lists_per_species.keySet() ) {
+ final List<Protein> proteins_per_species = protein_lists_per_species.get( species );
+ for( final Protein protein : proteins_per_species ) {
+ if ( use_domain_architectures ) {
+ if ( domain_id.equals( protein.toDomainArchitectureString( domain_separator, ie_cutoff ) ) ) {
+ int from = Integer.MAX_VALUE;
+ int to = -1;
+ for( final Domain d : protein.getProteinDomains() ) {
+ if ( ( ie_cutoff <= -1 ) || ( d.getPerDomainEvalue() <= ie_cutoff ) ) {
+ if ( d.getFrom() < from ) {
+ from = d.getFrom();
+ }
+ if ( d.getTo() > to ) {
+ to = d.getTo();
+ }
+ }
+ }
+ out.write( protein.getSpecies().getSpeciesId() );
+ out.write( separator_for_output );
+ out.write( protein.getProteinId().getId() );
+ out.write( separator_for_output );
+ out.write( domain_id );
+ out.write( separator_for_output );
+ out.write( "/" );
+ out.write( from + "-" + to );
+ out.write( "/" );
+ out.write( SurfacingConstants.NL );
+ }
+ }
+ else {
+ final List<Domain> domains = protein.getProteinDomains( domain_id );
+ if ( domains.size() > 0 ) {
+ out.write( protein.getSpecies().getSpeciesId() );
+ out.write( separator_for_output );
+ out.write( protein.getProteinId().getId() );
+ out.write( separator_for_output );
+ out.write( domain_id );
+ out.write( separator_for_output );
+ for( final Domain domain : domains ) {
+ if ( ( ie_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
+ out.write( "/" );
+ out.write( domain.getFrom() + "-" + domain.getTo() );
+ }
+ }
+ out.write( "/" );
+ out.write( separator_for_output );
+ final List<Domain> domain_list = new ArrayList<Domain>();
+ for( final Domain domain : protein.getProteinDomains() ) {
+ if ( ( ie_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
+ domain_list.add( domain );
+ }
+ }
+ final Domain domain_ary[] = new Domain[ domain_list.size() ];
+ for( int i = 0; i < domain_list.size(); ++i ) {
+ domain_ary[ i ] = domain_list.get( i );
+ }
+ Arrays.sort( domain_ary, new DomainComparator( true ) );
+ out.write( "{" );
+ boolean first = true;
+ for( final Domain domain : domain_ary ) {
+ if ( first ) {
+ first = false;
+ }
+ else {
+ out.write( "," );
+ }
+ out.write( domain.getDomainId().toString() );
+ out.write( ":" + domain.getFrom() + "-" + domain.getTo() );
+ out.write( ":" + domain.getPerDomainEvalue() );
+ }
+ out.write( "}" );
+ if ( !( ForesterUtil.isEmpty( protein.getDescription() )
+ || protein.getDescription().equals( SurfacingConstants.NONE ) ) ) {
+ out.write( protein.getDescription() );
+ }
+ out.write( separator_for_output );
+ if ( !( ForesterUtil.isEmpty( protein.getAccession() )
+ || protein.getAccession().equals( SurfacingConstants.NONE ) ) ) {
+ out.write( protein.getAccession() );
+ }
+ out.write( SurfacingConstants.NL );
+ }
+ }
+ }
+ }
+ out.flush();
+ }
+
public static void main( final String[] args ) {
Set<String> a = new HashSet<String>();
Set<String> b = new HashSet<String>();