final static private String NOT_IGNORE_DUFS_OPTION = "dufs";
final static private String MAX_FS_E_VALUE_OPTION = "fs_e";
final static private String MAX_I_E_VALUE_OPTION = "ie";
+ final static private String MIN_REL_ENV_LENGTH_RATIO_OPTION = "mrel";
final static private String MAX_ALLOWED_OVERLAP_OPTION = "mo";
final static private String NO_ENGULFING_OVERLAP_OPTION = "no_eo";
final static private String IGNORE_COMBINATION_WITH_SAME_OPTION = "ignore_self_comb";
final static private String INPUT_GENOMES_FILE_OPTION = "genomes";
final static private String INPUT_SPECIES_TREE_OPTION = "species_tree";
final static private String SEQ_EXTRACT_OPTION = "prot_extract";
- final static private String PRG_VERSION = "2.405";
- final static private String PRG_DATE = "170317";
+ final static private String PRG_VERSION = "2.500";
+ final static private String PRG_DATE = "170323";
final static private String E_MAIL = "czmasek@burnham.org";
final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester/surfacing";
final static private boolean IGNORE_DUFS_DEFAULT = true;
allowed_options.add( surfacing.NOT_IGNORE_DUFS_OPTION );
allowed_options.add( surfacing.MAX_FS_E_VALUE_OPTION );
allowed_options.add( surfacing.MAX_I_E_VALUE_OPTION );
+ allowed_options.add( surfacing.MIN_REL_ENV_LENGTH_RATIO_OPTION );
allowed_options.add( surfacing.DETAILEDNESS_OPTION );
allowed_options.add( surfacing.OUTPUT_FILE_OPTION );
allowed_options.add( surfacing.DOMAIN_SIMILARITY_SORT_OPTION );
boolean ignore_combination_with_same = surfacing.IGNORE_COMBINATION_WITH_SAME_DEFAULLT;
double fs_e_value_max = surfacing.MAX_E_VALUE_DEFAULT;
double ie_value_max = surfacing.MAX_E_VALUE_DEFAULT;
+ double rel_env_length_ratio_cutoff = -1;
int max_allowed_overlap = surfacing.MAX_ALLOWED_OVERLAP_DEFAULT;
final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
if ( dissallowed_options.length() > 0 ) {
ForesterUtil.fatalError( surfacing.PRG_NAME, "no acceptable value for E-value maximum" );
}
}
+ if ( cla.isOptionSet( surfacing.MIN_REL_ENV_LENGTH_RATIO_OPTION ) ) {
+ try {
+ rel_env_length_ratio_cutoff = cla.getOptionValueAsDouble( surfacing.MIN_REL_ENV_LENGTH_RATIO_OPTION );
+ }
+ catch ( final Exception e ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, "no acceptable value for min rel env length ratio" );
+ }
+ }
if ( cla.isOptionSet( surfacing.MAX_I_E_VALUE_OPTION ) ) {
try {
ie_value_max = cla.getOptionValueAsDouble( surfacing.MAX_I_E_VALUE_OPTION );
System.out.println( "iE-value maximum (incl) : " + ie_value_max );
html_desc.append( "<tr><td>iE-value maximum (inclusive):</td><td>" + ie_value_max + "</td></tr>" + nl );
}
+ if ( rel_env_length_ratio_cutoff > 0.0 ) {
+ System.out.println( "Rel env length ratio min : " + rel_env_length_ratio_cutoff );
+ html_desc.append( "<tr><td>Relative hmm envelope length ratio min (inclusive):</td><td>"
+ + rel_env_length_ratio_cutoff + "</td></tr>" + nl );
+ }
if ( fs_e_value_max >= 0.0 ) {
System.out.println( "FS E-value maximum (incl) : " + fs_e_value_max );
html_desc.append( "<tr><td>FS E-value maximum (inclusive):</td><td>" + fs_e_value_max + "</td></tr>" + nl );
if ( ie_value_max >= 0.0 ) {
parser.setIEValueMaximum( ie_value_max );
}
+ if ( rel_env_length_ratio_cutoff > 0.0 ) {
+ parser.setRelEnvLengthRatioCutoff( rel_env_length_ratio_cutoff );
+ }
parser.setIgnoreDufs( ignore_dufs );
parser.setIgnoreVirusLikeIds( ignore_virus_like_ids );
parser.setIgnoreEngulfedDomains( no_engulfing_overlaps );
SurfacingUtil
.log( "Domains ignored due to iE-value : " + parser.getDomainsIgnoredDueToIEval(),
log_writer );
+ System.out.println( "Domains ignored due to rel env length ratio : "
+ + parser.getDomainsIgnoredDueToRelEnvLengthRatioCutoff() );
+ SurfacingUtil.log( "Domains ignored due to rel env length ratio : "
+ + parser.getDomainsIgnoredDueToRelEnvLengthRatioCutoff(), log_writer );
System.out.println( "Domains ignored due to DUF designation : "
+ parser.getDomainsIgnoredDueToDuf() );
SurfacingUtil.log( "Domains ignored due to DUF designation : " + parser.getDomainsIgnoredDueToDuf(),
"Wrote domain promiscuities to: " + per_genome_domain_promiscuity_statistics_file );
//
if ( true ) { //TODO
- MinimalDomainomeCalculator.calcDomainome( intree_0_orig, protein_lists_per_species, -1 );
+ try {
+ MinimalDomainomeCalculator.calcOme( false,
+ intrees[ 0 ],
+ protein_lists_per_species,
+ "---",
+ 1000,
+ out_dir.toString() + "/" + output_file );
+ }
+ catch ( IOException e ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
+ }
}
if ( true ) { //TODO
try {
- MinimalDomainomeCalculator.calcDAome( intree_0_orig,
- protein_lists_per_species,
- "---",
- 1000,
- out_dir.toString() + "/" + output_file );
+ MinimalDomainomeCalculator.calcOme( true,
+ intrees[ 0 ],
+ protein_lists_per_species,
+ "---",
+ 1000,
+ out_dir.toString() + "/" + output_file );
}
catch ( IOException e ) {
ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
System.out.println( surfacing.OUTPUT_FILE_OPTION + ": name for (main) output file (mandatory)" );
System.out.println( surfacing.MAX_I_E_VALUE_OPTION + ": max (inclusive) iE-value" );
System.out.println( surfacing.MAX_FS_E_VALUE_OPTION + ": max (inclusive) FS E-value" );
+ System.out.println( surfacing.MIN_REL_ENV_LENGTH_RATIO_OPTION
+ + ": min (inclusive) relative envelope length ratio" );
System.out.println( surfacing.MAX_ALLOWED_OVERLAP_OPTION + ": maximal allowed domain overlap" );
System.out.println( surfacing.NO_ENGULFING_OVERLAP_OPTION + ": to ignore engulfed lower confidence domains" );
System.out.println( surfacing.SPECIES_MATRIX_OPTION + ": species matrix" );
private static final String HERPES = "HERPES_";
private static final String BACULO = "BACULO_";
private static final int E_VALUE_MAXIMUM_DEFAULT = -1;
+ private static final int LENGTH_RATIO_CUTOFF_DEFAULT = -1;
private static final ReturnType RETURN_TYPE_DEFAULT = ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN;
private static final boolean IGNORE_DUFS_DEFAULT = false;
private static final int MAX_ALLOWED_OVERLAP_DEFAULT = -1;
- private static final boolean IGNORE_REPLACED_RRMS = false;
- private static final boolean IGNORE_hGDE_amylase = true; //TODO eventually remove me, added 10/22/13
private final Set<String> _filter;
private final FilterType _filter_type;
private final File _input_file;
private final String _species;
private double _fs_e_value_maximum;
private double _i_e_value_maximum;
+ private double _rel_env_length_ratio_cutoff;
private Map<String, Double> _individual_score_cutoffs;
private boolean _ignore_dufs;
private boolean _ignore_virus_like_ids;
private int _domains_ignored_due_to_overlap;
private int _domains_ignored_due_to_fs_e_value;
private int _domains_ignored_due_to_i_e_value;
+ private int _domains_ignored_due_to_rel_env_length_ratio_cutoff;
private int _domains_ignored_due_to_individual_score_cutoff;
private int _domains_stored;
private SortedSet<String> _domains_stored_set;
_domains_stored -= domains_removed;
_domains_ignored_due_to_overlap += domains_removed;
}
- if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN ) || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) {
+ if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN )
+ || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) {
final Set<String> domain_ids_in_protein = new HashSet<String>();
for( final Domain d : current_protein.getProteinDomains() ) {
domain_ids_in_protein.add( d.getDomainId() );
// Number of domains skipped because their independent E-value (iE-value)
// exceeded the configured maximum (see getIEValueMaximum()).
public int getDomainsIgnoredDueToIEval() {
return _domains_ignored_due_to_i_e_value;
}
+
+ // Number of domains skipped because their envelope length fell below the
+ // relative envelope length ratio cutoff (see setRelEnvLengthRatioCutoff()).
+ public int getDomainsIgnoredDueToRelEnvLengthRatioCutoff() {
+ return _domains_ignored_due_to_rel_env_length_ratio_cutoff;
+ }
+
+
public int getDomainsIgnoredDueToFsEval() {
return _domains_ignored_due_to_fs_e_value;
// Maximum allowed iE-value; a negative value (the default) disables iE-value filtering.
private double getIEValueMaximum() {
return _i_e_value_maximum;
}
+
+ // Minimum required ratio of envelope length to hmm length (tlen);
+ // values <= 0 (the default, -1) disable this filter.
+ private double getRelEnvLengthRatioCutoff() {
+ return _rel_env_length_ratio_cutoff;
+ }
// The set of ids the filter operates on; interpretation depends on FilterType.
private Set<String> getFilter() {
return _filter;
}
private void init() {
- _fs_e_value_maximum = HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT;
- _i_e_value_maximum = HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT;
+ _fs_e_value_maximum = E_VALUE_MAXIMUM_DEFAULT;
+ _i_e_value_maximum = E_VALUE_MAXIMUM_DEFAULT;
+ _rel_env_length_ratio_cutoff = LENGTH_RATIO_CUTOFF_DEFAULT;
setIgnoreDufs( HmmscanPerDomainTableParser.IGNORE_DUFS_DEFAULT );
setReturnType( HmmscanPerDomainTableParser.RETURN_TYPE_DEFAULT );
_max_allowed_overlap = HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT;
setDomainsIgnoredDueToDuf( 0 );
setDomainsIgnoredDueToFsEval( 0 );
setDomainsIgnoredDueToIEval( 0 );
+ setDomainsIgnoredDueToRelEnvLengthRatioCutoff( 0 );
setDomainsIgnoredDueToIndividualScoreCutoff( 0 );
setDomainsIgnoredDueToVirusLikeId( 0 );
setDomainsIgnoredDueToOverlap( 0 );
if ( !isAllowProteinsWithSameName() ) {
if ( query.equals( prev_query ) ) {
throw new IOException( "more than one protein named [" + query + "]" + " lengths: " + qlen
- + ", " + prev_qlen );
+ + ", " + prev_qlen );
}
if ( prev_queries.contains( query ) ) {
throw new IOException( "more than one protein named [" + query + "]" );
}
else {
throw new IOException( "could not find a score cutoff value for domain id \"" + target_id
- + "\" [line " + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
+ + "\" [line " + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
}
}
final String uc_id = target_id.toUpperCase();
+ final int env_length = 1 + env_to - env_from;
if ( failed_cutoff ) {
++_domains_ignored_due_to_individual_score_cutoff;
}
&& ( i_e_value > getIEValueMaximum() ) ) {
++_domains_ignored_due_to_i_e_value;
}
+ //
+ else if ( ( getRelEnvLengthRatioCutoff() > 0.0 )
+ && ( env_length < ( getRelEnvLengthRatioCutoff() * tlen) ) ) {
+ ++_domains_ignored_due_to_rel_env_length_ratio_cutoff;
+ }
+ //
else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) {
++_domains_ignored_due_to_duf;
}
- else if ( IGNORE_REPLACED_RRMS
- && ( uc_id.contains( "RRM_1" ) || uc_id.contains( "RRM_3" ) || uc_id.contains( "RRM_5" ) || uc_id
- .contains( "RRM_6" ) ) ) {
- }
- else if ( IGNORE_hGDE_amylase && ( uc_id.equals( "hGDE_amylase" ) ) ) {
- }
else if ( isIgnoreVirusLikeIds()
&& ( uc_id.contains( VIR ) || uc_id.contains( PHAGE ) || uc_id.contains( RETRO )
|| uc_id.contains( TRANSPOS ) || uc_id.startsWith( RV ) || uc_id.startsWith( GAG )
( short ) domain_number,
( short ) total_domains,
i_e_value,
- domain_score );
+ domain_score,
+ ( short ) tlen,
+ ( short ) hmm_from,
+ ( short ) hmm_to );
current_protein.addProteinDomain( pd );
}
catch ( final IllegalArgumentException e ) {
return proteins;
}
- private double parseDouble( final String double_str, final int line_number, final String label ) throws IOException {
+ /**
+  * Parses double_str as a double.
+  *
+  * @param double_str the text to parse
+  * @param line_number input line number, used only for the error message
+  * @param label name of the field being parsed, used only for the error message
+  * @return the parsed value
+  * @throws IOException if double_str is not a valid double; the message names
+  *         the label, the offending text, the line number, and the input file
+  */
+ private double parseDouble( final String double_str, final int line_number, final String label )
+ throws IOException {
double d = -1;
try {
// Double.parseDouble avoids the needless boxing of Double.valueOf( ... ).doubleValue().
d = Double.parseDouble( double_str );
}
catch ( final NumberFormatException e ) {
// BUGFIX: 'label' was trapped inside the string literal
// ("could not parse \" +label + \" from \"") so the message printed the
// literal text '+label+' instead of the label value (compare the correct
// form used by the sibling integer parser). Also preserve the cause.
throw new IOException( "could not parse \"" + label + "\" from \"" + double_str + "\" [line " + line_number
+ "] in [" + getInputFile().getCanonicalPath() + "]", e );
}
return d;
}
}
catch ( final NumberFormatException e ) {
throw new IOException( "could not parse \"" + label + "\" from \"" + double_str + "\" [line " + line_number
- + "] in [" + getInputFile().getCanonicalPath() + "]" );
+ + "] in [" + getInputFile().getCanonicalPath() + "]" );
}
return i;
}
// Resets/sets the counter of domains ignored due to the iE-value maximum
// (used by init() to zero the counter before a parse).
private void setDomainsIgnoredDueToIEval( final int domains_ignored_due_to_i_e_value ) {
_domains_ignored_due_to_i_e_value = domains_ignored_due_to_i_e_value;
}
+
+ // Resets/sets the counter of domains ignored due to the relative envelope
+ // length ratio cutoff (used by init() to zero the counter before a parse).
+ private void setDomainsIgnoredDueToRelEnvLengthRatioCutoff( final int domains_ignored_due_to_rel_env_length_ratio_cutoff ) {
+ _domains_ignored_due_to_rel_env_length_ratio_cutoff = domains_ignored_due_to_rel_env_length_ratio_cutoff;
+ }
+
+
// Resets/sets the counter of domains ignored due to per-domain-id score cutoffs.
private void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) {
_domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff;
}
}
_i_e_value_maximum = i_e_value_maximum;
}
+
+ /**
+  * Sets the minimum required ratio of hmm envelope length to hmm length:
+  * domains whose envelope is shorter than (cutoff * tlen) are ignored.
+  *
+  * @param rel_env_length_ratio_cutoff must be greater than zero
+  * @throws IllegalArgumentException if the value is zero or negative
+  */
+ public void setRelEnvLengthRatioCutoff( final double rel_env_length_ratio_cutoff ) {
+ if ( rel_env_length_ratio_cutoff <= 0.0 ) {
+ throw new IllegalArgumentException( "attempt to set rel env length ratio cutoff to zero or a negative value" );
+ }
+ _rel_env_length_ratio_cutoff = rel_env_length_ratio_cutoff;
+ }
// Whether domains whose (upper-cased) id starts with "DUF"
// (domain of unknown function) are skipped during parsing.
public void setIgnoreDufs( final boolean ignore_dufs ) {
_ignore_dufs = ignore_dufs;
}
// How the id filter set is applied. NOTE(review): semantics inferred from
// names and the protein-level branch at the positive/negative-protein check;
// confirm against the filtering code before relying on this description.
public static enum FilterType {
- NONE, POSITIVE_PROTEIN, NEGATIVE_PROTEIN, NEGATIVE_DOMAIN
+ NONE,
+ POSITIVE_PROTEIN,
+ NEGATIVE_PROTEIN,
+ NEGATIVE_DOMAIN
}
// Which score an individual (per domain id) cutoff is compared against,
// or NONE to disable individual cutoffs.
static public enum INDIVIDUAL_SCORE_CUTOFF {
- FULL_SEQUENCE, DOMAIN, NONE;
+ FULL_SEQUENCE,
+ DOMAIN,
+ NONE;
}
// Shape of the parser's result; currently only one option exists.
public static enum ReturnType {
- UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN
+ UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN
}
}
final private double _per_domain_score;
final private int _to;
final private short _total_count;
+ final private short _hmm_len;
+ final private short _hmm_from;
+ final private short _hmm_to;
public BasicDomain( final String id ) {
if ( ForesterUtil.isEmpty( id ) ) {
_total_count = -1;
_per_domain_evalue = -1;
_per_domain_score = -1;
+ _hmm_len = -1;
+ _hmm_from= -1;
+ _hmm_to= -1;
+
}
public BasicDomain( final String id,
_total_count = total_count;
_per_domain_evalue = per_domain_evalue;
_per_domain_score = per_domain_score;
+ _hmm_len = -1;
+ _hmm_from= -1;
+ _hmm_to= -1;
+ }
+
+ /**
+  * Creates a protein domain which also records the coordinates of the hmm match.
+  *
+  * @param id non-empty domain identifier
+  * @param from first residue (inclusive) of the domain on the protein; must be >= 0 and < to
+  * @param to last residue (inclusive) of the domain on the protein
+  * @param number index of this domain among the protein's domains; must be >= 0 and <= total_count
+  * @param total_count total number of domains on the protein
+  * @param per_domain_evalue per-domain E-value; must be >= 0
+  * @param per_domain_score per-domain score
+  * @param hmm_len length of the matching hmm (tlen); must be > 0
+  * @param hmm_from first hmm position (inclusive) of the match; must be >= 0 and < hmm_to
+  * @param hmm_to last hmm position (inclusive) of the match
+  * @throws IllegalArgumentException if any argument is out of range
+  */
+ public BasicDomain( final String id,
+                     final int from,
+                     final int to,
+                     final short number,
+                     final short total_count,
+                     final double per_domain_evalue,
+                     final double per_domain_score,
+                     final short hmm_len,
+                     final short hmm_from,
+                     final short hmm_to ) {
+     if ( ( from >= to ) || ( from < 0 ) ) {
+         throw new IllegalArgumentException( "attempt to create protein domain from " + from + " to " + to );
+     }
+     if ( ForesterUtil.isEmpty( id ) ) {
+         throw new IllegalArgumentException( "attempt to create protein domain with null or empty id" );
+     }
+     if ( ( number > total_count ) || ( number < 0 ) ) {
+         throw new IllegalArgumentException( "attempt to create protein domain number " + number + " out of "
+                 + total_count );
+     }
+     if ( per_domain_evalue < 0.0 ) {
+         throw new IllegalArgumentException( "attempt to create protein domain with negative E-value" );
+     }
+     // NOTE(review): hmm_from == hmm_to (single-position hmm match) is rejected
+     // here, mirroring the envelope check above -- confirm this is intended.
+     if ( ( hmm_from >= hmm_to ) || ( hmm_from < 0 ) ) {
+         // BUGFIX: the message previously reported the envelope coordinates
+         // (from/to) instead of the offending hmm coordinates.
+         throw new IllegalArgumentException( "attempt to create protein domain matching hmm from " + hmm_from
+                 + " to " + hmm_to );
+     }
+     if ( hmm_len <= 0 ) {
+         throw new IllegalArgumentException( "attempt to create protein domain with zero or negative hmm length" );
+     }
+     _id = obtainIdAsShort( id );
+     _from = from;
+     _to = to;
+     _number = number;
+     _total_count = total_count;
+     _per_domain_evalue = per_domain_evalue;
+     _per_domain_score = per_domain_score;
+     _hmm_len = hmm_len;
+     _hmm_from = hmm_from;
+     _hmm_to = hmm_to;
+ }
/**
}
@Override
+ // Length of the hmm (tlen) behind this match; -1 when constructed without hmm data.
+ public final short getHmmLen() {
+ return _hmm_len;
+ }
+
+ @Override
+ // First hmm position of the match; -1 when constructed without hmm data.
+ public final short getHmmFrom() {
+ return _hmm_from;
+ }
+
+ @Override
+ // Last hmm position of the match; -1 when constructed without hmm data.
+ public final short getHmmTo() {
+ return _hmm_to;
+ }
+
+ @Override
// Hash is based solely on the domain id.
// NOTE(review): confirm equals() is likewise id-based so the contract holds.
public int hashCode() {
return getDomainId().hashCode();
}
// Reverse lookup: maps the interned short id back to the original id string
// (counterpart of obtainIdAsShort()).
public final static String obtainIdFromShort( final short id ) {
return ID_TO_STRING.get( id );
}
+
+
}
public int getTo();
public short getTotalCount();
+
+ // Length of the hmm (tlen) behind this domain match.
+ public short getHmmLen();
+
+ // First hmm position (inclusive) covered by this match.
+ public short getHmmFrom();
+
+ // Last hmm position (inclusive) covered by this match.
+ // CONSISTENCY: the new declarations now carry the explicit 'public' modifier
+ // used by every other declaration in this interface ('public' is implicit in
+ // interfaces, so this is behavior-neutral).
+ public short getHmmTo();
}
\ No newline at end of file
}
}
+ /**
+  * Computes, for every node of the given tree, the "minimal ome": the set of
+  * domain architectures (use_domain_architectures == true) or single domain
+  * ids (false) shared by ALL genomes that are external descendants of that
+  * node, and writes two files:
+  *   outfile_base + "_minimal_" + x + "ome.tsv"        -- one row per node
+  *   outfile_base + "_minimal_" + x + "ome_matrix.tsv" -- presence/absence (1/0) matrix
+  * where x is "DA" or "domain".
+  *
+  * @param use_domain_architectures true: compare whole domain architecture strings; false: single domain ids
+  * @param tre species tree; external-node taxonomy codes must match the keys of protein_lists_per_species
+  * @param protein_lists_per_species proteins per genome; at least two genomes required
+  * @param separator separator used when building domain architecture strings
+  * @param ie_cutoff per-domain iE-value cutoff; values <= -1 disable the cutoff
+  * @param outfile_base path prefix for the two output files
+  * @throws IOException on write failure
+  * @throws IllegalArgumentException if tre or protein_lists_per_species is null, or fewer than two genomes given
+  *
+  * NOTE(review): 'out' and 'out_table' are not closed if an exception is thrown
+  * mid-write (no try-with-resources/finally); the column header and the final
+  * log messages say "DA"/"DAome" even in single-domain mode.
+  */
+ static final public void calcOme( final boolean use_domain_architectures,
+ final Phylogeny tre,
+ final SortedMap<Species, List<Protein>> protein_lists_per_species,
+ final String separator,
+ final double ie_cutoff,
+ final String outfile_base )
+ throws IOException {
+ final SortedMap<String, SortedSet<String>> species_to_das_map = new TreeMap<String, SortedSet<String>>();
+ if ( protein_lists_per_species == null || tre == null ) {
+ throw new IllegalArgumentException( "argument is null" );
+ }
+ if ( protein_lists_per_species.size() < 2 ) {
+ throw new IllegalArgumentException( "not enough genomes" );
+ }
+ final String x;
+ if ( use_domain_architectures ) {
+ x = "DA";
+ }
+ else {
+ x = "domain";
+ }
+ final File outfile = new File( outfile_base + "_minimal_" + x + "ome.tsv" );
+ final File outfile_table = new File( outfile_base + "_minimal_" + x + "ome_matrix.tsv" );
+ SurfacingUtil.checkForOutputFileWriteability( outfile );
+ SurfacingUtil.checkForOutputFileWriteability( outfile_table );
+ final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
+ final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) );
+ out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#DA\tDA" );
+ out.write( ForesterUtil.LINE_SEPARATOR );
+ // Post-order walk: one output row per tree node (internal and external).
+ for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
+ final PhylogenyNode node = iter.next();
+ final String species_name = node.getNodeData().isHasTaxonomy()
+ ? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
+ final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName()
+ : "";
+ final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode()
+ : "";
+ final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : "";
+ out.write( species_name );
+ if ( !ForesterUtil.isEmpty( common ) ) {
+ out.write( "\t" + common );
+ }
+ else {
+ out.write( "\t" );
+ }
+ if ( !ForesterUtil.isEmpty( tcode ) ) {
+ out.write( "\t" + tcode );
+ }
+ else {
+ out.write( "\t" );
+ }
+ if ( !ForesterUtil.isEmpty( rank ) ) {
+ out.write( "\t" + rank );
+ }
+ else {
+ out.write( "\t" );
+ }
+ final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
+ // For leaves the ext-node count/codes columns are left empty.
+ if ( node.isInternal() ) {
+ out.write( "\t" + external_descs.size() + "\t" );
+ }
+ else {
+ out.write( "\t\t" );
+ }
+ // One DA/domain set per descendant genome; their intersection is the node's minimal ome.
+ final List<Set<String>> das_per_genome_list = new ArrayList<Set<String>>();
+ boolean first = true;
+ for( final PhylogenyNode external_desc : external_descs ) {
+ final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
+ if ( node.isInternal() ) {
+ if ( first ) {
+ first = false;
+ }
+ else {
+ out.write( ", " );
+ }
+ out.write( code );
+ }
+ final List<Protein> proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) );
+ if ( proteins_per_species != null ) {
+ final SortedSet<String> das_per_genome = new TreeSet<String>();
+ for( final Protein protein : proteins_per_species ) {
+ if ( use_domain_architectures ) {
+ final String da = protein.toDomainArchitectureString( separator, ie_cutoff );
+ das_per_genome.add( da );
+ }
+ else {
+ List<Domain> domains = protein.getProteinDomains();
+ for( final Domain domain : domains ) {
+ if ( ( ie_cutoff <= -1 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
+ das_per_genome.add( domain.getDomainId() );
+ }
+ }
+ }
+ }
+ if ( das_per_genome.size() > 0 ) {
+ das_per_genome_list.add( das_per_genome );
+ }
+ }
+ }
+ // NOTE(review): if no genome data was found for this node, the partially
+ // written row is left without a line terminator.
+ if ( das_per_genome_list.size() > 0 ) {
+ SortedSet<String> intersection = calcIntersection( das_per_genome_list );
+ out.write( "\t" + intersection.size() + "\t" );
+ first = true;
+ for( final String s : intersection ) {
+ if ( first ) {
+ first = false;
+ }
+ else {
+ out.write( ", " );
+ }
+ out.write( s );
+ }
+ out.write( ForesterUtil.LINE_SEPARATOR );
+ species_to_das_map.put( species_name, intersection );
+ }
+ }
+ // Build the presence/absence matrix over all rows collected above:
+ // columns = species names, rows = DAs/domains, cell = 1 if present.
+ final SortedSet<String> all_species_names = new TreeSet<String>();
+ final SortedSet<String> all_das = new TreeSet<String>();
+ for( final Entry<String, SortedSet<String>> e : species_to_das_map.entrySet() ) {
+ all_species_names.add( e.getKey() );
+ for( final String das : e.getValue() ) {
+ all_das.add( das );
+ }
+ }
+ out_table.write( '\t' );
+ boolean first = true;
+ for( final String species_name : all_species_names ) {
+ if ( first ) {
+ first = false;
+ }
+ else {
+ out_table.write( '\t' );
+ }
+ out_table.write( species_name );
+ }
+ out_table.write( ForesterUtil.LINE_SEPARATOR );
+ for( final String das : all_das ) {
+ out_table.write( das );
+ out_table.write( '\t' );
+ first = true;
+ for( final String species_name : all_species_names ) {
+ if ( first ) {
+ first = false;
+ }
+ else {
+ out_table.write( '\t' );
+ }
+ if ( species_to_das_map.get( species_name ).contains( das ) ) {
+ out_table.write( '1' );
+ }
+ else {
+ out_table.write( '0' );
+ }
+ }
+ out_table.write( ForesterUtil.LINE_SEPARATOR );
+ }
+ out.flush();
+ out.close();
+ out_table.flush();
+ out_table.close();
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to : " + outfile );
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table );
+ }
+
static final public void calcDAome( final Phylogeny tre,
final SortedMap<Species, List<Protein>> protein_lists_per_species,
final String separator,
SurfacingUtil.checkForOutputFileWriteability( outfile_table );
final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) );
+ out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#DA\tDA" );
+ out.write( ForesterUtil.LINE_SEPARATOR );
for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
final PhylogenyNode node = iter.next();
final String species_name = node.getNodeData().isHasTaxonomy()
? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
+ final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName()
+ : "";
+ final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode()
+ : "";
+ final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : "";
out.write( species_name );
+ if ( !ForesterUtil.isEmpty( common ) ) {
+ out.write( "\t" + common );
+ }
+ else {
+ out.write( "\t" );
+ }
+ if ( !ForesterUtil.isEmpty( tcode ) ) {
+ out.write( "\t" + tcode );
+ }
+ else {
+ out.write( "\t" );
+ }
+ if ( !ForesterUtil.isEmpty( rank ) ) {
+ out.write( "\t" + rank );
+ }
+ else {
+ out.write( "\t" );
+ }
final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
- out.write( "\t[" + external_descs.size() + "]:" );
+ if ( node.isInternal() ) {
+ out.write( "\t" + external_descs.size() + "\t" );
+ }
+ else {
+ out.write( "\t\t" );
+ }
final List<Set<String>> das_per_genome_list = new ArrayList<Set<String>>();
+ boolean first = true;
for( final PhylogenyNode external_desc : external_descs ) {
final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
- out.write( '\t' + code );
+ if ( node.isInternal() ) {
+ if ( first ) {
+ first = false;
+ }
+ else {
+ out.write( ", " );
+ }
+ out.write( code );
+ }
final List<Protein> proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) );
if ( proteins_per_species != null ) {
final SortedSet<String> das_per_genome = new TreeSet<String>();
das_per_genome_list.add( das_per_genome );
}
}
- }
+ }
if ( das_per_genome_list.size() > 0 ) {
SortedSet<String> intersection = calcIntersection( das_per_genome_list );
- out.write( "\t[" + intersection.size() + "]:" );
+ out.write( "\t" + intersection.size() + "\t" );
+ first = true;
for( final String s : intersection ) {
- out.write( '\t' + s );
+ if ( first ) {
+ first = false;
+ }
+ else {
+ out.write( ", " );
+ }
+ out.write( s );
}
out.write( ForesterUtil.LINE_SEPARATOR );
- out.write( ForesterUtil.LINE_SEPARATOR );
species_to_das_map.put( species_name, intersection );
}
}
out.close();
out_table.flush();
out_table.close();
- ForesterUtil.programMessage( surfacing.PRG_NAME,
- "Wrote minimal DAome data to : " + outfile );
- ForesterUtil.programMessage( surfacing.PRG_NAME,
- "Wrote minimal DAome data to (as table): " + outfile_table );
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to : " + outfile );
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table );
}
private final static SortedSet<String> calcIntersection( final List<Set<String>> features_per_genome_list ) {
// Deliberately unimplemented stubs: this Domain implementation carries no
// count or hmm data, matching the existing getTotalCount() style.
// NOTE(review): UnsupportedOperationException would signal intent more
// clearly than a bare RuntimeException.
public short getTotalCount() {
throw new RuntimeException( "method not implemented" );
}
+
+ @Override
+ public short getHmmLen() {
+ throw new RuntimeException( "method not implemented" );
+ }
+
+ @Override
+ public short getHmmFrom() {
+ throw new RuntimeException( "method not implemented" );
+ }
+
+ @Override
+ public short getHmmTo() {
+ throw new RuntimeException( "method not implemented" );
+ }
}