From bc608c6e85a6389cdc3d21c1ea1728b44be82bae Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Thu, 23 Mar 2017 18:51:30 -0700 Subject: [PATCH] in progress... --- .../src/org/forester/application/surfacing.java | 52 ++++- .../io/parsers/HmmscanPerDomainTableParser.java | 77 +++++-- .../java/src/org/forester/protein/BasicDomain.java | 68 ++++++ forester/java/src/org/forester/protein/Domain.java | 6 + .../surfacing/MinimalDomainomeCalculator.java | 227 +++++++++++++++++++- .../src/org/forester/surfacing/SimpleDomain.java | 15 ++ 6 files changed, 407 insertions(+), 38 deletions(-) diff --git a/forester/java/src/org/forester/application/surfacing.java b/forester/java/src/org/forester/application/surfacing.java index a3572d5..c35c98d 100644 --- a/forester/java/src/org/forester/application/surfacing.java +++ b/forester/java/src/org/forester/application/surfacing.java @@ -175,6 +175,7 @@ public class surfacing { final static private String NOT_IGNORE_DUFS_OPTION = "dufs"; final static private String MAX_FS_E_VALUE_OPTION = "fs_e"; final static private String MAX_I_E_VALUE_OPTION = "ie"; + final static private String MIN_REL_ENV_LENGTH_RATIO_OPTION = "mrel"; final static private String MAX_ALLOWED_OVERLAP_OPTION = "mo"; final static private String NO_ENGULFING_OVERLAP_OPTION = "no_eo"; final static private String IGNORE_COMBINATION_WITH_SAME_OPTION = "ignore_self_comb"; @@ -216,8 +217,8 @@ public class surfacing { final static private String INPUT_GENOMES_FILE_OPTION = "genomes"; final static private String INPUT_SPECIES_TREE_OPTION = "species_tree"; final static private String SEQ_EXTRACT_OPTION = "prot_extract"; - final static private String PRG_VERSION = "2.405"; - final static private String PRG_DATE = "170317"; + final static private String PRG_VERSION = "2.500"; + final static private String PRG_DATE = "170323"; final static private String E_MAIL = "czmasek@burnham.org"; final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester/surfacing"; final static private boolean IGNORE_DUFS_DEFAULT = true; @@ -308,6 +309,7 @@ public class surfacing { allowed_options.add( surfacing.NOT_IGNORE_DUFS_OPTION ); allowed_options.add( surfacing.MAX_FS_E_VALUE_OPTION ); allowed_options.add( surfacing.MAX_I_E_VALUE_OPTION ); + allowed_options.add( surfacing.MIN_REL_ENV_LENGTH_RATIO_OPTION ); allowed_options.add( surfacing.DETAILEDNESS_OPTION ); allowed_options.add( surfacing.OUTPUT_FILE_OPTION ); allowed_options.add( surfacing.DOMAIN_SIMILARITY_SORT_OPTION ); @@ -352,6 +354,7 @@ public class surfacing { boolean ignore_combination_with_same = surfacing.IGNORE_COMBINATION_WITH_SAME_DEFAULLT; double fs_e_value_max = surfacing.MAX_E_VALUE_DEFAULT; double ie_value_max = surfacing.MAX_E_VALUE_DEFAULT; + double rel_env_length_ratio_cutoff = -1; int max_allowed_overlap = surfacing.MAX_ALLOWED_OVERLAP_DEFAULT; final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options ); if ( dissallowed_options.length() > 0 ) { @@ -393,6 +396,14 @@ public class surfacing { ForesterUtil.fatalError( surfacing.PRG_NAME, "no acceptable value for E-value maximum" ); } } + if ( cla.isOptionSet( surfacing.MIN_REL_ENV_LENGTH_RATIO_OPTION ) ) { + try { + rel_env_length_ratio_cutoff = cla.getOptionValueAsDouble( surfacing.MIN_REL_ENV_LENGTH_RATIO_OPTION ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no acceptable value for min rel env length ratio" ); + } + } if ( cla.isOptionSet( surfacing.MAX_I_E_VALUE_OPTION ) ) { try { ie_value_max = cla.getOptionValueAsDouble( surfacing.MAX_I_E_VALUE_OPTION ); @@ -1106,6 +1117,11 @@ public class surfacing { System.out.println( "iE-value maximum (incl) : " + ie_value_max ); html_desc.append( "iE-value maximum (inclusive):" + ie_value_max + "" + nl ); } + if ( rel_env_length_ratio_cutoff > 0.0 ) { + System.out.println( "Rel env length ratio min : " + rel_env_length_ratio_cutoff ); + html_desc.append( "Relative hmm envelope length ratio min (inclusive):" + + rel_env_length_ratio_cutoff + "" + nl ); + } if ( fs_e_value_max >= 0.0 ) { System.out.println( "FS E-value maximum (incl) : " + fs_e_value_max ); html_desc.append( "FS E-value maximum (inclusive):" + fs_e_value_max + "" + nl ); @@ -1558,6 +1574,9 @@ public class surfacing { if ( ie_value_max >= 0.0 ) { parser.setIEValueMaximum( ie_value_max ); } + if ( rel_env_length_ratio_cutoff > 0.0 ) { + parser.setRelEnvLengthRatioCutoff( rel_env_length_ratio_cutoff ); + } parser.setIgnoreDufs( ignore_dufs ); parser.setIgnoreVirusLikeIds( ignore_virus_like_ids ); parser.setIgnoreEngulfedDomains( no_engulfing_overlaps ); @@ -1628,6 +1647,10 @@ public class surfacing { SurfacingUtil .log( "Domains ignored due to iE-value : " + parser.getDomainsIgnoredDueToIEval(), log_writer ); + System.out.println( "Domains ignored due to rel env length ratio : " + + parser.getDomainsIgnoredDueToRelEnvLengthRatioCutoff() ); + SurfacingUtil.log( "Domains ignored due to rel env length ratio : " + + parser.getDomainsIgnoredDueToRelEnvLengthRatioCutoff(), log_writer ); System.out.println( "Domains ignored due to DUF designation : " + parser.getDomainsIgnoredDueToDuf() ); SurfacingUtil.log( "Domains ignored due to DUF designation : " + parser.getDomainsIgnoredDueToDuf(), @@ -1754,15 +1777,26 @@ public class surfacing { "Wrote domain promiscuities to: " + per_genome_domain_promiscuity_statistics_file ); // if ( true ) { //TODO - MinimalDomainomeCalculator.calcDomainome( intree_0_orig, protein_lists_per_species, -1 ); + try { + MinimalDomainomeCalculator.calcOme( false, + intrees[ 0 ], + protein_lists_per_species, + "---", + 1000, + out_dir.toString() + "/" + output_file ); + } + catch ( IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); + } } if ( true ) { //TODO try { - MinimalDomainomeCalculator.calcDAome( intree_0_orig, - protein_lists_per_species, - "---", - 1000, - out_dir.toString() + "/" + output_file ); + MinimalDomainomeCalculator.calcOme( true, + intrees[ 0 ], + protein_lists_per_species, + "---", + 1000, + out_dir.toString() + "/" + output_file ); } catch ( IOException e ) { ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); @@ -2211,6 +2245,8 @@ public class surfacing { System.out.println( surfacing.OUTPUT_FILE_OPTION + ": name for (main) output file (mandatory)" ); System.out.println( surfacing.MAX_I_E_VALUE_OPTION + ": max (inclusive) iE-value" ); System.out.println( surfacing.MAX_FS_E_VALUE_OPTION + ": max (inclusive) FS E-value" ); + System.out.println( surfacing.MIN_REL_ENV_LENGTH_RATIO_OPTION + + ": min (inclusive) relative envelope length ratio" ); System.out.println( surfacing.MAX_ALLOWED_OVERLAP_OPTION + ": maximal allowed domain overlap" ); System.out.println( surfacing.NO_ENGULFING_OVERLAP_OPTION + ": to ignore engulfed lower confidence domains" ); System.out.println( surfacing.SPECIES_MATRIX_OPTION + ": species matrix" ); diff --git a/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java b/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java index 5e0ba9c..c75e521 100644 --- a/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java +++ b/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java @@ -59,17 +59,17 @@ public final class HmmscanPerDomainTableParser { private static final String HERPES = "HERPES_"; private static final String BACULO = "BACULO_"; private static final int E_VALUE_MAXIMUM_DEFAULT = -1; + private static final int LENGTH_RATIO_CUTOFF_DEFAULT = -1; private static final ReturnType RETURN_TYPE_DEFAULT = ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN; private static final boolean IGNORE_DUFS_DEFAULT = false; private static final int MAX_ALLOWED_OVERLAP_DEFAULT = -1; - private static final boolean IGNORE_REPLACED_RRMS = false; - private static final boolean IGNORE_hGDE_amylase = true; //TODO eventually remove me, added 10/22/13 private final Set _filter; private final FilterType _filter_type; private final File _input_file; private final String _species; private double _fs_e_value_maximum; private double _i_e_value_maximum; + private double _rel_env_length_ratio_cutoff; private Map _individual_score_cutoffs; private boolean _ignore_dufs; private boolean _ignore_virus_like_ids; @@ -84,6 +84,7 @@ public final class HmmscanPerDomainTableParser { private int _domains_ignored_due_to_overlap; private int _domains_ignored_due_to_fs_e_value; private int _domains_ignored_due_to_i_e_value; + private int _domains_ignored_due_to_rel_env_length_ratio_cutoff; private int _domains_ignored_due_to_individual_score_cutoff; private int _domains_stored; private SortedSet _domains_stored_set; @@ -173,7 +174,8 @@ public final class HmmscanPerDomainTableParser { _domains_stored -= domains_removed; _domains_ignored_due_to_overlap += domains_removed; } - if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN ) || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) { + if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN ) + || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) { final Set domain_ids_in_protein = new HashSet(); for( final Domain d : current_protein.getProteinDomains() ) { domain_ids_in_protein.add( d.getDomainId() ); @@ -212,6 +214,12 @@ public final class HmmscanPerDomainTableParser { public int getDomainsIgnoredDueToIEval() { return _domains_ignored_due_to_i_e_value; } + + public int getDomainsIgnoredDueToRelEnvLengthRatioCutoff() { + return _domains_ignored_due_to_rel_env_length_ratio_cutoff; + } + + public int getDomainsIgnoredDueToFsEval() { return _domains_ignored_due_to_fs_e_value; @@ -256,6 +264,10 @@ public final class HmmscanPerDomainTableParser { private double getIEValueMaximum() { return _i_e_value_maximum; } + + private double getRelEnvLengthRatioCutoff() { + return _rel_env_length_ratio_cutoff; + } private Set getFilter() { return _filter; @@ -306,8 +318,9 @@ public final class HmmscanPerDomainTableParser { } private void init() { - _fs_e_value_maximum = HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT; - _i_e_value_maximum = HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT; + _fs_e_value_maximum = E_VALUE_MAXIMUM_DEFAULT; + _i_e_value_maximum = E_VALUE_MAXIMUM_DEFAULT; + _rel_env_length_ratio_cutoff = LENGTH_RATIO_CUTOFF_DEFAULT; setIgnoreDufs( HmmscanPerDomainTableParser.IGNORE_DUFS_DEFAULT ); setReturnType( HmmscanPerDomainTableParser.RETURN_TYPE_DEFAULT ); _max_allowed_overlap = HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT; @@ -326,6 +339,7 @@ public final class HmmscanPerDomainTableParser { setDomainsIgnoredDueToDuf( 0 ); setDomainsIgnoredDueToFsEval( 0 ); setDomainsIgnoredDueToIEval( 0 ); + setDomainsIgnoredDueToRelEnvLengthRatioCutoff( 0 ); setDomainsIgnoredDueToIndividualScoreCutoff( 0 ); setDomainsIgnoredDueToVirusLikeId( 0 ); setDomainsIgnoredDueToOverlap( 0 ); @@ -406,7 +420,7 @@ public final class HmmscanPerDomainTableParser { if ( !isAllowProteinsWithSameName() ) { if ( query.equals( prev_query ) ) { throw new IOException( "more than one protein named [" + query + "]" + " lengths: " + qlen - + ", " + prev_qlen ); + + ", " + prev_qlen ); } if ( prev_queries.contains( query ) ) { throw new IOException( "more than one protein named [" + query + "]" ); @@ -442,10 +456,11 @@ public final class HmmscanPerDomainTableParser { } else { throw new IOException( "could not find a score cutoff value for domain id \"" + target_id - + "\" [line " + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" ); + + "\" [line " + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" ); } } final String uc_id = target_id.toUpperCase(); + final int env_length = 1 + env_to - env_from; if ( failed_cutoff ) { ++_domains_ignored_due_to_individual_score_cutoff; } @@ -460,15 +475,15 @@ public final class HmmscanPerDomainTableParser { && ( i_e_value > getIEValueMaximum() ) ) { ++_domains_ignored_due_to_i_e_value; } + // + else if ( ( getRelEnvLengthRatioCutoff() > 0.0 ) + && ( env_length < ( getRelEnvLengthRatioCutoff() * tlen) ) ) { + ++_domains_ignored_due_to_rel_env_length_ratio_cutoff; + } + // else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) { ++_domains_ignored_due_to_duf; } - else if ( IGNORE_REPLACED_RRMS - && ( uc_id.contains( "RRM_1" ) || uc_id.contains( "RRM_3" ) || uc_id.contains( "RRM_5" ) || uc_id - .contains( "RRM_6" ) ) ) { - } - else if ( IGNORE_hGDE_amylase && ( uc_id.equals( "hGDE_amylase" ) ) ) { - } else if ( isIgnoreVirusLikeIds() && ( uc_id.contains( VIR ) || uc_id.contains( PHAGE ) || uc_id.contains( RETRO ) || uc_id.contains( TRANSPOS ) || uc_id.startsWith( RV ) || uc_id.startsWith( GAG ) @@ -488,7 +503,10 @@ public final class HmmscanPerDomainTableParser { ( short ) domain_number, ( short ) total_domains, i_e_value, - domain_score ); + domain_score, + ( short ) tlen, + ( short ) hmm_from, + ( short ) hmm_to ); current_protein.addProteinDomain( pd ); } catch ( final IllegalArgumentException e ) { @@ -506,14 +524,15 @@ public final class HmmscanPerDomainTableParser { return proteins; } - private double parseDouble( final String double_str, final int line_number, final String label ) throws IOException { + private double parseDouble( final String double_str, final int line_number, final String label ) + throws IOException { double d = -1; try { d = Double.valueOf( double_str ).doubleValue(); } catch ( final NumberFormatException e ) { throw new IOException( "could not parse \" +label + \" from \"" + double_str + "\" [line " + line_number - + "] in [" + getInputFile().getCanonicalPath() + "]" ); + + "] in [" + getInputFile().getCanonicalPath() + "]" ); } return d; } @@ -525,7 +544,7 @@ public final class HmmscanPerDomainTableParser { } catch ( final NumberFormatException e ) { throw new IOException( "could not parse \"" + label + "\" from \"" + double_str + "\" [line " + line_number - + "] in [" + getInputFile().getCanonicalPath() + "]" ); + + "] in [" + getInputFile().getCanonicalPath() + "]" ); } return i; } @@ -545,7 +564,13 @@ public final class HmmscanPerDomainTableParser { private void setDomainsIgnoredDueToIEval( final int domains_ignored_due_to_i_e_value ) { _domains_ignored_due_to_i_e_value = domains_ignored_due_to_i_e_value; } + + private void setDomainsIgnoredDueToRelEnvLengthRatioCutoff( final int domains_ignored_due_to_rel_env_length_ratio_cutoff ) { + _domains_ignored_due_to_rel_env_length_ratio_cutoff = domains_ignored_due_to_rel_env_length_ratio_cutoff; + } + + private void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) { _domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff; } @@ -591,6 +616,13 @@ public final class HmmscanPerDomainTableParser { } _i_e_value_maximum = i_e_value_maximum; } + + public void setRelEnvLengthRatioCutoff( final double rel_env_length_ratio_cutoff ) { + if ( rel_env_length_ratio_cutoff <= 0.0 ) { + throw new IllegalArgumentException( "attempt to set rel env length ratio cutoff to zero or a negative value" ); + } + _rel_env_length_ratio_cutoff = rel_env_length_ratio_cutoff; + } public void setIgnoreDufs( final boolean ignore_dufs ) { _ignore_dufs = ignore_dufs; @@ -649,14 +681,19 @@ public final class HmmscanPerDomainTableParser { } public static enum FilterType { - NONE, POSITIVE_PROTEIN, NEGATIVE_PROTEIN, NEGATIVE_DOMAIN + NONE, + POSITIVE_PROTEIN, + NEGATIVE_PROTEIN, + NEGATIVE_DOMAIN } static public enum INDIVIDUAL_SCORE_CUTOFF { - FULL_SEQUENCE, DOMAIN, NONE; + FULL_SEQUENCE, + DOMAIN, + NONE; } public static enum ReturnType { - UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN + UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN } } diff --git a/forester/java/src/org/forester/protein/BasicDomain.java b/forester/java/src/org/forester/protein/BasicDomain.java index b132898..b7ab1d8 100644 --- a/forester/java/src/org/forester/protein/BasicDomain.java +++ b/forester/java/src/org/forester/protein/BasicDomain.java @@ -43,6 +43,9 @@ public class BasicDomain implements Domain { final private double _per_domain_score; final private int _to; final private short _total_count; + final private short _hmm_len; + final private short _hmm_from; + final private short _hmm_to; public BasicDomain( final String id ) { if ( ForesterUtil.isEmpty( id ) ) { @@ -55,6 +58,10 @@ public class BasicDomain implements Domain { _total_count = -1; _per_domain_evalue = -1; _per_domain_score = -1; + _hmm_len = -1; + _hmm_from= -1; + _hmm_to= -1; + } public BasicDomain( final String id, @@ -84,6 +91,50 @@ public class BasicDomain implements Domain { _total_count = total_count; _per_domain_evalue = per_domain_evalue; _per_domain_score = per_domain_score; + _hmm_len = -1; + _hmm_from= -1; + _hmm_to= -1; + } + + public BasicDomain( final String id, + final int from, + final int to, + final short number, + final short total_count, + final double per_domain_evalue, + final double per_domain_score, + final short hmm_len, + final short hmm_from, + final short hmm_to) { + if ( ( from >= to ) || ( from < 0 ) ) { + throw new IllegalArgumentException( "attempt to create protein domain from " + from + " to " + to ); + } + if ( ForesterUtil.isEmpty( id ) ) { + throw new IllegalArgumentException( "attempt to create protein domain with null or empty id" ); + } + if ( ( number > total_count ) || ( number < 0 ) ) { + throw new IllegalArgumentException( "attempt to create protein domain number " + number + " out of " + + total_count ); + } + if ( per_domain_evalue < 0.0 ) { + throw new IllegalArgumentException( "attempt to create protein domain with negative E-value" ); + } + if ( ( hmm_from >= hmm_to ) || ( hmm_from < 0 ) ) { + throw new IllegalArgumentException( "attempt to create protein domain matching hmm from " + from + " to " + to ); + } + if ( hmm_len <= 0 ) { + throw new IllegalArgumentException( "attempt to create protein domain with zero or negative hmm length" ); + } + _id = obtainIdAsShort( id ); + _from = from; + _to = to; + _number = number; + _total_count = total_count; + _per_domain_evalue = per_domain_evalue; + _per_domain_score = per_domain_score; + _hmm_len = hmm_len; + _hmm_from= hmm_from; + _hmm_to= hmm_to; } /** @@ -166,6 +217,21 @@ public class BasicDomain implements Domain { } @Override + public final short getHmmLen() { + return _hmm_len; + } + + @Override + public final short getHmmFrom() { + return _hmm_from; + } + + @Override + public final short getHmmTo() { + return _hmm_to; + } + + @Override public int hashCode() { return getDomainId().hashCode(); } @@ -194,4 +260,6 @@ public class BasicDomain implements Domain { public final static String obtainIdFromShort( final short id ) { return ID_TO_STRING.get( id ); } + + } diff --git a/forester/java/src/org/forester/protein/Domain.java b/forester/java/src/org/forester/protein/Domain.java index b1f720a..dcc41fb 100644 --- a/forester/java/src/org/forester/protein/Domain.java +++ b/forester/java/src/org/forester/protein/Domain.java @@ -43,4 +43,10 @@ public interface Domain extends Comparable { public int getTo(); public short getTotalCount(); + + short getHmmLen(); + + short getHmmFrom(); + + short getHmmTo(); } \ No newline at end of file diff --git a/forester/java/src/org/forester/surfacing/MinimalDomainomeCalculator.java b/forester/java/src/org/forester/surfacing/MinimalDomainomeCalculator.java index f252202..ba667ce 100644 --- a/forester/java/src/org/forester/surfacing/MinimalDomainomeCalculator.java +++ b/forester/java/src/org/forester/surfacing/MinimalDomainomeCalculator.java @@ -79,6 +79,170 @@ public final class MinimalDomainomeCalculator { } } + static final public void calcOme( final boolean use_domain_architectures, + final Phylogeny tre, + final SortedMap> protein_lists_per_species, + final String separator, + final double ie_cutoff, + final String outfile_base ) + throws IOException { + final SortedMap> species_to_das_map = new TreeMap>(); + if ( protein_lists_per_species == null || tre == null ) { + throw new IllegalArgumentException( "argument is null" ); + } + if ( protein_lists_per_species.size() < 2 ) { + throw new IllegalArgumentException( "not enough genomes" ); + } + final String x; + if ( use_domain_architectures ) { + x = "DA"; + } + else { + x = "domain"; + } + final File outfile = new File( outfile_base + "_minimal_" + x + "ome.tsv" ); + final File outfile_table = new File( outfile_base + "_minimal_" + x + "ome_matrix.tsv" ); + SurfacingUtil.checkForOutputFileWriteability( outfile ); + SurfacingUtil.checkForOutputFileWriteability( outfile_table ); + final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); + final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) ); + out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#DA\tDA" ); + out.write( ForesterUtil.LINE_SEPARATOR ); + for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + final String species_name = node.getNodeData().isHasTaxonomy() + ? node.getNodeData().getTaxonomy().getScientificName() : node.getName(); + final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName() + : ""; + final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode() + : ""; + final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : ""; + out.write( species_name ); + if ( !ForesterUtil.isEmpty( common ) ) { + out.write( "\t" + common ); + } + else { + out.write( "\t" ); + } + if ( !ForesterUtil.isEmpty( tcode ) ) { + out.write( "\t" + tcode ); + } + else { + out.write( "\t" ); + } + if ( !ForesterUtil.isEmpty( rank ) ) { + out.write( "\t" + rank ); + } + else { + out.write( "\t" ); + } + final List external_descs = node.getAllExternalDescendants(); + if ( node.isInternal() ) { + out.write( "\t" + external_descs.size() + "\t" ); + } + else { + out.write( "\t\t" ); + } + final List> das_per_genome_list = new ArrayList>(); + boolean first = true; + for( final PhylogenyNode external_desc : external_descs ) { + final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode(); + if ( node.isInternal() ) { + if ( first ) { + first = false; + } + else { + out.write( ", " ); + } + out.write( code ); + } + final List proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) ); + if ( proteins_per_species != null ) { + final SortedSet das_per_genome = new TreeSet(); + for( final Protein protein : proteins_per_species ) { + if ( use_domain_architectures ) { + final String da = protein.toDomainArchitectureString( separator, ie_cutoff ); + das_per_genome.add( da ); + } + else { + List domains = protein.getProteinDomains(); + for( final Domain domain : domains ) { + if ( ( ie_cutoff <= -1 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) { + das_per_genome.add( domain.getDomainId() ); + } + } + } + } + if ( das_per_genome.size() > 0 ) { + das_per_genome_list.add( das_per_genome ); + } + } + } + if ( das_per_genome_list.size() > 0 ) { + SortedSet intersection = calcIntersection( das_per_genome_list ); + out.write( "\t" + intersection.size() + "\t" ); + first = true; + for( final String s : intersection ) { + if ( first ) { + first = false; + } + else { + out.write( ", " ); + } + out.write( s ); + } + out.write( ForesterUtil.LINE_SEPARATOR ); + species_to_das_map.put( species_name, intersection ); + } + } + final SortedSet all_species_names = new TreeSet(); + final SortedSet all_das = new TreeSet(); + for( final Entry> e : species_to_das_map.entrySet() ) { + all_species_names.add( e.getKey() ); + for( final String das : e.getValue() ) { + all_das.add( das ); + } + } + out_table.write( '\t' ); + boolean first = true; + for( final String species_name : all_species_names ) { + if ( first ) { + first = false; + } + else { + out_table.write( '\t' ); + } + out_table.write( species_name ); + } + out_table.write( ForesterUtil.LINE_SEPARATOR ); + for( final String das : all_das ) { + out_table.write( das ); + out_table.write( '\t' ); + first = true; + for( final String species_name : all_species_names ) { + if ( first ) { + first = false; + } + else { + out_table.write( '\t' ); + } + if ( species_to_das_map.get( species_name ).contains( das ) ) { + out_table.write( '1' ); + } + else { + out_table.write( '0' ); + } + } + out_table.write( ForesterUtil.LINE_SEPARATOR ); + } + out.flush(); + out.close(); + out_table.flush(); + out_table.close(); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to : " + outfile ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table ); + } + static final public void calcDAome( final Phylogeny tre, final SortedMap> protein_lists_per_species, final String separator, @@ -98,17 +262,56 @@ public final class MinimalDomainomeCalculator { SurfacingUtil.checkForOutputFileWriteability( outfile_table ); final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) ); + out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#DA\tDA" ); + out.write( ForesterUtil.LINE_SEPARATOR ); for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) { final PhylogenyNode node = iter.next(); final String species_name = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getScientificName() : node.getName(); + final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName() + : ""; + final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode() + : ""; + final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : ""; out.write( species_name ); + if ( !ForesterUtil.isEmpty( common ) ) { + out.write( "\t" + common ); + } + else { + out.write( "\t" ); + } + if ( !ForesterUtil.isEmpty( tcode ) ) { + out.write( "\t" + tcode ); + } + else { + out.write( "\t" ); + } + if ( !ForesterUtil.isEmpty( rank ) ) { + out.write( "\t" + rank ); + } + else { + out.write( "\t" ); + } final List external_descs = node.getAllExternalDescendants(); - out.write( "\t[" + external_descs.size() + "]:" ); + if ( node.isInternal() ) { + out.write( "\t" + external_descs.size() + "\t" ); + } + else { + out.write( "\t\t" ); + } final List> das_per_genome_list = new ArrayList>(); + boolean first = true; for( final PhylogenyNode external_desc : external_descs ) { final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode(); - out.write( '\t' + code ); + if ( node.isInternal() ) { + if ( first ) { + first = false; + } + else { + out.write( ", " ); + } + out.write( code ); + } final List proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) ); if ( proteins_per_species != null ) { final SortedSet das_per_genome = new TreeSet(); @@ -120,15 +323,21 @@ public final class MinimalDomainomeCalculator { das_per_genome_list.add( das_per_genome ); } } - } + } if ( das_per_genome_list.size() > 0 ) { SortedSet intersection = calcIntersection( das_per_genome_list ); - out.write( "\t[" + intersection.size() + "]:" ); + out.write( "\t" + intersection.size() + "\t" ); + first = true; for( final String s : intersection ) { - out.write( '\t' + s ); + if ( first ) { + first = false; + } + else { + out.write( ", " ); + } + out.write( s ); } out.write( ForesterUtil.LINE_SEPARATOR ); - out.write( ForesterUtil.LINE_SEPARATOR ); species_to_das_map.put( species_name, intersection ); } } @@ -176,10 +385,8 @@ public final class MinimalDomainomeCalculator { out.close(); out_table.flush(); out_table.close(); - ForesterUtil.programMessage( surfacing.PRG_NAME, - "Wrote minimal DAome data to : " + outfile ); - ForesterUtil.programMessage( surfacing.PRG_NAME, - "Wrote minimal DAome data to (as table): " + outfile_table ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to : " + outfile ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table ); } private final static SortedSet calcIntersection( final List> features_per_genome_list ) { diff --git a/forester/java/src/org/forester/surfacing/SimpleDomain.java b/forester/java/src/org/forester/surfacing/SimpleDomain.java index 782ce20..2520e6c 100644 --- a/forester/java/src/org/forester/surfacing/SimpleDomain.java +++ b/forester/java/src/org/forester/surfacing/SimpleDomain.java @@ -92,4 +92,19 @@ public class SimpleDomain implements Domain { public short getTotalCount() { throw new RuntimeException( "method not implemented" ); } + + @Override + public short getHmmLen() { + throw new RuntimeException( "method not implemented" ); + } + + @Override + public short getHmmFrom() { + throw new RuntimeException( "method not implemented" ); + } + + @Override + public short getHmmTo() { + throw new RuntimeException( "method not implemented" ); + } } -- 1.7.10.2