final static private String NOT_IGNORE_DUFS_OPTION = "dufs";
final static private String MAX_FS_E_VALUE_OPTION = "fs_e";
final static private String MAX_I_E_VALUE_OPTION = "ie";
+ final static private String MIN_REL_ENV_LENGTH_RATIO_OPTION = "mrel";
final static private String MAX_ALLOWED_OVERLAP_OPTION = "mo";
final static private String NO_ENGULFING_OVERLAP_OPTION = "no_eo";
final static private String IGNORE_COMBINATION_WITH_SAME_OPTION = "ignore_self_comb";
final static private String INPUT_GENOMES_FILE_OPTION = "genomes";
final static private String INPUT_SPECIES_TREE_OPTION = "species_tree";
final static private String SEQ_EXTRACT_OPTION = "prot_extract";
- final static private String PRG_VERSION = "2.405";
- final static private String PRG_DATE = "170317";
+ final static private String PRG_VERSION = "2.500";
+ final static private String PRG_DATE = "170323";
final static private String E_MAIL = "czmasek@burnham.org";
final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester/surfacing";
final static private boolean IGNORE_DUFS_DEFAULT = true;
allowed_options.add( surfacing.NOT_IGNORE_DUFS_OPTION );
allowed_options.add( surfacing.MAX_FS_E_VALUE_OPTION );
allowed_options.add( surfacing.MAX_I_E_VALUE_OPTION );
+ allowed_options.add( surfacing.MIN_REL_ENV_LENGTH_RATIO_OPTION );
allowed_options.add( surfacing.DETAILEDNESS_OPTION );
allowed_options.add( surfacing.OUTPUT_FILE_OPTION );
allowed_options.add( surfacing.DOMAIN_SIMILARITY_SORT_OPTION );
boolean ignore_combination_with_same = surfacing.IGNORE_COMBINATION_WITH_SAME_DEFAULLT;
double fs_e_value_max = surfacing.MAX_E_VALUE_DEFAULT;
double ie_value_max = surfacing.MAX_E_VALUE_DEFAULT;
+ double rel_env_length_ratio_cutoff = -1;
int max_allowed_overlap = surfacing.MAX_ALLOWED_OVERLAP_DEFAULT;
final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
if ( dissallowed_options.length() > 0 ) {
ForesterUtil.fatalError( surfacing.PRG_NAME, "no acceptable value for E-value maximum" );
}
}
+ if ( cla.isOptionSet( surfacing.MIN_REL_ENV_LENGTH_RATIO_OPTION ) ) {
+ try {
+ rel_env_length_ratio_cutoff = cla.getOptionValueAsDouble( surfacing.MIN_REL_ENV_LENGTH_RATIO_OPTION );
+ }
+ catch ( final Exception e ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, "no acceptable value for min rel env length ratio" );
+ }
+ }
if ( cla.isOptionSet( surfacing.MAX_I_E_VALUE_OPTION ) ) {
try {
ie_value_max = cla.getOptionValueAsDouble( surfacing.MAX_I_E_VALUE_OPTION );
System.out.println( "iE-value maximum (incl) : " + ie_value_max );
html_desc.append( "<tr><td>iE-value maximum (inclusive):</td><td>" + ie_value_max + "</td></tr>" + nl );
}
+ if ( rel_env_length_ratio_cutoff > 0.0 ) {
+ System.out.println( "Rel env length ratio min : " + rel_env_length_ratio_cutoff );
+ html_desc.append( "<tr><td>Relative hmm envelope length ratio min (inclusive):</td><td>"
+ + rel_env_length_ratio_cutoff + "</td></tr>" + nl );
+ }
if ( fs_e_value_max >= 0.0 ) {
System.out.println( "FS E-value maximum (incl) : " + fs_e_value_max );
html_desc.append( "<tr><td>FS E-value maximum (inclusive):</td><td>" + fs_e_value_max + "</td></tr>" + nl );
if ( ie_value_max >= 0.0 ) {
parser.setIEValueMaximum( ie_value_max );
}
+ if ( rel_env_length_ratio_cutoff > 0.0 ) {
+ parser.setRelEnvLengthRatioCutoff( rel_env_length_ratio_cutoff );
+ }
parser.setIgnoreDufs( ignore_dufs );
parser.setIgnoreVirusLikeIds( ignore_virus_like_ids );
parser.setIgnoreEngulfedDomains( no_engulfing_overlaps );
SurfacingUtil
.log( "Domains ignored due to iE-value : " + parser.getDomainsIgnoredDueToIEval(),
log_writer );
+ System.out.println( "Domains ignored due to rel env length ratio : "
+ + parser.getDomainsIgnoredDueToRelEnvLengthRatioCutoff() );
+ SurfacingUtil.log( "Domains ignored due to rel env length ratio : "
+ + parser.getDomainsIgnoredDueToRelEnvLengthRatioCutoff(), log_writer );
System.out.println( "Domains ignored due to DUF designation : "
+ parser.getDomainsIgnoredDueToDuf() );
SurfacingUtil.log( "Domains ignored due to DUF designation : " + parser.getDomainsIgnoredDueToDuf(),
"Wrote domain promiscuities to: " + per_genome_domain_promiscuity_statistics_file );
//
if ( true ) { //TODO
- MinimalDomainomeCalculator.calcDomainome( intree_0_orig, protein_lists_per_species, -1 );
+ try {
+ MinimalDomainomeCalculator.calcOme( false,
+ intrees[ 0 ],
+ protein_lists_per_species,
+ "---",
+ 1000,
+ out_dir.toString() + "/" + output_file );
+ }
+ catch ( IOException e ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
+ }
}
if ( true ) { //TODO
try {
- MinimalDomainomeCalculator.calcDAome( intree_0_orig,
- protein_lists_per_species,
- "---",
- 1000,
- out_dir.toString() + "/" + output_file );
+ MinimalDomainomeCalculator.calcOme( true,
+ intrees[ 0 ],
+ protein_lists_per_species,
+ "---",
+ 1000,
+ out_dir.toString() + "/" + output_file );
}
catch ( IOException e ) {
ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
System.out.println( surfacing.OUTPUT_FILE_OPTION + ": name for (main) output file (mandatory)" );
System.out.println( surfacing.MAX_I_E_VALUE_OPTION + ": max (inclusive) iE-value" );
System.out.println( surfacing.MAX_FS_E_VALUE_OPTION + ": max (inclusive) FS E-value" );
+ System.out.println( surfacing.MIN_REL_ENV_LENGTH_RATIO_OPTION
+ + ": min (inclusive) relative envelope length ratio" );
System.out.println( surfacing.MAX_ALLOWED_OVERLAP_OPTION + ": maximal allowed domain overlap" );
System.out.println( surfacing.NO_ENGULFING_OVERLAP_OPTION + ": to ignore engulfed lower confidence domains" );
System.out.println( surfacing.SPECIES_MATRIX_OPTION + ": species matrix" );
private static final String HERPES = "HERPES_";
private static final String BACULO = "BACULO_";
private static final int E_VALUE_MAXIMUM_DEFAULT = -1;
+ private static final int LENGTH_RATIO_CUTOFF_DEFAULT = -1;
private static final ReturnType RETURN_TYPE_DEFAULT = ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN;
private static final boolean IGNORE_DUFS_DEFAULT = false;
private static final int MAX_ALLOWED_OVERLAP_DEFAULT = -1;
- private static final boolean IGNORE_REPLACED_RRMS = false;
- private static final boolean IGNORE_hGDE_amylase = true; //TODO eventually remove me, added 10/22/13
private final Set<String> _filter;
private final FilterType _filter_type;
private final File _input_file;
private final String _species;
private double _fs_e_value_maximum;
private double _i_e_value_maximum;
+ private double _rel_env_length_ratio_cutoff;
private Map<String, Double> _individual_score_cutoffs;
private boolean _ignore_dufs;
private boolean _ignore_virus_like_ids;
private int _domains_ignored_due_to_overlap;
private int _domains_ignored_due_to_fs_e_value;
private int _domains_ignored_due_to_i_e_value;
+ private int _domains_ignored_due_to_rel_env_length_ratio_cutoff;
private int _domains_ignored_due_to_individual_score_cutoff;
private int _domains_stored;
private SortedSet<String> _domains_stored_set;
_domains_stored -= domains_removed;
_domains_ignored_due_to_overlap += domains_removed;
}
- if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN ) || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) {
+ if ( ( getFilterType() == FilterType.POSITIVE_PROTEIN )
+ || ( getFilterType() == FilterType.NEGATIVE_PROTEIN ) ) {
final Set<String> domain_ids_in_protein = new HashSet<String>();
for( final Domain d : current_protein.getProteinDomains() ) {
domain_ids_in_protein.add( d.getDomainId() );
// Number of domains skipped because their independent E-value (iE-value)
// exceeded the configured maximum (see getIEValueMaximum()).
public int getDomainsIgnoredDueToIEval() {
return _domains_ignored_due_to_i_e_value;
}
+
+ // Number of domains skipped because their envelope length fell below the
+ // relative envelope length ratio cutoff (see setRelEnvLengthRatioCutoff()).
+ public int getDomainsIgnoredDueToRelEnvLengthRatioCutoff() {
+ return _domains_ignored_due_to_rel_env_length_ratio_cutoff;
+ }
+
+
public int getDomainsIgnoredDueToFsEval() {
return _domains_ignored_due_to_fs_e_value;
// Maximum allowed iE-value; a negative value (the default) disables iE-value filtering.
private double getIEValueMaximum() {
return _i_e_value_maximum;
}
+
+ // Minimum required ratio of envelope length to hmm length (tlen);
+ // values <= 0 (the default, -1) disable this filter.
+ private double getRelEnvLengthRatioCutoff() {
+ return _rel_env_length_ratio_cutoff;
+ }
// The set of ids the filter operates on; interpretation depends on FilterType.
private Set<String> getFilter() {
return _filter;
}
private void init() {
- _fs_e_value_maximum = HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT;
- _i_e_value_maximum = HmmscanPerDomainTableParser.E_VALUE_MAXIMUM_DEFAULT;
+ _fs_e_value_maximum = E_VALUE_MAXIMUM_DEFAULT;
+ _i_e_value_maximum = E_VALUE_MAXIMUM_DEFAULT;
+ _rel_env_length_ratio_cutoff = LENGTH_RATIO_CUTOFF_DEFAULT;
setIgnoreDufs( HmmscanPerDomainTableParser.IGNORE_DUFS_DEFAULT );
setReturnType( HmmscanPerDomainTableParser.RETURN_TYPE_DEFAULT );
_max_allowed_overlap = HmmscanPerDomainTableParser.MAX_ALLOWED_OVERLAP_DEFAULT;
setDomainsIgnoredDueToDuf( 0 );
setDomainsIgnoredDueToFsEval( 0 );
setDomainsIgnoredDueToIEval( 0 );
+ setDomainsIgnoredDueToRelEnvLengthRatioCutoff( 0 );
setDomainsIgnoredDueToIndividualScoreCutoff( 0 );
setDomainsIgnoredDueToVirusLikeId( 0 );
setDomainsIgnoredDueToOverlap( 0 );
if ( !isAllowProteinsWithSameName() ) {
if ( query.equals( prev_query ) ) {
throw new IOException( "more than one protein named [" + query + "]" + " lengths: " + qlen
- + ", " + prev_qlen );
+ + ", " + prev_qlen );
}
if ( prev_queries.contains( query ) ) {
throw new IOException( "more than one protein named [" + query + "]" );
}
else {
throw new IOException( "could not find a score cutoff value for domain id \"" + target_id
- + "\" [line " + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
+ + "\" [line " + line_number + "] in [" + getInputFile().getCanonicalPath() + "]" );
}
}
final String uc_id = target_id.toUpperCase();
+ final int env_length = 1 + env_to - env_from;
if ( failed_cutoff ) {
++_domains_ignored_due_to_individual_score_cutoff;
}
&& ( i_e_value > getIEValueMaximum() ) ) {
++_domains_ignored_due_to_i_e_value;
}
+ //
+ else if ( ( getRelEnvLengthRatioCutoff() > 0.0 )
+ && ( env_length < ( getRelEnvLengthRatioCutoff() * tlen) ) ) {
+ ++_domains_ignored_due_to_rel_env_length_ratio_cutoff;
+ }
+ //
else if ( isIgnoreDufs() && uc_id.startsWith( "DUF" ) ) {
++_domains_ignored_due_to_duf;
}
- else if ( IGNORE_REPLACED_RRMS
- && ( uc_id.contains( "RRM_1" ) || uc_id.contains( "RRM_3" ) || uc_id.contains( "RRM_5" ) || uc_id
- .contains( "RRM_6" ) ) ) {
- }
- else if ( IGNORE_hGDE_amylase && ( uc_id.equals( "hGDE_amylase" ) ) ) {
- }
else if ( isIgnoreVirusLikeIds()
&& ( uc_id.contains( VIR ) || uc_id.contains( PHAGE ) || uc_id.contains( RETRO )
|| uc_id.contains( TRANSPOS ) || uc_id.startsWith( RV ) || uc_id.startsWith( GAG )
( short ) domain_number,
( short ) total_domains,
i_e_value,
- domain_score );
+ domain_score,
+ ( short ) tlen,
+ ( short ) hmm_from,
+ ( short ) hmm_to );
current_protein.addProteinDomain( pd );
}
catch ( final IllegalArgumentException e ) {
return proteins;
}
- private double parseDouble( final String double_str, final int line_number, final String label ) throws IOException {
+ /**
+  * Parses double_str as a double.
+  *
+  * @param double_str the text to parse
+  * @param line_number input line number, used only for the error message
+  * @param label name of the field being parsed, used only for the error message
+  * @return the parsed value
+  * @throws IOException if double_str is not a valid double; the message names
+  *         the label, the offending text, the line number, and the input file
+  */
+ private double parseDouble( final String double_str, final int line_number, final String label )
+ throws IOException {
double d = -1;
try {
// Double.parseDouble avoids the needless boxing of Double.valueOf( ... ).doubleValue().
d = Double.parseDouble( double_str );
}
catch ( final NumberFormatException e ) {
// BUGFIX: 'label' was trapped inside the string literal
// ("could not parse \" +label + \" from \"") so the message printed the
// literal text '+label+' instead of the label value (compare the correct
// form used by the sibling integer parser). Also preserve the cause.
throw new IOException( "could not parse \"" + label + "\" from \"" + double_str + "\" [line " + line_number
+ "] in [" + getInputFile().getCanonicalPath() + "]", e );
}
return d;
}
}
catch ( final NumberFormatException e ) {
throw new IOException( "could not parse \"" + label + "\" from \"" + double_str + "\" [line " + line_number
- + "] in [" + getInputFile().getCanonicalPath() + "]" );
+ + "] in [" + getInputFile().getCanonicalPath() + "]" );
}
return i;
}
// Resets/sets the counter of domains ignored due to the iE-value maximum
// (used by init() to zero the counter before a parse).
private void setDomainsIgnoredDueToIEval( final int domains_ignored_due_to_i_e_value ) {
_domains_ignored_due_to_i_e_value = domains_ignored_due_to_i_e_value;
}
+
+ // Resets/sets the counter of domains ignored due to the relative envelope
+ // length ratio cutoff (used by init() to zero the counter before a parse).
+ private void setDomainsIgnoredDueToRelEnvLengthRatioCutoff( final int domains_ignored_due_to_rel_env_length_ratio_cutoff ) {
+ _domains_ignored_due_to_rel_env_length_ratio_cutoff = domains_ignored_due_to_rel_env_length_ratio_cutoff;
+ }
+
+
// Resets/sets the counter of domains ignored due to per-domain-id score cutoffs.
private void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) {
_domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff;
}
}
_i_e_value_maximum = i_e_value_maximum;
}
+
+ /**
+  * Sets the minimum required ratio of hmm envelope length to hmm length:
+  * domains whose envelope is shorter than (cutoff * tlen) are ignored.
+  *
+  * @param rel_env_length_ratio_cutoff must be greater than zero
+  * @throws IllegalArgumentException if the value is zero or negative
+  */
+ public void setRelEnvLengthRatioCutoff( final double rel_env_length_ratio_cutoff ) {
+ if ( rel_env_length_ratio_cutoff <= 0.0 ) {
+ throw new IllegalArgumentException( "attempt to set rel env length ratio cutoff to zero or a negative value" );
+ }
+ _rel_env_length_ratio_cutoff = rel_env_length_ratio_cutoff;
+ }
// Whether domains whose (upper-cased) id starts with "DUF"
// (domain of unknown function) are skipped during parsing.
public void setIgnoreDufs( final boolean ignore_dufs ) {
_ignore_dufs = ignore_dufs;
}
// How the id filter set is applied. NOTE(review): semantics inferred from
// names and the protein-level branch at the positive/negative-protein check;
// confirm against the filtering code before relying on this description.
public static enum FilterType {
- NONE, POSITIVE_PROTEIN, NEGATIVE_PROTEIN, NEGATIVE_DOMAIN
+ NONE,
+ POSITIVE_PROTEIN,
+ NEGATIVE_PROTEIN,
+ NEGATIVE_DOMAIN
}
// Which score an individual (per domain id) cutoff is compared against,
// or NONE to disable individual cutoffs.
static public enum INDIVIDUAL_SCORE_CUTOFF {
- FULL_SEQUENCE, DOMAIN, NONE;
+ FULL_SEQUENCE,
+ DOMAIN,
+ NONE;
}
// Shape of the parser's result; currently only one option exists.
public static enum ReturnType {
- UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN
+ UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN
}
}
final private double _per_domain_score;
final private int _to;
final private short _total_count;
+ final private short _hmm_len;
+ final private short _hmm_from;
+ final private short _hmm_to;
public BasicDomain( final String id ) {
if ( ForesterUtil.isEmpty( id ) ) {
_total_count = -1;
_per_domain_evalue = -1;
_per_domain_score = -1;
+ _hmm_len = -1;
+ _hmm_from= -1;
+ _hmm_to= -1;
+
}
public BasicDomain( final String id,
_total_count = total_count;
_per_domain_evalue = per_domain_evalue;
_per_domain_score = per_domain_score;
+ _hmm_len = -1;
+ _hmm_from= -1;
+ _hmm_to= -1;
+ }
+
+ /**
+  * Creates a protein domain which also records the coordinates of the hmm match.
+  *
+  * @param id non-empty domain identifier
+  * @param from first residue (inclusive) of the domain on the protein; must be >= 0 and < to
+  * @param to last residue (inclusive) of the domain on the protein
+  * @param number index of this domain among the protein's domains; must be >= 0 and <= total_count
+  * @param total_count total number of domains on the protein
+  * @param per_domain_evalue per-domain E-value; must be >= 0
+  * @param per_domain_score per-domain score
+  * @param hmm_len length of the matching hmm (tlen); must be > 0
+  * @param hmm_from first hmm position (inclusive) of the match; must be >= 0 and < hmm_to
+  * @param hmm_to last hmm position (inclusive) of the match
+  * @throws IllegalArgumentException if any argument is out of range
+  */
+ public BasicDomain( final String id,
+                     final int from,
+                     final int to,
+                     final short number,
+                     final short total_count,
+                     final double per_domain_evalue,
+                     final double per_domain_score,
+                     final short hmm_len,
+                     final short hmm_from,
+                     final short hmm_to ) {
+     if ( ( from >= to ) || ( from < 0 ) ) {
+         throw new IllegalArgumentException( "attempt to create protein domain from " + from + " to " + to );
+     }
+     if ( ForesterUtil.isEmpty( id ) ) {
+         throw new IllegalArgumentException( "attempt to create protein domain with null or empty id" );
+     }
+     if ( ( number > total_count ) || ( number < 0 ) ) {
+         throw new IllegalArgumentException( "attempt to create protein domain number " + number + " out of "
+                 + total_count );
+     }
+     if ( per_domain_evalue < 0.0 ) {
+         throw new IllegalArgumentException( "attempt to create protein domain with negative E-value" );
+     }
+     // NOTE(review): hmm_from == hmm_to (single-position hmm match) is rejected
+     // here, mirroring the envelope check above -- confirm this is intended.
+     if ( ( hmm_from >= hmm_to ) || ( hmm_from < 0 ) ) {
+         // BUGFIX: the message previously reported the envelope coordinates
+         // (from/to) instead of the offending hmm coordinates.
+         throw new IllegalArgumentException( "attempt to create protein domain matching hmm from " + hmm_from
+                 + " to " + hmm_to );
+     }
+     if ( hmm_len <= 0 ) {
+         throw new IllegalArgumentException( "attempt to create protein domain with zero or negative hmm length" );
+     }
+     _id = obtainIdAsShort( id );
+     _from = from;
+     _to = to;
+     _number = number;
+     _total_count = total_count;
+     _per_domain_evalue = per_domain_evalue;
+     _per_domain_score = per_domain_score;
+     _hmm_len = hmm_len;
+     _hmm_from = hmm_from;
+     _hmm_to = hmm_to;
+ }
/**
}
@Override
+ // Length of the hmm (tlen) behind this match; -1 when constructed without hmm data.
+ public final short getHmmLen() {
+ return _hmm_len;
+ }
+
+ @Override
+ // First hmm position of the match; -1 when constructed without hmm data.
+ public final short getHmmFrom() {
+ return _hmm_from;
+ }
+
+ @Override
+ // Last hmm position of the match; -1 when constructed without hmm data.
+ public final short getHmmTo() {
+ return _hmm_to;
+ }
+
+ @Override
// Hash is based solely on the domain id.
// NOTE(review): confirm equals() is likewise id-based so the contract holds.
public int hashCode() {
return getDomainId().hashCode();
}
// Reverse lookup: maps the interned short id back to the original id string
// (counterpart of obtainIdAsShort()).
public final static String obtainIdFromShort( final short id ) {
return ID_TO_STRING.get( id );
}
+
+
}
public int getTo();
public short getTotalCount();
+
+ // Length of the hmm (tlen) behind this domain match.
+ public short getHmmLen();
+
+ // First hmm position (inclusive) covered by this match.
+ public short getHmmFrom();
+
+ // Last hmm position (inclusive) covered by this match.
+ // CONSISTENCY: the new declarations now carry the explicit 'public' modifier
+ // used by every other declaration in this interface ('public' is implicit in
+ // interfaces, so this is behavior-neutral).
+ public short getHmmTo();
}
\ No newline at end of file
}
}
+ /**
+  * Computes, for every node of the given tree, the "minimal ome": the set of
+  * domain architectures (use_domain_architectures == true) or single domain
+  * ids (false) shared by ALL genomes that are external descendants of that
+  * node, and writes two files:
+  *   outfile_base + "_minimal_" + x + "ome.tsv"        -- one row per node
+  *   outfile_base + "_minimal_" + x + "ome_matrix.tsv" -- presence/absence (1/0) matrix
+  * where x is "DA" or "domain".
+  *
+  * @param use_domain_architectures true: compare whole domain architecture strings; false: single domain ids
+  * @param tre species tree; external-node taxonomy codes must match the keys of protein_lists_per_species
+  * @param protein_lists_per_species proteins per genome; at least two genomes required
+  * @param separator separator used when building domain architecture strings
+  * @param ie_cutoff per-domain iE-value cutoff; values <= -1 disable the cutoff
+  * @param outfile_base path prefix for the two output files
+  * @throws IOException on write failure
+  * @throws IllegalArgumentException if tre or protein_lists_per_species is null, or fewer than two genomes given
+  *
+  * NOTE(review): 'out' and 'out_table' are not closed if an exception is thrown
+  * mid-write (no try-with-resources/finally); the column header and the final
+  * log messages say "DA"/"DAome" even in single-domain mode.
+  */
+ static final public void calcOme( final boolean use_domain_architectures,
+ final Phylogeny tre,
+ final SortedMap<Species, List<Protein>> protein_lists_per_species,
+ final String separator,
+ final double ie_cutoff,
+ final String outfile_base )
+ throws IOException {
+ final SortedMap<String, SortedSet<String>> species_to_das_map = new TreeMap<String, SortedSet<String>>();
+ if ( protein_lists_per_species == null || tre == null ) {
+ throw new IllegalArgumentException( "argument is null" );
+ }
+ if ( protein_lists_per_species.size() < 2 ) {
+ throw new IllegalArgumentException( "not enough genomes" );
+ }
+ final String x;
+ if ( use_domain_architectures ) {
+ x = "DA";
+ }
+ else {
+ x = "domain";
+ }
+ final File outfile = new File( outfile_base + "_minimal_" + x + "ome.tsv" );
+ final File outfile_table = new File( outfile_base + "_minimal_" + x + "ome_matrix.tsv" );
+ SurfacingUtil.checkForOutputFileWriteability( outfile );
+ SurfacingUtil.checkForOutputFileWriteability( outfile_table );
+ final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
+ final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) );
+ out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#DA\tDA" );
+ out.write( ForesterUtil.LINE_SEPARATOR );
+ // Post-order walk: one output row per tree node (internal and external).
+ for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
+ final PhylogenyNode node = iter.next();
+ final String species_name = node.getNodeData().isHasTaxonomy()
+ ? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
+ final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName()
+ : "";
+ final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode()
+ : "";
+ final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : "";
+ out.write( species_name );
+ if ( !ForesterUtil.isEmpty( common ) ) {
+ out.write( "\t" + common );
+ }
+ else {
+ out.write( "\t" );
+ }
+ if ( !ForesterUtil.isEmpty( tcode ) ) {
+ out.write( "\t" + tcode );
+ }
+ else {
+ out.write( "\t" );
+ }
+ if ( !ForesterUtil.isEmpty( rank ) ) {
+ out.write( "\t" + rank );
+ }
+ else {
+ out.write( "\t" );
+ }
+ final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
+ // For leaves the ext-node count/codes columns are left empty.
+ if ( node.isInternal() ) {
+ out.write( "\t" + external_descs.size() + "\t" );
+ }
+ else {
+ out.write( "\t\t" );
+ }
+ // One DA/domain set per descendant genome; their intersection is the node's minimal ome.
+ final List<Set<String>> das_per_genome_list = new ArrayList<Set<String>>();
+ boolean first = true;
+ for( final PhylogenyNode external_desc : external_descs ) {
+ final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
+ if ( node.isInternal() ) {
+ if ( first ) {
+ first = false;
+ }
+ else {
+ out.write( ", " );
+ }
+ out.write( code );
+ }
+ final List<Protein> proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) );
+ if ( proteins_per_species != null ) {
+ final SortedSet<String> das_per_genome = new TreeSet<String>();
+ for( final Protein protein : proteins_per_species ) {
+ if ( use_domain_architectures ) {
+ final String da = protein.toDomainArchitectureString( separator, ie_cutoff );
+ das_per_genome.add( da );
+ }
+ else {
+ List<Domain> domains = protein.getProteinDomains();
+ for( final Domain domain : domains ) {
+ if ( ( ie_cutoff <= -1 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
+ das_per_genome.add( domain.getDomainId() );
+ }
+ }
+ }
+ }
+ if ( das_per_genome.size() > 0 ) {
+ das_per_genome_list.add( das_per_genome );
+ }
+ }
+ }
+ // NOTE(review): if no genome data was found for this node, the partially
+ // written row is left without a line terminator.
+ if ( das_per_genome_list.size() > 0 ) {
+ SortedSet<String> intersection = calcIntersection( das_per_genome_list );
+ out.write( "\t" + intersection.size() + "\t" );
+ first = true;
+ for( final String s : intersection ) {
+ if ( first ) {
+ first = false;
+ }
+ else {
+ out.write( ", " );
+ }
+ out.write( s );
+ }
+ out.write( ForesterUtil.LINE_SEPARATOR );
+ species_to_das_map.put( species_name, intersection );
+ }
+ }
+ // Build the presence/absence matrix over all rows collected above:
+ // columns = species names, rows = DAs/domains, cell = 1 if present.
+ final SortedSet<String> all_species_names = new TreeSet<String>();
+ final SortedSet<String> all_das = new TreeSet<String>();
+ for( final Entry<String, SortedSet<String>> e : species_to_das_map.entrySet() ) {
+ all_species_names.add( e.getKey() );
+ for( final String das : e.getValue() ) {
+ all_das.add( das );
+ }
+ }
+ out_table.write( '\t' );
+ boolean first = true;
+ for( final String species_name : all_species_names ) {
+ if ( first ) {
+ first = false;
+ }
+ else {
+ out_table.write( '\t' );
+ }
+ out_table.write( species_name );
+ }
+ out_table.write( ForesterUtil.LINE_SEPARATOR );
+ for( final String das : all_das ) {
+ out_table.write( das );
+ out_table.write( '\t' );
+ first = true;
+ for( final String species_name : all_species_names ) {
+ if ( first ) {
+ first = false;
+ }
+ else {
+ out_table.write( '\t' );
+ }
+ if ( species_to_das_map.get( species_name ).contains( das ) ) {
+ out_table.write( '1' );
+ }
+ else {
+ out_table.write( '0' );
+ }
+ }
+ out_table.write( ForesterUtil.LINE_SEPARATOR );
+ }
+ out.flush();
+ out.close();
+ out_table.flush();
+ out_table.close();
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to : " + outfile );
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table );
+ }
+
static final public void calcDAome( final Phylogeny tre,
final SortedMap<Species, List<Protein>> protein_lists_per_species,
final String separator,
SurfacingUtil.checkForOutputFileWriteability( outfile_table );
final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) );
+ out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#DA\tDA" );
+ out.write( ForesterUtil.LINE_SEPARATOR );
for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
final PhylogenyNode node = iter.next();
final String species_name = node.getNodeData().isHasTaxonomy()
? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
+ final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName()
+ : "";
+ final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode()
+ : "";
+ final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : "";
out.write( species_name );
+ if ( !ForesterUtil.isEmpty( common ) ) {
+ out.write( "\t" + common );
+ }
+ else {
+ out.write( "\t" );
+ }
+ if ( !ForesterUtil.isEmpty( tcode ) ) {
+ out.write( "\t" + tcode );
+ }
+ else {
+ out.write( "\t" );
+ }
+ if ( !ForesterUtil.isEmpty( rank ) ) {
+ out.write( "\t" + rank );
+ }
+ else {
+ out.write( "\t" );
+ }
final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
- out.write( "\t[" + external_descs.size() + "]:" );
+ if ( node.isInternal() ) {
+ out.write( "\t" + external_descs.size() + "\t" );
+ }
+ else {
+ out.write( "\t\t" );
+ }
final List<Set<String>> das_per_genome_list = new ArrayList<Set<String>>();
+ boolean first = true;
for( final PhylogenyNode external_desc : external_descs ) {
final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
- out.write( '\t' + code );
+ if ( node.isInternal() ) {
+ if ( first ) {
+ first = false;
+ }
+ else {
+ out.write( ", " );
+ }
+ out.write( code );
+ }
final List<Protein> proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) );
if ( proteins_per_species != null ) {
final SortedSet<String> das_per_genome = new TreeSet<String>();
das_per_genome_list.add( das_per_genome );
}
}
- }
+ }
if ( das_per_genome_list.size() > 0 ) {
SortedSet<String> intersection = calcIntersection( das_per_genome_list );
- out.write( "\t[" + intersection.size() + "]:" );
+ out.write( "\t" + intersection.size() + "\t" );
+ first = true;
for( final String s : intersection ) {
- out.write( '\t' + s );
+ if ( first ) {
+ first = false;
+ }
+ else {
+ out.write( ", " );
+ }
+ out.write( s );
}
out.write( ForesterUtil.LINE_SEPARATOR );
- out.write( ForesterUtil.LINE_SEPARATOR );
species_to_das_map.put( species_name, intersection );
}
}
out.close();
out_table.flush();
out_table.close();
- ForesterUtil.programMessage( surfacing.PRG_NAME,
- "Wrote minimal DAome data to : " + outfile );
- ForesterUtil.programMessage( surfacing.PRG_NAME,
- "Wrote minimal DAome data to (as table): " + outfile_table );
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to : " + outfile );
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table );
}
private final static SortedSet<String> calcIntersection( final List<Set<String>> features_per_genome_list ) {
// Deliberately unimplemented stubs: this Domain implementation carries no
// count or hmm data, matching the existing getTotalCount() style.
// NOTE(review): UnsupportedOperationException would signal intent more
// clearly than a bare RuntimeException.
public short getTotalCount() {
throw new RuntimeException( "method not implemented" );
}
+
+ @Override
+ public short getHmmLen() {
+ throw new RuntimeException( "method not implemented" );
+ }
+
+ @Override
+ public short getHmmFrom() {
+ throw new RuntimeException( "method not implemented" );
+ }
+
+ @Override
+ public short getHmmTo() {
+ throw new RuntimeException( "method not implemented" );
+ }
}