in progress...

author cmzmasek <chris.zma@outlook.com>

Mon, 27 Mar 2017 05:01:52 +0000 (22:01 -0700)

committer cmzmasek <chris.zma@outlook.com>

Mon, 27 Mar 2017 05:01:52 +0000 (22:01 -0700)
author cmzmasek <chris.zma@outlook.com>
Mon, 27 Mar 2017 05:01:52 +0000 (22:01 -0700)
committer cmzmasek <chris.zma@outlook.com>
Mon, 27 Mar 2017 05:01:52 +0000 (22:01 -0700)
diff --git a/forester/java/src/org/forester/application/surfacing.java b/forester/java/src/org/forester/application/surfacing.java

index c35c98d..88be3d2 100644 (file)
--- a/forester/java/src/org/forester/application/surfacing.java
+++ b/forester/java/src/org/forester/application/surfacing.java
@@ -1775,32 +1775,27 @@ public class surfacing {
          ForesterUtil
                  .programMessage( PRG_NAME,
                                   "Wrote domain promiscuities to: " + per_genome_domain_promiscuity_statistics_file );
-        //
-        if ( true ) { //TODO
-            try {
-                MinimalDomainomeCalculator.calcOme( false,
-                                                    intrees[ 0 ],
-                                                    protein_lists_per_species,
-                                                    "---",
-                                                    1000,
-                                                    out_dir.toString() + "/" + output_file );
-            }
-            catch ( IOException e ) {
-                ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
-            }
+        try {
+            MinimalDomainomeCalculator.calcOme( false,
+                                                intrees[ 0 ],
+                                                protein_lists_per_species,
+                                                "---",
+                                                -1,
+                                                out_dir.toString() + "/" + output_file );
          }
-        if ( true ) { //TODO
-            try {
-                MinimalDomainomeCalculator.calcOme( true,
-                                                    intrees[ 0 ],
-                                                    protein_lists_per_species,
-                                                    "---",
-                                                    1000,
-                                                    out_dir.toString() + "/" + output_file );
-            }
-            catch ( IOException e ) {
-                ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
-            }
+        catch ( IOException e ) {
+            ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
+        }
+        try {
+            MinimalDomainomeCalculator.calcOme( true,
+                                                intrees[ 0 ],
+                                                protein_lists_per_species,
+                                                "---",
+                                                -1,
+                                                out_dir.toString() + "/" + output_file );
+        }
+        catch ( IOException e ) {
+            ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() );
          }
          if ( da_analysis ) {
              SurfacingUtil.performDomainArchitectureAnalysis( distinct_domain_architecutures_per_genome,
diff --git a/forester/java/src/org/forester/protein/BasicProtein.java b/forester/java/src/org/forester/protein/BasicProtein.java

index 975e48d..91f5eb4 100644 (file)
--- a/forester/java/src/org/forester/protein/BasicProtein.java
+++ b/forester/java/src/org/forester/protein/BasicProtein.java
@@ -233,6 +233,8 @@ public class BasicProtein implements Protein {
          return sb.toString();
      }
      
+    
+    
      public final String toDomainArchitectureString( final String separator ) {
          return toDomainArchitectureString( separator, -1 );
      }
diff --git a/forester/java/src/org/forester/surfacing/MinimalDomainomeCalculator.java b/forester/java/src/org/forester/surfacing/MinimalDomainomeCalculator.java

index ba667ce..9abb02a 100644 (file)
--- a/forester/java/src/org/forester/surfacing/MinimalDomainomeCalculator.java
+++ b/forester/java/src/org/forester/surfacing/MinimalDomainomeCalculator.java
@@ -5,11 +5,11 @@ import java.io.BufferedWriter;
  import java.io.File;
  import java.io.FileWriter;
  import java.io.IOException;
+import java.io.Writer;
  import java.util.ArrayList;
-import java.util.HashMap;
+import java.util.Arrays;
  import java.util.HashSet;
  import java.util.List;
-import java.util.Map;
  import java.util.Map.Entry;
  import java.util.Set;
  import java.util.SortedMap;
@@ -25,60 +25,11 @@ import org.forester.protein.Domain;
  import org.forester.protein.Protein;
  import org.forester.species.BasicSpecies;
  import org.forester.species.Species;
+import org.forester.surfacing.SurfacingUtil.DomainComparator;
  import org.forester.util.ForesterUtil;
  
  public final class MinimalDomainomeCalculator {
  
-    static final public void calcDomainome( final Phylogeny tre,
-                                            final SortedMap<Species, List<Protein>> protein_lists_per_species,
-                                            final double ie_cutoff ) {
-        if ( protein_lists_per_species == null || tre == null ) {
-            throw new IllegalArgumentException( "argument is null" );
-        }
-        if ( protein_lists_per_species.size() < 2 ) {
-            throw new IllegalArgumentException( "not enough genomes" );
-        }
-        for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
-            final PhylogenyNode node = iter.next();
-            if ( node.isInternal() ) {
-                System.out.println();
-                if ( node.getNodeData().isHasTaxonomy() ) {
-                    System.out.println( node.getNodeData().getTaxonomy().getScientificName() + ":" );
-                }
-                else {
-                    System.out.println( node.getName() + ":" );
-                }
-                final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
-                final List<Set<String>> domains_per_genome_list = new ArrayList<Set<String>>();
-                for( final PhylogenyNode external_desc : external_descs ) {
-                    final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
-                    System.out.print( code + " " );
-                    final List<Protein> proteins_per_species = protein_lists_per_species
-                            .get( new BasicSpecies( code ) );
-                    if ( proteins_per_species != null ) {
-                        final SortedSet<String> domains_per_genome = new TreeSet<String>();
-                        for( final Protein protein : proteins_per_species ) {
-                            List<Domain> domains = protein.getProteinDomains();
-                            for( final Domain domain : domains ) {
-                                if ( ( domain.getPerDomainEvalue() <= ie_cutoff ) || ( ie_cutoff <= -1 ) ) {
-                                    domains_per_genome.add( domain.getDomainId() );
-                                }
-                            }
-                        }
-                        if ( domains_per_genome.size() > 0 ) {
-                            domains_per_genome_list.add( domains_per_genome );
-                        }
-                    }
-                }
-                System.out.println();
-                if ( domains_per_genome_list.size() > 0 ) {
-                    Set<String> intersection = calcIntersection( domains_per_genome_list );
-                    System.out.println( intersection );
-                }
-            }
-        }
-    }
-
      static final public void calcOme( final boolean use_domain_architectures,
                                        final Phylogeny tre,
                                        final SortedMap<Species, List<Protein>> protein_lists_per_species,
@@ -86,7 +37,7 @@ public final class MinimalDomainomeCalculator {
                                        final double ie_cutoff,
                                        final String outfile_base )
              throws IOException {
-        final SortedMap<String, SortedSet<String>> species_to_das_map = new TreeMap<String, SortedSet<String>>();
+        final SortedMap<String, SortedSet<String>> species_to_features_map = new TreeMap<String, SortedSet<String>>();
          if ( protein_lists_per_species == null || tre == null ) {
              throw new IllegalArgumentException( "argument is null" );
          }
@@ -106,7 +57,7 @@ public final class MinimalDomainomeCalculator {
          SurfacingUtil.checkForOutputFileWriteability( outfile_table );
          final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
          final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) );
-        out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#DA\tDA" );
+        out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#" + x + "\t" + x + "" );
          out.write( ForesterUtil.LINE_SEPARATOR );
          for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
              final PhylogenyNode node = iter.next();
@@ -143,7 +94,7 @@ public final class MinimalDomainomeCalculator {
              else {
                  out.write( "\t\t" );
              }
-            final List<Set<String>> das_per_genome_list = new ArrayList<Set<String>>();
+            final List<Set<String>> features_per_genome_list = new ArrayList<Set<String>>();
              boolean first = true;
              for( final PhylogenyNode external_desc : external_descs ) {
                  final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
@@ -158,28 +109,28 @@ public final class MinimalDomainomeCalculator {
                  }
                  final List<Protein> proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) );
                  if ( proteins_per_species != null ) {
-                    final SortedSet<String> das_per_genome = new TreeSet<String>();
+                    final SortedSet<String> features_per_genome = new TreeSet<String>();
                      for( final Protein protein : proteins_per_species ) {
                          if ( use_domain_architectures ) {
                              final String da = protein.toDomainArchitectureString( separator, ie_cutoff );
-                            das_per_genome.add( da );
+                            features_per_genome.add( da );
                          }
                          else {
                              List<Domain> domains = protein.getProteinDomains();
                              for( final Domain domain : domains ) {
                                  if ( ( ie_cutoff <= -1 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
-                                    das_per_genome.add( domain.getDomainId() );
+                                    features_per_genome.add( domain.getDomainId() );
                                  }
                              }
                          }
                      }
-                    if ( das_per_genome.size() > 0 ) {
-                        das_per_genome_list.add( das_per_genome );
+                    if ( features_per_genome.size() > 0 ) {
+                        features_per_genome_list.add( features_per_genome );
                      }
                  }
              }
-            if ( das_per_genome_list.size() > 0 ) {
-                SortedSet<String> intersection = calcIntersection( das_per_genome_list );
+            if ( features_per_genome_list.size() > 0 ) {
+                SortedSet<String> intersection = calcIntersection( features_per_genome_list );
                  out.write( "\t" + intersection.size() + "\t" );
                  first = true;
                  for( final String s : intersection ) {
@@ -192,15 +143,15 @@ public final class MinimalDomainomeCalculator {
                      out.write( s );
                  }
                  out.write( ForesterUtil.LINE_SEPARATOR );
-                species_to_das_map.put( species_name, intersection );
+                species_to_features_map.put( species_name, intersection );
              }
          }
          final SortedSet<String> all_species_names = new TreeSet<String>();
-        final SortedSet<String> all_das = new TreeSet<String>();
-        for( final Entry<String, SortedSet<String>> e : species_to_das_map.entrySet() ) {
+        final SortedSet<String> all_features = new TreeSet<String>();
+        for( final Entry<String, SortedSet<String>> e : species_to_features_map.entrySet() ) {
              all_species_names.add( e.getKey() );
-            for( final String das : e.getValue() ) {
-                all_das.add( das );
+            for( final String f : e.getValue() ) {
+                all_features.add( f );
              }
          }
          out_table.write( '\t' );
@@ -215,7 +166,7 @@ public final class MinimalDomainomeCalculator {
              out_table.write( species_name );
          }
          out_table.write( ForesterUtil.LINE_SEPARATOR );
-        for( final String das : all_das ) {
+        for( final String das : all_features ) {
              out_table.write( das );
              out_table.write( '\t' );
              first = true;
@@ -226,7 +177,7 @@ public final class MinimalDomainomeCalculator {
                  else {
                      out_table.write( '\t' );
                  }
-                if ( species_to_das_map.get( species_name ).contains( das ) ) {
+                if ( species_to_features_map.get( species_name ).contains( das ) ) {
                      out_table.write( '1' );
                  }
                  else {
@@ -241,152 +192,26 @@ public final class MinimalDomainomeCalculator {
          out_table.close();
          ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to           : " + outfile );
          ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table );
-    }
-
-    static final public void calcDAome( final Phylogeny tre,
-                                        final SortedMap<Species, List<Protein>> protein_lists_per_species,
-                                        final String separator,
-                                        final double ie_cutoff,
-                                        final String outfile_base )
-            throws IOException {
-        final SortedMap<String, SortedSet<String>> species_to_das_map = new TreeMap<String, SortedSet<String>>();
-        if ( protein_lists_per_species == null || tre == null ) {
-            throw new IllegalArgumentException( "argument is null" );
-        }
-        if ( protein_lists_per_species.size() < 2 ) {
-            throw new IllegalArgumentException( "not enough genomes" );
-        }
-        final File outfile = new File( outfile_base + "_minimal_daome.txt" );
-        final File outfile_table = new File( outfile_base + "_minimal_daome.tsv" );
-        SurfacingUtil.checkForOutputFileWriteability( outfile );
-        SurfacingUtil.checkForOutputFileWriteability( outfile_table );
-        final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
-        final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) );
-        out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#DA\tDA" );
-        out.write( ForesterUtil.LINE_SEPARATOR );
-        for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
-            final PhylogenyNode node = iter.next();
-            final String species_name = node.getNodeData().isHasTaxonomy()
-                    ? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
-            final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName()
-                    : "";
-            final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode()
-                    : "";
-            final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : "";
-            out.write( species_name );
-            if ( !ForesterUtil.isEmpty( common ) ) {
-                out.write( "\t" + common );
-            }
-            else {
-                out.write( "\t" );
-            }
-            if ( !ForesterUtil.isEmpty( tcode ) ) {
-                out.write( "\t" + tcode );
-            }
-            else {
-                out.write( "\t" );
-            }
-            if ( !ForesterUtil.isEmpty( rank ) ) {
-                out.write( "\t" + rank );
-            }
-            else {
-                out.write( "\t" );
-            }
-            final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
-            if ( node.isInternal() ) {
-                out.write( "\t" + external_descs.size() + "\t" );
-            }
-            else {
-                out.write( "\t\t" );
-            }
-            final List<Set<String>> das_per_genome_list = new ArrayList<Set<String>>();
-            boolean first = true;
-            for( final PhylogenyNode external_desc : external_descs ) {
-                final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
-                if ( node.isInternal() ) {
-                    if ( first ) {
-                        first = false;
-                    }
-                    else {
-                        out.write( ", " );
-                    }
-                    out.write( code );
-                }
-                final List<Protein> proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) );
-                if ( proteins_per_species != null ) {
-                    final SortedSet<String> das_per_genome = new TreeSet<String>();
-                    for( final Protein protein : proteins_per_species ) {
-                        final String da = protein.toDomainArchitectureString( separator, ie_cutoff );
-                        das_per_genome.add( da );
-                    }
-                    if ( das_per_genome.size() > 0 ) {
-                        das_per_genome_list.add( das_per_genome );
-                    }
-                }
-            }
-            if ( das_per_genome_list.size() > 0 ) {
-                SortedSet<String> intersection = calcIntersection( das_per_genome_list );
-                out.write( "\t" + intersection.size() + "\t" );
-                first = true;
-                for( final String s : intersection ) {
-                    if ( first ) {
-                        first = false;
-                    }
-                    else {
-                        out.write( ", " );
-                    }
-                    out.write( s );
-                }
-                out.write( ForesterUtil.LINE_SEPARATOR );
-                species_to_das_map.put( species_name, intersection );
-            }
-        }
-        final SortedSet<String> all_species_names = new TreeSet<String>();
-        final SortedSet<String> all_das = new TreeSet<String>();
-        for( final Entry<String, SortedSet<String>> e : species_to_das_map.entrySet() ) {
-            all_species_names.add( e.getKey() );
-            for( final String das : e.getValue() ) {
-                all_das.add( das );
-            }
-        }
-        out_table.write( '\t' );
-        boolean first = true;
-        for( final String species_name : all_species_names ) {
-            if ( first ) {
-                first = false;
+        for( String f : all_features ) {
+            final String a;
+            if ( use_domain_architectures ) {
+                a = "DA_";
              }
              else {
-                out_table.write( '\t' );
-            }
-            out_table.write( species_name );
-        }
-        out_table.write( ForesterUtil.LINE_SEPARATOR );
-        for( final String das : all_das ) {
-            out_table.write( das );
-            out_table.write( '\t' );
-            first = true;
-            for( final String species_name : all_species_names ) {
-                if ( first ) {
-                    first = false;
-                }
-                else {
-                    out_table.write( '\t' );
-                }
-                if ( species_to_das_map.get( species_name ).contains( das ) ) {
-                    out_table.write( '1' );
-                }
-                else {
-                    out_table.write( '0' );
-                }
+                a = "domain_";
              }
-            out_table.write( ForesterUtil.LINE_SEPARATOR );
+            final File prot_dir = new File( outfile_base + "_prot" );
+            prot_dir.mkdir();
+            final File outt = new File( outfile_base + "_prot/" + a + f + surfacing.SEQ_EXTRACT_SUFFIX );
+            final Writer proteins_file_writer = new BufferedWriter( new FileWriter( outt ) );
+            extractProteinFeatures( use_domain_architectures,
+                                    protein_lists_per_species,
+                                    f,
+                                    proteins_file_writer,
+                                    ie_cutoff,
+                                    separator );
+            proteins_file_writer.close();
          }
-        out.flush();
-        out.close();
-        out_table.flush();
-        out_table.close();
-        ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to           : " + outfile );
-        ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table );
      }
  
      private final static SortedSet<String> calcIntersection( final List<Set<String>> features_per_genome_list ) {
@@ -401,6 +226,102 @@ public final class MinimalDomainomeCalculator {
          return my_first;
      }
  
+    public static void extractProteinFeatures( final boolean use_domain_architectures,
+                                               final SortedMap<Species, List<Protein>> protein_lists_per_species,
+                                               final String domain_id,
+                                               final Writer out,
+                                               final double ie_cutoff,
+                                               final String domain_separator )
+            throws IOException {
+        final String separator_for_output = "\t";
+        for( final Species species : protein_lists_per_species.keySet() ) {
+            final List<Protein> proteins_per_species = protein_lists_per_species.get( species );
+            for( final Protein protein : proteins_per_species ) {
+                if ( use_domain_architectures ) {
+                    if ( domain_id.equals( protein.toDomainArchitectureString( domain_separator, ie_cutoff ) ) ) {
+                        int from = Integer.MAX_VALUE;
+                        int to = -1;
+                        for( final Domain d : protein.getProteinDomains() ) {
+                            if ( ( ie_cutoff <= -1 ) || ( d.getPerDomainEvalue() <= ie_cutoff ) ) {
+                                if ( d.getFrom() < from ) {
+                                    from = d.getFrom();
+                                }
+                                if ( d.getTo() > to ) {
+                                    to = d.getTo();
+                                }
+                            }
+                        }
+                        out.write( protein.getSpecies().getSpeciesId() );
+                        out.write( separator_for_output );
+                        out.write( protein.getProteinId().getId() );
+                        out.write( separator_for_output );
+                        out.write( domain_id );
+                        out.write( separator_for_output );
+                        out.write( "/" );
+                        out.write( from + "-" + to );
+                        out.write( "/" );
+                        out.write( SurfacingConstants.NL );
+                    }
+                }
+                else {
+                    final List<Domain> domains = protein.getProteinDomains( domain_id );
+                    if ( domains.size() > 0 ) {
+                        out.write( protein.getSpecies().getSpeciesId() );
+                        out.write( separator_for_output );
+                        out.write( protein.getProteinId().getId() );
+                        out.write( separator_for_output );
+                        out.write( domain_id );
+                        out.write( separator_for_output );
+                        for( final Domain domain : domains ) {
+                            if ( ( ie_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
+                                out.write( "/" );
+                                out.write( domain.getFrom() + "-" + domain.getTo() );
+                            }
+                        }
+                        out.write( "/" );
+                        out.write( separator_for_output );
+                        final List<Domain> domain_list = new ArrayList<Domain>();
+                        for( final Domain domain : protein.getProteinDomains() ) {
+                            if ( ( ie_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
+                                domain_list.add( domain );
+                            }
+                        }
+                        final Domain domain_ary[] = new Domain[ domain_list.size() ];
+                        for( int i = 0; i < domain_list.size(); ++i ) {
+                            domain_ary[ i ] = domain_list.get( i );
+                        }
+                        Arrays.sort( domain_ary, new DomainComparator( true ) );
+                        out.write( "{" );
+                        boolean first = true;
+                        for( final Domain domain : domain_ary ) {
+                            if ( first ) {
+                                first = false;
+                            }
+                            else {
+                                out.write( "," );
+                            }
+                            out.write( domain.getDomainId().toString() );
+                            out.write( ":" + domain.getFrom() + "-" + domain.getTo() );
+                            out.write( ":" + domain.getPerDomainEvalue() );
+                        }
+                        out.write( "}" );
+                        if ( !( ForesterUtil.isEmpty( protein.getDescription() )
+                                || protein.getDescription().equals( SurfacingConstants.NONE ) ) ) {
+                            out.write( protein.getDescription() );
+                        }
+                        out.write( separator_for_output );
+                        if ( !( ForesterUtil.isEmpty( protein.getAccession() )
+                                || protein.getAccession().equals( SurfacingConstants.NONE ) ) ) {
+                            out.write( protein.getAccession() );
+                        }
+                        out.write( SurfacingConstants.NL );
+                    }
+                }
+            }
+        }
+        out.flush();
+    }
+
      public static void main( final String[] args ) {
          Set<String> a = new HashSet<String>();
          Set<String> b = new HashSet<String>();
author	cmzmasek <chris.zma@outlook.com>
	Mon, 27 Mar 2017 05:01:52 +0000 (22:01 -0700)
committer	cmzmasek <chris.zma@outlook.com>
	Mon, 27 Mar 2017 05:01:52 +0000 (22:01 -0700)
forester/java/src/org/forester/application/surfacing.java		patch \| blob \| history
forester/java/src/org/forester/protein/BasicProtein.java		patch \| blob \| history
forester/java/src/org/forester/surfacing/MinimalDomainomeCalculator.java		patch \| blob \| history