in progress...
authorcmzmasek <chris.zma@outlook.com>
Tue, 16 May 2017 23:34:06 +0000 (16:34 -0700)
committercmzmasek <chris.zma@outlook.com>
Tue, 16 May 2017 23:34:06 +0000 (16:34 -0700)
forester/java/src/org/forester/application/surfacing.java
forester/java/src/org/forester/surfacing/MinimalDomainomeCalculator.java

index b9d544c..9326929 100644 (file)
@@ -217,8 +217,8 @@ public class surfacing {
     final static private String                                     INPUT_GENOMES_FILE_OPTION                                                     = "genomes";
     final static private String                                     INPUT_SPECIES_TREE_OPTION                                                     = "species_tree";
     final static private String                                     SEQ_EXTRACT_OPTION                                                            = "prot_extract";
-    final static private String                                     PRG_VERSION                                                                   = "2.501";
-    final static private String                                     PRG_DATE                                                                      = "170327";
+    final static private String                                     PRG_VERSION                                                                   = "2.502";
+    final static private String                                     PRG_DATE                                                                      = "170511";
     final static private String                                     E_MAIL                                                                        = "phyloxml@gmail.com";
     final static private String                                     WWW                                                                           = "https://sites.google.com/site/cmzmasek/home/software/forester/surfacing";
     final static private boolean                                    IGNORE_DUFS_DEFAULT                                                           = true;
@@ -1776,9 +1776,11 @@ public class surfacing {
         ForesterUtil
                 .programMessage( PRG_NAME,
                                  "Wrote domain promiscuities to: " + per_genome_domain_promiscuity_statistics_file );
+        final int LEVEL = 2;
         try {
             MinimalDomainomeCalculator.calc( false,
                                              intrees[ 0 ],
+                                             LEVEL,
                                              protein_lists_per_species,
                                              SEPARATOR_FOR_DA,
                                              -1,
@@ -1791,6 +1793,7 @@ public class surfacing {
         try {
             MinimalDomainomeCalculator.calc( true,
                                              intrees[ 0 ],
+                                             LEVEL,
                                              protein_lists_per_species,
                                              SEPARATOR_FOR_DA,
                                              -1,
index 1fa7929..d48cc2b 100644 (file)
@@ -33,6 +33,7 @@ public final class MinimalDomainomeCalculator {
 
     public final static void calc( final boolean use_domain_architectures,
                                    final Phylogeny tre,
+                                   final int target_level,
                                    final SortedMap<Species, List<Protein>> protein_lists_per_species,
                                    final String separator,
                                    final double ie_cutoff,
@@ -61,8 +62,20 @@ public final class MinimalDomainomeCalculator {
         final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) );
         out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#" + x + "\t" + x + "" );
         out.write( ForesterUtil.LINE_SEPARATOR );
+        ///////////
+        //////////
+        SortedMap<String, List<Protein>> protein_lists_per_quasi_species = null;
+        if ( target_level >= 1 ) {
+            protein_lists_per_quasi_species = makeProteinListsPerQuasiSpecies( tre,
+                                                                               target_level,
+                                                                               protein_lists_per_species );
+           
+        }
+        /////////
+        ///////////
         for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
             final PhylogenyNode node = iter.next();
+            final int node_level = PhylogenyMethods.calculateLevel( node );
             final String species_name = node.getNodeData().isHasTaxonomy()
                     ? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
             final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName()
@@ -70,259 +83,85 @@ public final class MinimalDomainomeCalculator {
             final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode()
                     : "";
             final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : "";
-            out.write( species_name );
-            if ( !ForesterUtil.isEmpty( common ) ) {
-                out.write( "\t" + common );
-            }
-            else {
-                out.write( "\t" );
-            }
-            if ( !ForesterUtil.isEmpty( tcode ) ) {
-                out.write( "\t" + tcode );
-            }
-            else {
-                out.write( "\t" );
-            }
-            if ( !ForesterUtil.isEmpty( rank ) ) {
-                out.write( "\t" + rank );
-            }
-            else {
-                out.write( "\t" );
-            }
             final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
-            if ( node.isInternal() ) {
-                out.write( "\t" + external_descs.size() + "\t" );
-            }
-            else {
-                out.write( "\t\t" );
-            }
-            final List<Set<String>> features_per_genome_list = new ArrayList<Set<String>>();
-            boolean first = true;
-            for( final PhylogenyNode external_desc : external_descs ) {
-                final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
-                if ( node.isInternal() ) {
-                    if ( first ) {
-                        first = false;
-                    }
-                    else {
-                        out.write( ", " );
-                    }
-                    out.write( code );
+            if ( ( target_level < 1 ) || ( node_level >= target_level ) ) {
+                out.write( species_name + " " + node_level);
+                if ( !ForesterUtil.isEmpty( common ) ) {
+                    out.write( "\t" + common );
                 }
-                final List<Protein> proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) );
-                if ( proteins_per_species != null ) {
-                    final SortedSet<String> features_per_genome = new TreeSet<String>();
-                    for( final Protein protein : proteins_per_species ) {
-                        if ( use_domain_architectures ) {
-                            final String da = protein.toDomainArchitectureString( separator, ie_cutoff );
-                            features_per_genome.add( da );
-                        }
-                        else {
-                            List<Domain> domains = protein.getProteinDomains();
-                            for( final Domain domain : domains ) {
-                                if ( ( ie_cutoff <= -1 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
-                                    features_per_genome.add( domain.getDomainId() );
-                                }
-                            }
-                        }
-                    }
-                    if ( features_per_genome.size() > 0 ) {
-                        features_per_genome_list.add( features_per_genome );
-                    }
+                else {
+                    out.write( "\t" );
                 }
-            }
-            if ( features_per_genome_list.size() > 0 ) {
-                SortedSet<String> intersection = calcIntersection( features_per_genome_list );
-                out.write( "\t" + intersection.size() + "\t" );
-                first = true;
-                for( final String s : intersection ) {
-                    if ( first ) {
-                        first = false;
-                    }
-                    else {
-                        out.write( ", " );
-                    }
-                    out.write( s );
-                }
-                out.write( ForesterUtil.LINE_SEPARATOR );
-                species_to_features_map.put( species_name, intersection );
-            }
-        }
-        final SortedSet<String> all_species_names = new TreeSet<String>();
-        final SortedSet<String> all_features = new TreeSet<String>();
-        for( final Entry<String, SortedSet<String>> e : species_to_features_map.entrySet() ) {
-            all_species_names.add( e.getKey() );
-            for( final String f : e.getValue() ) {
-                all_features.add( f );
-            }
-        }
-        out_table.write( '\t' );
-        boolean first = true;
-        for( final String species_name : all_species_names ) {
-            if ( first ) {
-                first = false;
-            }
-            else {
-                out_table.write( '\t' );
-            }
-            out_table.write( species_name );
-        }
-        out_table.write( ForesterUtil.LINE_SEPARATOR );
-        for( final String das : all_features ) {
-            out_table.write( das );
-            out_table.write( '\t' );
-            first = true;
-            for( final String species_name : all_species_names ) {
-                if ( first ) {
-                    first = false;
+                if ( !ForesterUtil.isEmpty( tcode ) ) {
+                    out.write( "\t" + tcode );
                 }
                 else {
-                    out_table.write( '\t' );
+                    out.write( "\t" );
                 }
-                if ( species_to_features_map.get( species_name ).contains( das ) ) {
-                    out_table.write( '1' );
+                if ( !ForesterUtil.isEmpty( rank ) ) {
+                    out.write( "\t" + rank );
                 }
                 else {
-                    out_table.write( '0' );
+                    out.write( "\t" );
                 }
-            }
-            out_table.write( ForesterUtil.LINE_SEPARATOR );
-        }
-        out.flush();
-        out.close();
-        out_table.flush();
-        out_table.close();
-        ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to           : " + outfile );
-        ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table );
-        if ( write_protein_files ) {
-            final String protdirname;
-            final String a;
-            final String b;
-            if ( use_domain_architectures ) {
-                a = "_DA";
-                b = "domain architectures (DAs)";
-                protdirname = "_DAS";
-            }
-            else {
-                a = "_domain";
-                b = "domains";
-                protdirname = "_DOMAINS";
-            }
-            final File prot_dir = new File( outfile_base + protdirname );
-            final boolean success = prot_dir.mkdir();
-            if ( !success ) {
-                throw new IOException( "failed to create dir " + prot_dir );
-            }
-            int total = 0;
-            final String dir = outfile_base + protdirname + "/";
-            for( final String feat : all_features ) {
-                final File extract_outfile = new File( dir + feat + a + surfacing.SEQ_EXTRACT_SUFFIX );
-                SurfacingUtil.checkForOutputFileWriteability( extract_outfile );
-                final Writer proteins_file_writer = new BufferedWriter( new FileWriter( extract_outfile ) );
-                final int counter = extractProteinFeatures( use_domain_architectures,
-                                                            protein_lists_per_species,
-                                                            feat,
-                                                            proteins_file_writer,
-                                                            ie_cutoff,
-                                                            separator );
-                if ( counter < 1 ) {
-                    ForesterUtil.printWarningMessage( "surfacing", feat + " not present (in " + b + " extraction)" );
+                if ( node.isInternal() ) {
+                    out.write( "\t" + external_descs.size() + "\t" );
+                }
+                else {
+                    out.write( "\t\t" );
                 }
-                total += counter;
-                proteins_file_writer.close();
-            }
-            ForesterUtil.programMessage( "surfacing",
-                                         "Wrote " + total + " individual " + b + " from a total of "
-                                                 + all_features.size() + " into: " + dir );
-        }
-    }
-
-    public final static void calcNEW( final boolean use_domain_architectures,
-                                      final Phylogeny tre,
-                                      final int level,
-                                      final SortedMap<Species, List<Protein>> protein_lists_per_species,
-                                      final String separator,
-                                      final double ie_cutoff,
-                                      final String outfile_base,
-                                      final boolean write_protein_files )
-            throws IOException {
-        final SortedMap<String, SortedSet<String>> species_to_features_map = new TreeMap<String, SortedSet<String>>();
-        if ( protein_lists_per_species == null || tre == null ) {
-            throw new IllegalArgumentException( "argument is null" );
-        }
-        if ( protein_lists_per_species.size() < 2 ) {
-            throw new IllegalArgumentException( "not enough genomes" );
-        }
-        final String x;
-        if ( use_domain_architectures ) {
-            x = "DA";
-        }
-        else {
-            x = "domain";
-        }
-        final File outfile = new File( outfile_base + "_minimal_" + x + "ome.tsv" );
-        final File outfile_table = new File( outfile_base + "_minimal_" + x + "ome_matrix.tsv" );
-        SurfacingUtil.checkForOutputFileWriteability( outfile );
-        SurfacingUtil.checkForOutputFileWriteability( outfile_table );
-        final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
-        final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) );
-        out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#" + x + "\t" + x + "" );
-        out.write( ForesterUtil.LINE_SEPARATOR );
-        ///////////
-        //////////
-        SortedMap<String, List<Protein>> protein_lists_per_quasi_species = null;
-        if ( level >= 1 ) {
-            protein_lists_per_quasi_species = makeProteinListsPerQuasiSpecies( tre, level, protein_lists_per_species );
-        }
-        /////////
-        ///////////
-        for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
-            final PhylogenyNode node = iter.next();
-            final String species_name = node.getNodeData().isHasTaxonomy()
-                    ? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
-            final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName()
-                    : "";
-            final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode()
-                    : "";
-            final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : "";
-            out.write( species_name );
-            if ( !ForesterUtil.isEmpty( common ) ) {
-                out.write( "\t" + common );
-            }
-            else {
-                out.write( "\t" );
-            }
-            if ( !ForesterUtil.isEmpty( tcode ) ) {
-                out.write( "\t" + tcode );
-            }
-            else {
-                out.write( "\t" );
-            }
-            if ( !ForesterUtil.isEmpty( rank ) ) {
-                out.write( "\t" + rank );
-            }
-            else {
-                out.write( "\t" );
-            }
-            final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
-            if ( node.isInternal() ) {
-                out.write( "\t" + external_descs.size() + "\t" );
-            }
-            else {
-                out.write( "\t\t" );
             }
             final List<Set<String>> features_per_genome_list = new ArrayList<Set<String>>();
             boolean first = true;
-            if ( level >= 1 ) {
+            if ( target_level >= 1 ) {
                 ////////////
                 ////////////
-                final int node_level = PhylogenyMethods.calculateLevel( node );
-                if ( node_level >= level ) {
+                if ( node_level >= target_level ) {
                     final List<PhylogenyNode> given_level_descs = PhylogenyMethods
-                            .getAllDescendantsOfGivenLevel( node, level );
+                            .getAllDescendantsOfGivenLevel( node, target_level );
                     for( final PhylogenyNode given_level_desc : given_level_descs ) {
                         final String spec_name = given_level_desc.getNodeData().isHasTaxonomy()
-                                ? given_level_desc.getNodeData().getTaxonomy().getScientificName() : given_level_desc.getName();
+                                ? given_level_desc.getNodeData().getTaxonomy().getScientificName()
+                                : given_level_desc.getName();
+                        if ( node.isInternal() ) {
+                            if ( first ) {
+                                first = false;
+                            }
+                            else {
+                                out.write( ", " );
+                            }
+                            out.write( "sp_n=" + spec_name );
+                        }
+                        final List<Protein> proteins_per_species = protein_lists_per_quasi_species.get( spec_name );
+                        if ( proteins_per_species != null ) {
+                            final SortedSet<String> features_per_genome = new TreeSet<String>();
+                            for( final Protein protein : proteins_per_species ) {
+                                if ( use_domain_architectures ) {
+                                    final String da = protein.toDomainArchitectureString( separator, ie_cutoff );
+                                    features_per_genome.add( da );
+                                }
+                                else {
+                                    List<Domain> domains = protein.getProteinDomains();
+                                    for( final Domain domain : domains ) {
+                                        if ( ( ie_cutoff <= -1 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) {
+                                            features_per_genome.add( domain.getDomainId() );
+                                        }
+                                    }
+                                }
+                            }
+                            System.out.println( ">>>>>>>>>>>>>> features_per_genome.size()=" + features_per_genome.size() );
+                            if ( features_per_genome.size() > 0 ) {
+                                features_per_genome_list.add( features_per_genome );
+                            }
+                            else {
+                                System.out.println( "error!" );
+                                System.exit( -1 );
+                            }
+                        }
+                        else {
+                            System.out.println( "error!" );
+                            System.exit( -1 );
+                        }
                     }
                 }
                 ///////////
@@ -474,25 +313,32 @@ public final class MinimalDomainomeCalculator {
                                                                                            final int level,
                                                                                            final SortedMap<Species, List<Protein>> protein_lists_per_species ) {
         final SortedMap<String, List<Protein>> protein_lists_per_quasi_species = new TreeMap<String, List<Protein>>();
+        System.out.println( "---------------------------------" );
+        System.out.println( "level=" + level );
         for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) {
             final PhylogenyNode node = iter.next();
             final int node_level = PhylogenyMethods.calculateLevel( node );
             if ( node_level == level ) {
+                System.out.println( "level=" + level );
                 final List<PhylogenyNode> external_descs = node.getAllExternalDescendants();
                 final List<Protein> protein_list_per_quasi_species = new ArrayList<Protein>();
                 for( final PhylogenyNode external_desc : external_descs ) {
                     final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode();
                     final List<Protein> proteins_per_species = protein_lists_per_species
                             .get( new BasicSpecies( code ) );
+                    //System.out.println( code );
                     for( Protein protein : proteins_per_species ) {
                         protein_list_per_quasi_species.add( protein );
                     }
                 }
                 final String species_name = node.getNodeData().isHasTaxonomy()
                         ? node.getNodeData().getTaxonomy().getScientificName() : node.getName();
+                System.out.println( "species_name=" + species_name );
                 protein_lists_per_quasi_species.put( species_name, protein_list_per_quasi_species );
+                System.out.println( ">>>>" + protein_list_per_quasi_species.size() );
             }
         }
+      
         return protein_lists_per_quasi_species;
     }