From 63685dd0262a506f0bf17cba6be1aa1ada5330f3 Mon Sep 17 00:00:00 2001 From: cmzmasek Date: Tue, 16 May 2017 16:34:06 -0700 Subject: [PATCH] in progress... --- .../src/org/forester/application/surfacing.java | 7 +- .../surfacing/MinimalDomainomeCalculator.java | 316 +++++--------------- 2 files changed, 86 insertions(+), 237 deletions(-) diff --git a/forester/java/src/org/forester/application/surfacing.java b/forester/java/src/org/forester/application/surfacing.java index b9d544c..9326929 100644 --- a/forester/java/src/org/forester/application/surfacing.java +++ b/forester/java/src/org/forester/application/surfacing.java @@ -217,8 +217,8 @@ public class surfacing { final static private String INPUT_GENOMES_FILE_OPTION = "genomes"; final static private String INPUT_SPECIES_TREE_OPTION = "species_tree"; final static private String SEQ_EXTRACT_OPTION = "prot_extract"; - final static private String PRG_VERSION = "2.501"; - final static private String PRG_DATE = "170327"; + final static private String PRG_VERSION = "2.502"; + final static private String PRG_DATE = "170511"; final static private String E_MAIL = "phyloxml@gmail.com"; final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester/surfacing"; final static private boolean IGNORE_DUFS_DEFAULT = true; @@ -1776,9 +1776,11 @@ public class surfacing { ForesterUtil .programMessage( PRG_NAME, "Wrote domain promiscuities to: " + per_genome_domain_promiscuity_statistics_file ); + final int LEVEL = 2; try { MinimalDomainomeCalculator.calc( false, intrees[ 0 ], + LEVEL, protein_lists_per_species, SEPARATOR_FOR_DA, -1, @@ -1791,6 +1793,7 @@ public class surfacing { try { MinimalDomainomeCalculator.calc( true, intrees[ 0 ], + LEVEL, protein_lists_per_species, SEPARATOR_FOR_DA, -1, diff --git a/forester/java/src/org/forester/surfacing/MinimalDomainomeCalculator.java b/forester/java/src/org/forester/surfacing/MinimalDomainomeCalculator.java index 1fa7929..d48cc2b 100644 --- a/forester/java/src/org/forester/surfacing/MinimalDomainomeCalculator.java +++ b/forester/java/src/org/forester/surfacing/MinimalDomainomeCalculator.java @@ -33,6 +33,7 @@ public final class MinimalDomainomeCalculator { public final static void calc( final boolean use_domain_architectures, final Phylogeny tre, + final int target_level, final SortedMap> protein_lists_per_species, final String separator, final double ie_cutoff, @@ -61,8 +62,20 @@ public final class MinimalDomainomeCalculator { final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) ); out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#" + x + "\t" + x + "" ); out.write( ForesterUtil.LINE_SEPARATOR ); + /////////// + ////////// + SortedMap> protein_lists_per_quasi_species = null; + if ( target_level >= 1 ) { + protein_lists_per_quasi_species = makeProteinListsPerQuasiSpecies( tre, + target_level, + protein_lists_per_species ); + + } + ///////// + /////////// for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) { final PhylogenyNode node = iter.next(); + final int node_level = PhylogenyMethods.calculateLevel( node ); final String species_name = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getScientificName() : node.getName(); final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName() @@ -70,259 +83,85 @@ public final class MinimalDomainomeCalculator { final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode() : ""; final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : ""; - out.write( species_name ); - if ( !ForesterUtil.isEmpty( common ) ) { - out.write( "\t" + common ); - } - else { - out.write( "\t" ); - } - if ( !ForesterUtil.isEmpty( tcode ) ) { - out.write( "\t" + tcode ); - } - else { - out.write( "\t" ); - } - if ( !ForesterUtil.isEmpty( rank ) ) { - out.write( "\t" + rank ); - } - else { - out.write( "\t" ); - } final List external_descs = node.getAllExternalDescendants(); - if ( node.isInternal() ) { - out.write( "\t" + external_descs.size() + "\t" ); - } - else { - out.write( "\t\t" ); - } - final List> features_per_genome_list = new ArrayList>(); - boolean first = true; - for( final PhylogenyNode external_desc : external_descs ) { - final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode(); - if ( node.isInternal() ) { - if ( first ) { - first = false; - } - else { - out.write( ", " ); - } - out.write( code ); + if ( ( target_level < 1 ) || ( node_level >= target_level ) ) { + out.write( species_name + " " + node_level); + if ( !ForesterUtil.isEmpty( common ) ) { + out.write( "\t" + common ); } - final List proteins_per_species = protein_lists_per_species.get( new BasicSpecies( code ) ); - if ( proteins_per_species != null ) { - final SortedSet features_per_genome = new TreeSet(); - for( final Protein protein : proteins_per_species ) { - if ( use_domain_architectures ) { - final String da = protein.toDomainArchitectureString( separator, ie_cutoff ); - features_per_genome.add( da ); - } - else { - List domains = protein.getProteinDomains(); - for( final Domain domain : domains ) { - if ( ( ie_cutoff <= -1 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) { - features_per_genome.add( domain.getDomainId() ); - } - } - } - } - if ( features_per_genome.size() > 0 ) { - features_per_genome_list.add( features_per_genome ); - } + else { + out.write( "\t" ); } - } - if ( features_per_genome_list.size() > 0 ) { - SortedSet intersection = calcIntersection( features_per_genome_list ); - out.write( "\t" + intersection.size() + "\t" ); - first = true; - for( final String s : intersection ) { - if ( first ) { - first = false; - } - else { - out.write( ", " ); - } - out.write( s ); - } - out.write( ForesterUtil.LINE_SEPARATOR ); - species_to_features_map.put( species_name, intersection ); - } - } - final SortedSet all_species_names = new TreeSet(); - final SortedSet all_features = new TreeSet(); - for( final Entry> e : species_to_features_map.entrySet() ) { - all_species_names.add( e.getKey() ); - for( final String f : e.getValue() ) { - all_features.add( f ); - } - } - out_table.write( '\t' ); - boolean first = true; - for( final String species_name : all_species_names ) { - if ( first ) { - first = false; - } - else { - out_table.write( '\t' ); - } - out_table.write( species_name ); - } - out_table.write( ForesterUtil.LINE_SEPARATOR ); - for( final String das : all_features ) { - out_table.write( das ); - out_table.write( '\t' ); - first = true; - for( final String species_name : all_species_names ) { - if ( first ) { - first = false; + if ( !ForesterUtil.isEmpty( tcode ) ) { + out.write( "\t" + tcode ); } else { - out_table.write( '\t' ); + out.write( "\t" ); } - if ( species_to_features_map.get( species_name ).contains( das ) ) { - out_table.write( '1' ); + if ( !ForesterUtil.isEmpty( rank ) ) { + out.write( "\t" + rank ); } else { - out_table.write( '0' ); + out.write( "\t" ); } - } - out_table.write( ForesterUtil.LINE_SEPARATOR ); - } - out.flush(); - out.close(); - out_table.flush(); - out_table.close(); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to : " + outfile ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote minimal DAome data to (as table): " + outfile_table ); - if ( write_protein_files ) { - final String protdirname; - final String a; - final String b; - if ( use_domain_architectures ) { - a = "_DA"; - b = "domain architectures (DAs)"; - protdirname = "_DAS"; - } - else { - a = "_domain"; - b = "domains"; - protdirname = "_DOMAINS"; - } - final File prot_dir = new File( outfile_base + protdirname ); - final boolean success = prot_dir.mkdir(); - if ( !success ) { - throw new IOException( "failed to create dir " + prot_dir ); - } - int total = 0; - final String dir = outfile_base + protdirname + "/"; - for( final String feat : all_features ) { - final File extract_outfile = new File( dir + feat + a + surfacing.SEQ_EXTRACT_SUFFIX ); - SurfacingUtil.checkForOutputFileWriteability( extract_outfile ); - final Writer proteins_file_writer = new BufferedWriter( new FileWriter( extract_outfile ) ); - final int counter = extractProteinFeatures( use_domain_architectures, - protein_lists_per_species, - feat, - proteins_file_writer, - ie_cutoff, - separator ); - if ( counter < 1 ) { - ForesterUtil.printWarningMessage( "surfacing", feat + " not present (in " + b + " extraction)" ); + if ( node.isInternal() ) { + out.write( "\t" + external_descs.size() + "\t" ); + } + else { + out.write( "\t\t" ); } - total += counter; - proteins_file_writer.close(); - } - ForesterUtil.programMessage( "surfacing", - "Wrote " + total + " individual " + b + " from a total of " - + all_features.size() + " into: " + dir ); - } - } - - public final static void calcNEW( final boolean use_domain_architectures, - final Phylogeny tre, - final int level, - final SortedMap> protein_lists_per_species, - final String separator, - final double ie_cutoff, - final String outfile_base, - final boolean write_protein_files ) - throws IOException { - final SortedMap> species_to_features_map = new TreeMap>(); - if ( protein_lists_per_species == null || tre == null ) { - throw new IllegalArgumentException( "argument is null" ); - } - if ( protein_lists_per_species.size() < 2 ) { - throw new IllegalArgumentException( "not enough genomes" ); - } - final String x; - if ( use_domain_architectures ) { - x = "DA"; - } - else { - x = "domain"; - } - final File outfile = new File( outfile_base + "_minimal_" + x + "ome.tsv" ); - final File outfile_table = new File( outfile_base + "_minimal_" + x + "ome_matrix.tsv" ); - SurfacingUtil.checkForOutputFileWriteability( outfile ); - SurfacingUtil.checkForOutputFileWriteability( outfile_table ); - final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); - final BufferedWriter out_table = new BufferedWriter( new FileWriter( outfile_table ) ); - out.write( "SPECIES\tCOMMON NAME\tCODE\tRANK\t#EXT NODES\tEXT NODE CODES\t#" + x + "\t" + x + "" ); - out.write( ForesterUtil.LINE_SEPARATOR ); - /////////// - ////////// - SortedMap> protein_lists_per_quasi_species = null; - if ( level >= 1 ) { - protein_lists_per_quasi_species = makeProteinListsPerQuasiSpecies( tre, level, protein_lists_per_species ); - } - ///////// - /////////// - for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) { - final PhylogenyNode node = iter.next(); - final String species_name = node.getNodeData().isHasTaxonomy() - ? node.getNodeData().getTaxonomy().getScientificName() : node.getName(); - final String common = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getCommonName() - : ""; - final String tcode = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getTaxonomyCode() - : ""; - final String rank = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getRank() : ""; - out.write( species_name ); - if ( !ForesterUtil.isEmpty( common ) ) { - out.write( "\t" + common ); - } - else { - out.write( "\t" ); - } - if ( !ForesterUtil.isEmpty( tcode ) ) { - out.write( "\t" + tcode ); - } - else { - out.write( "\t" ); - } - if ( !ForesterUtil.isEmpty( rank ) ) { - out.write( "\t" + rank ); - } - else { - out.write( "\t" ); - } - final List external_descs = node.getAllExternalDescendants(); - if ( node.isInternal() ) { - out.write( "\t" + external_descs.size() + "\t" ); - } - else { - out.write( "\t\t" ); } final List> features_per_genome_list = new ArrayList>(); boolean first = true; - if ( level >= 1 ) { + if ( target_level >= 1 ) { //////////// //////////// - final int node_level = PhylogenyMethods.calculateLevel( node ); - if ( node_level >= level ) { + if ( node_level >= target_level ) { final List given_level_descs = PhylogenyMethods - .getAllDescendantsOfGivenLevel( node, level ); + .getAllDescendantsOfGivenLevel( node, target_level ); for( final PhylogenyNode given_level_desc : given_level_descs ) { final String spec_name = given_level_desc.getNodeData().isHasTaxonomy() - ? given_level_desc.getNodeData().getTaxonomy().getScientificName() : given_level_desc.getName(); + ? given_level_desc.getNodeData().getTaxonomy().getScientificName() + : given_level_desc.getName(); + if ( node.isInternal() ) { + if ( first ) { + first = false; + } + else { + out.write( ", " ); + } + out.write( "sp_n=" + spec_name ); + } + final List proteins_per_species = protein_lists_per_quasi_species.get( spec_name ); + if ( proteins_per_species != null ) { + final SortedSet features_per_genome = new TreeSet(); + for( final Protein protein : proteins_per_species ) { + if ( use_domain_architectures ) { + final String da = protein.toDomainArchitectureString( separator, ie_cutoff ); + features_per_genome.add( da ); + } + else { + List domains = protein.getProteinDomains(); + for( final Domain domain : domains ) { + if ( ( ie_cutoff <= -1 ) || ( domain.getPerDomainEvalue() <= ie_cutoff ) ) { + features_per_genome.add( domain.getDomainId() ); + } + } + } + } + System.out.println( ">>>>>>>>>>>>>> features_per_genome.size()=" + features_per_genome.size() ); + if ( features_per_genome.size() > 0 ) { + features_per_genome_list.add( features_per_genome ); + } + else { + System.out.println( "error!" ); + System.exit( -1 ); + } + } + else { + System.out.println( "error!" ); + System.exit( -1 ); + } } } /////////// @@ -474,25 +313,32 @@ public final class MinimalDomainomeCalculator { final int level, final SortedMap> protein_lists_per_species ) { final SortedMap> protein_lists_per_quasi_species = new TreeMap>(); + System.out.println( "---------------------------------" ); + System.out.println( "level=" + level ); for( final PhylogenyNodeIterator iter = tre.iteratorPostorder(); iter.hasNext(); ) { final PhylogenyNode node = iter.next(); final int node_level = PhylogenyMethods.calculateLevel( node ); if ( node_level == level ) { + System.out.println( "level=" + level ); final List external_descs = node.getAllExternalDescendants(); final List protein_list_per_quasi_species = new ArrayList(); for( final PhylogenyNode external_desc : external_descs ) { final String code = external_desc.getNodeData().getTaxonomy().getTaxonomyCode(); final List proteins_per_species = protein_lists_per_species .get( new BasicSpecies( code ) ); + //System.out.println( code ); for( Protein protein : proteins_per_species ) { protein_list_per_quasi_species.add( protein ); } } final String species_name = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy().getScientificName() : node.getName(); + System.out.println( "species_name=" + species_name ); protein_lists_per_quasi_species.put( species_name, protein_list_per_quasi_species ); + System.out.println( ">>>>" + protein_list_per_quasi_species.size() ); } } + return protein_lists_per_quasi_species; } -- 1.7.10.2