+ addToCountMap( lca_rank_counts, rank );
+ String lca_species;
+ if ( lca.getNodeData().isHasTaxonomy()
+ && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getScientificName() ) ) {
+ lca_species = lca.getNodeData().getTaxonomy().getScientificName();
+ }
+ else if ( lca.getNodeData().isHasTaxonomy()
+ && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getCommonName() ) ) {
+ lca_species = lca.getNodeData().getTaxonomy().getCommonName();
+ }
+ else {
+ lca_species = lca.getName();
+ }
+ addToCountMap( lca_ancestor_species_counts, lca_species );
+ }
+ }
+ }
+ final BufferedWriter out_for_rank_counts = new BufferedWriter( new FileWriter( outfilename_for_rank_counts ) );
+ final BufferedWriter out_for_ancestor_species_counts = new BufferedWriter( new FileWriter( outfilename_for_ancestor_species_counts ) );
+ ForesterUtil.map2writer( out_for_rank_counts, lca_rank_counts, "\t", ForesterUtil.LINE_SEPARATOR );
+ ForesterUtil.map2writer( out_for_ancestor_species_counts,
+ lca_ancestor_species_counts,
+ "\t",
+ ForesterUtil.LINE_SEPARATOR );
+ out_for_rank_counts.close();
+ out_for_ancestor_species_counts.close();
+ if ( !ForesterUtil.isEmpty( outfilename_for_protein_stats )
+ && ( ( domain_length_stats_by_domain != null ) || ( protein_length_stats_by_dc != null ) || ( domain_number_stats_by_dc != null ) ) ) {
+ final BufferedWriter w = new BufferedWriter( new FileWriter( outfilename_for_protein_stats ) );
+ w.write( "Domain Lengths: " );
+ w.write( "\n" );
+ if ( domain_length_stats_by_domain != null ) {
+ for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_domain_lengths_stats
+ .entrySet() ) {
+ w.write( entry.getKey().toString() );
+ w.write( "\t" + entry.getValue().arithmeticMean() );
+ w.write( "\t" + entry.getValue().median() );
+ w.write( "\n" );
+ }
+ }
+ w.flush();
+ w.write( "\n" );
+ w.write( "\n" );
+ w.write( "Protein Lengths: " );
+ w.write( "\n" );
+ if ( protein_length_stats_by_dc != null ) {
+ for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_protein_length_stats
+ .entrySet() ) {
+ w.write( entry.getKey().toString() );
+ w.write( "\t" + entry.getValue().arithmeticMean() );
+ w.write( "\t" + entry.getValue().median() );
+ w.write( "\n" );
+ }
+ }
+ w.flush();
+ w.write( "\n" );
+ w.write( "\n" );
+ w.write( "Number of domains: " );
+ w.write( "\n" );
+ if ( domain_number_stats_by_dc != null ) {
+ for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_domain_number_stats
+ .entrySet() ) {
+ w.write( entry.getKey().toString() );
+ w.write( "\t" + entry.getValue().arithmeticMean() );
+ w.write( "\t" + entry.getValue().median() );
+ w.write( "\n" );
+ }
+ }
+ w.flush();
+ w.write( "\n" );
+ w.write( "\n" );
+ w.write( "Gained once, domain lengths:" );
+ w.write( "\n" );
+ w.write( "N: " + gained_once_domain_length_count );
+ w.write( "\n" );
+ w.write( "Avg: " + ( ( double ) gained_once_domain_length_sum / gained_once_domain_length_count ) );
+ w.write( "\n" );
+ w.write( "\n" );
+ w.write( "Gained multiple times, domain lengths:" );
+ w.write( "\n" );
+ w.write( "N: " + gained_multiple_times_domain_length_count );
+ w.write( "\n" );
+ w.write( "Avg: "
+ + ( ( double ) gained_multiple_times_domain_length_sum / gained_multiple_times_domain_length_count ) );
+ w.write( "\n" );
+ w.write( "\n" );
+ w.write( "\n" );
+ w.write( "\n" );
+ w.write( "Gained once, protein lengths:" );
+ w.write( "\n" );
+ w.write( gained_once_lengths_stats.toString() );
+ gained_once_lengths_stats = null;
+ w.write( "\n" );
+ w.write( "\n" );
+ w.write( "Gained once, domain counts:" );
+ w.write( "\n" );
+ w.write( gained_once_domain_count_stats.toString() );
+ gained_once_domain_count_stats = null;
+ w.write( "\n" );
+ w.write( "\n" );
+ w.write( "Gained multiple times, protein lengths:" );
+ w.write( "\n" );
+ w.write( gained_multiple_times_lengths_stats.toString() );
+ gained_multiple_times_lengths_stats = null;
+ w.write( "\n" );
+ w.write( "\n" );
+ w.write( "Gained multiple times, domain counts:" );
+ w.write( "\n" );
+ w.write( gained_multiple_times_domain_count_stats.toString() );
+ w.flush();
+ w.close();
+ }
+ }
+ catch ( final IOException e ) {
+ ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
+ }
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch counts to ["
+ + outfilename_for_counts + "]" );
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch lists to ["
+ + outfilename_for_dc + "]" );
+ ForesterUtil.programMessage( surfacing.PRG_NAME,
+ "Wrote independent domain combination gains fitch lists to (for GO mapping) ["
+ + outfilename_for_dc_for_go_mapping + "]" );
+ ForesterUtil.programMessage( surfacing.PRG_NAME,
+ "Wrote independent domain combination gains fitch lists to (for GO mapping, unique) ["
+ + outfilename_for_dc_for_go_mapping_unique + "]" );
+ }
+
+ /**
+  * Gathers the characters (called domains by this method) that were gained, or lost,
+  * on the descendants of a subtree root.
+  *
+  * @param subtree_root node whose descendants, as returned by
+  *            PhylogenyMethods.getAllDescendants(), are inspected
+  * @param get_gains if true the gained characters of each descendant are collected,
+  *            otherwise the lost characters
+  * @return a new sorted set holding the union of the selected characters
+  */
+ private static SortedSet<String> collectAllDomainsChangedOnSubtree( final PhylogenyNode subtree_root,
+                                                                     final boolean get_gains ) {
+     final SortedSet<String> changed = new TreeSet<String>();
+     for( final PhylogenyNode node : PhylogenyMethods.getAllDescendants( subtree_root ) ) {
+         final BinaryCharacters bc = node.getNodeData().getBinaryCharacters();
+         changed.addAll( get_gains ? bc.getGainedCharacters() : bc.getLostCharacters() );
+     }
+     return changed;
+ }
+
+ private static File createBaseDirForPerNodeDomainFiles( final String base_dir,
+ final boolean domain_combinations,
+ final CharacterStateMatrix.GainLossStates state,
+ final String outfile ) {
+ File per_node_go_mapped_domain_gain_loss_files_base_dir = new File( new File( outfile ).getParent()
+ + ForesterUtil.FILE_SEPARATOR + base_dir );
+ if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
+ per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
+ }
+ if ( domain_combinations ) {
+ per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
+ + ForesterUtil.FILE_SEPARATOR + "DC" );
+ }
+ else {
+ per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
+ + ForesterUtil.FILE_SEPARATOR + "DOMAINS" );
+ }
+ if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
+ per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
+ }
+ if ( state == GainLossStates.GAIN ) {
+ per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
+ + ForesterUtil.FILE_SEPARATOR + "GAINS" );
+ }
+ else if ( state == GainLossStates.LOSS ) {
+ per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
+ + ForesterUtil.FILE_SEPARATOR + "LOSSES" );
+ }
+ else {
+ per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
+ + ForesterUtil.FILE_SEPARATOR + "PRESENT" );
+ }
+ if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) {
+ per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir();
+ }
+ return per_node_go_mapped_domain_gain_loss_files_base_dir;
+ }
+
+ /**
+  * Collects the binary domain combinations of all combinable-domains entries of a genome.
+  *
+  * @param gwcd supplies the entries through getAllCombinableDomainsIds()
+  * @return a new sorted set containing, for every entry, the elements returned by its
+  *         toBinaryDomainCombinations()
+  */
+ private static SortedSet<BinaryDomainCombination> createSetOfAllBinaryDomainCombinationsPerGenome( final GenomeWideCombinableDomains gwcd ) {
+     final SortedMap<String, CombinableDomains> cds = gwcd.getAllCombinableDomainsIds();
+     final SortedSet<BinaryDomainCombination> binary_combinations = new TreeSet<BinaryDomainCombination>();
+     // Only the values are needed, so iterate them directly instead of looking up every key again.
+     for( final CombinableDomains cd : cds.values() ) {
+         binary_combinations.addAll( cd.toBinaryDomainCombinations() );
+     }
+     return binary_combinations;
+ }
+
+ /**
+  * Splits a stringified domain combination at "=" into its two parts.
+  * If the split does not yield exactly two parts, an error message is printed and
+  * the program is terminated with System.exit( -1 ).
+  *
+  * @param dc the stringified domain combination
+  * @return a new list holding the two parts in their original order
+  */
+ private static List<String> splitDomainCombination( final String dc ) {
+     final String[] parts = dc.split( "=" );
+     if ( parts.length != 2 ) {
+         ForesterUtil.printErrorMessage( surfacing.PRG_NAME, "Stringyfied domain combination has illegal format: "
+                 + dc );
+         System.exit( -1 );
+     }
+     final List<String> halves = new ArrayList<String>( 2 );
+     for( int i = 0; i < 2; ++i ) {
+         halves.add( parts[ i ] );
+     }
+     return halves;
+ }
+
+ private static void writeAllEncounteredPfamsToFile( final Map<String, List<GoId>> domain_id_to_go_ids_map,
+ final Map<GoId, GoTerm> go_id_to_term_map,
+ final String outfile_name,
+ final SortedSet<String> all_pfams_encountered ) {
+ final File all_pfams_encountered_file = new File( outfile_name + surfacing.ALL_PFAMS_ENCOUNTERED_SUFFIX );
+ final File all_pfams_encountered_with_go_annotation_file = new File( outfile_name
+ + surfacing.ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX );
+ final File encountered_pfams_summary_file = new File( outfile_name + surfacing.ENCOUNTERED_PFAMS_SUMMARY_SUFFIX );
+ int biological_process_counter = 0;
+ int cellular_component_counter = 0;
+ int molecular_function_counter = 0;
+ int pfams_with_mappings_counter = 0;
+ int pfams_without_mappings_counter = 0;
+ int pfams_without_mappings_to_bp_or_mf_counter = 0;
+ int pfams_with_mappings_to_bp_or_mf_counter = 0;
+ try {
+ final Writer all_pfams_encountered_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_file ) );
+ final Writer all_pfams_encountered_with_go_annotation_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_with_go_annotation_file ) );
+ final Writer summary_writer = new BufferedWriter( new FileWriter( encountered_pfams_summary_file ) );
+ summary_writer.write( "# Pfam to GO mapping summary" );
+ summary_writer.write( ForesterUtil.LINE_SEPARATOR );
+ summary_writer.write( "# Actual summary is at the end of this file." );
+ summary_writer.write( ForesterUtil.LINE_SEPARATOR );
+ summary_writer.write( "# Encountered Pfams without a GO mapping:" );
+ summary_writer.write( ForesterUtil.LINE_SEPARATOR );
+ for( final String pfam : all_pfams_encountered ) {
+ all_pfams_encountered_writer.write( pfam );
+ all_pfams_encountered_writer.write( ForesterUtil.LINE_SEPARATOR );
+ final String domain_id = new String( pfam );
+ if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) {
+ ++pfams_with_mappings_counter;
+ all_pfams_encountered_with_go_annotation_writer.write( pfam );
+ all_pfams_encountered_with_go_annotation_writer.write( ForesterUtil.LINE_SEPARATOR );
+ final List<GoId> go_ids = domain_id_to_go_ids_map.get( domain_id );
+ boolean maps_to_bp = false;
+ boolean maps_to_cc = false;
+ boolean maps_to_mf = false;
+ for( final GoId go_id : go_ids ) {
+ final GoTerm go_term = go_id_to_term_map.get( go_id );
+ if ( go_term.getGoNameSpace().isBiologicalProcess() ) {
+ maps_to_bp = true;
+ }
+ else if ( go_term.getGoNameSpace().isCellularComponent() ) {
+ maps_to_cc = true;
+ }
+ else if ( go_term.getGoNameSpace().isMolecularFunction() ) {
+ maps_to_mf = true;
+ }
+ }
+ if ( maps_to_bp ) {
+ ++biological_process_counter;
+ }
+ if ( maps_to_cc ) {
+ ++cellular_component_counter;
+ }
+ if ( maps_to_mf ) {
+ ++molecular_function_counter;
+ }
+ if ( maps_to_bp || maps_to_mf ) {
+ ++pfams_with_mappings_to_bp_or_mf_counter;