import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyMethods;
import org.forester.phylogeny.PhylogenyNode;
+import org.forester.phylogeny.PhylogenyNodeI.NH_CONVERSION_SUPPORT_VALUE_STYLE;
import org.forester.phylogeny.data.BinaryCharacters;
import org.forester.phylogeny.data.Confidence;
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
final String outfilename_for_counts,
final String outfilename_for_dc,
final String outfilename_for_dc_for_go_mapping,
- final String outfilename_for_dc_for_go_mapping_unique ) {
+ final String outfilename_for_dc_for_go_mapping_unique,
+ final String outfilename_for_rank_counts,
+ final String outfilename_for_ancestor_species_counts ) {
try {
final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) );
final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) );
final SortedMap<Integer, PriorityQueue<String>> domain_lists_go = new TreeMap<Integer, PriorityQueue<String>>();
final SortedMap<Integer, SortedSet<String>> domain_lists_go_unique = new TreeMap<Integer, SortedSet<String>>();
final Set<String> dcs = dc_gain_counts.keySet();
+ final SortedSet<String> more_than_once = new TreeSet<String>();
for( final String dc : dcs ) {
final int count = dc_gain_counts.get( dc );
if ( histogram.containsKey( count ) ) {
set.addAll( splitDomainCombination( dc ) );
domain_lists_go_unique.put( count, set );
}
+ if ( count > 1 ) {
+ more_than_once.add( dc );
+ }
}
final Set<Integer> histogram_keys = histogram.keySet();
for( final Integer histogram_key : histogram_keys ) {
out_dc.close();
out_dc_for_go_mapping.close();
out_dc_for_go_mapping_unique.close();
+ //
+ final SortedMap<String, Integer> lca_rank_counts = new TreeMap<String, Integer>();
+ final SortedMap<String, Integer> lca_ancestor_species_counts = new TreeMap<String, Integer>();
+ for( final String dc : more_than_once ) {
+ final List<PhylogenyNode> nodes = new ArrayList<PhylogenyNode>();
+ for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorExternalForward(); it.hasNext(); ) {
+ final PhylogenyNode n = it.next();
+ if ( n.getNodeData().getBinaryCharacters().getGainedCharacters().contains( dc ) ) {
+ nodes.add( n );
+ }
+ }
+ for( int i = 0; i < nodes.size() - 1; ++i ) {
+ for( int j = i + 1; j < nodes.size(); ++j ) {
+ final PhylogenyNode lca = PhylogenyMethods.getInstance().obtainLCA( nodes.get( i ),
+ nodes.get( j ) );
+ String rank = "unknown";
+ if ( lca.getNodeData().isHasTaxonomy()
+ && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getRank() ) ) {
+ rank = lca.getNodeData().getTaxonomy().getRank();
+ }
+ addToCountMap( lca_rank_counts, rank );
+ String lca_species;
+ if ( lca.getNodeData().isHasTaxonomy()
+ && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getScientificName() ) ) {
+ lca_species = lca.getNodeData().getTaxonomy().getScientificName();
+ }
+ else if ( lca.getNodeData().isHasTaxonomy()
+ && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getCommonName() ) ) {
+ lca_species = lca.getNodeData().getTaxonomy().getCommonName();
+ }
+ else {
+ lca_species = lca.getName();
+ }
+ addToCountMap( lca_ancestor_species_counts, lca_species );
+ }
+ }
+ }
+ final BufferedWriter out_for_rank_counts = new BufferedWriter( new FileWriter( outfilename_for_rank_counts ) );
+ final BufferedWriter out_for_ancestor_species_counts = new BufferedWriter( new FileWriter( outfilename_for_ancestor_species_counts ) );
+ ForesterUtil.map2writer( out_for_rank_counts, lca_rank_counts, "\t", ForesterUtil.LINE_SEPARATOR );
+ ForesterUtil.map2writer( out_for_ancestor_species_counts,
+ lca_ancestor_species_counts,
+ "\t",
+ ForesterUtil.LINE_SEPARATOR );
+ out_for_rank_counts.close();
+ out_for_ancestor_species_counts.close();
}
catch ( final IOException e ) {
ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
+ outfilename_for_dc_for_go_mapping_unique + "]" );
}
+ private final static void addToCountMap( final Map<String, Integer> map, final String s ) {
+ if ( map.containsKey( s ) ) {
+ map.put( s, map.get( s ) + 1 );
+ }
+ else {
+ map.put( s, 1 );
+ }
+ }
+
public static int calculateOverlap( final Domain domain, final List<Boolean> covered_positions ) {
int overlap_count = 0;
for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) {
randomization = "yes, seed = " + random_number_seed_for_fitch_parsimony;
}
else {
- domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( false );
+ domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( true );
}
SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name
+ surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER );
+ surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX, outfile_name
+ surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX, outfile_name
+ surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX, outfile_name
- + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX );
+ + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX, outfile_name
+ + "_indep_dc_gains_fitch_lca_ranks.txt", outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt" );
}
}
final PhylogenyNode n = it.next();
if ( ForesterUtil.isEmpty( n.getName() )
&& ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy()
- .getScientificName() ) ) ) {
+ .getScientificName() ) )
+ && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy()
+ .getCommonName() ) ) ) {
if ( n.getParent() != null ) {
names.append( " " );
names.append( n.getParent().getName() );
}
+ final List l = n.getAllExternalDescendants();
+ for( final Object object : l ) {
+ System.out.println( l.toString() );
+ }
++c;
}
}
public static DescriptiveStatistics writeDomainSimilaritiesToFile( final StringBuilder html_desc,
final StringBuilder html_title,
- final Writer w,
+ final Writer single_writer,
+ Map<Character, Writer> split_writers,
final SortedSet<DomainSimilarity> similarities,
final boolean treat_as_binary,
final List<Species> species_order,
System.out.println( "Pearsonian skewness : n/a" );
}
}
+ if ( ( single_writer != null ) && ( ( split_writers == null ) || split_writers.isEmpty() ) ) {
+ split_writers = new HashMap<Character, Writer>();
+ split_writers.put( '_', single_writer );
+ }
switch ( print_option ) {
case SIMPLE_TAB_DELIMITED:
break;
case HTML:
- w.write( "<html>" );
- w.write( SurfacingConstants.NL );
- addHtmlHead( w, "SURFACING :: " + html_title );
- w.write( SurfacingConstants.NL );
- w.write( "<body>" );
- w.write( SurfacingConstants.NL );
- w.write( html_desc.toString() );
- w.write( SurfacingConstants.NL );
- w.write( "<hr>" );
- w.write( "<br>" );
- w.write( SurfacingConstants.NL );
- w.write( "<tt><pre>" );
- w.write( SurfacingConstants.NL );
- if ( histo != null ) {
- w.write( histo.toStringBuffer( 20, '|', 40, 5 ).toString() );
+ for( final Character key : split_writers.keySet() ) {
+ final Writer w = split_writers.get( key );
+ w.write( "<html>" );
+ w.write( SurfacingConstants.NL );
+ if ( key != '_' ) {
+ addHtmlHead( w, "DCs (" + html_title + ") " + key.toString().toUpperCase() );
+ }
+ else {
+ addHtmlHead( w, "DCs (" + html_title + ")" );
+ }
+ w.write( SurfacingConstants.NL );
+ w.write( "<body>" );
+ w.write( SurfacingConstants.NL );
+ w.write( html_desc.toString() );
+ w.write( SurfacingConstants.NL );
+ w.write( "<hr>" );
+ w.write( "<br>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<tt><pre>" );
+ w.write( SurfacingConstants.NL );
+ if ( histo != null ) {
+ w.write( histo.toStringBuffer( 20, '|', 40, 5 ).toString() );
+ w.write( SurfacingConstants.NL );
+ }
+ w.write( "</pre></tt>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<table>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<tr><td>N: </td><td>" + stats.getN() + "</td></tr>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<tr><td>Min: </td><td>" + stats.getMin() + "</td></tr>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<tr><td>Max: </td><td>" + stats.getMax() + "</td></tr>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<tr><td>Mean: </td><td>" + stats.arithmeticMean() + "</td></tr>" );
+ w.write( SurfacingConstants.NL );
+ if ( stats.getN() > 1 ) {
+ w.write( "<tr><td>SD: </td><td>" + stats.sampleStandardDeviation() + "</td></tr>" );
+ }
+ else {
+ w.write( "<tr><td>SD: </td><td>n/a</td></tr>" );
+ }
+ w.write( SurfacingConstants.NL );
+ w.write( "<tr><td>Median: </td><td>" + stats.median() + "</td></tr>" );
+ w.write( SurfacingConstants.NL );
+ if ( stats.getN() > 1 ) {
+ w.write( "<tr><td>Pearsonian skewness: </td><td>" + stats.pearsonianSkewness() + "</td></tr>" );
+ }
+ else {
+ w.write( "<tr><td>Pearsonian skewness: </td><td>n/a</td></tr>" );
+ }
+ w.write( SurfacingConstants.NL );
+ w.write( "</table>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<br>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<hr>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<br>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<table>" );
w.write( SurfacingConstants.NL );
}
- w.write( "</pre></tt>" );
- w.write( SurfacingConstants.NL );
- w.write( "<table>" );
- w.write( SurfacingConstants.NL );
- w.write( "<tr><td>N: </td><td>" + stats.getN() + "</td></tr>" );
- w.write( SurfacingConstants.NL );
- w.write( "<tr><td>Min: </td><td>" + stats.getMin() + "</td></tr>" );
- w.write( SurfacingConstants.NL );
- w.write( "<tr><td>Max: </td><td>" + stats.getMax() + "</td></tr>" );
- w.write( SurfacingConstants.NL );
- w.write( "<tr><td>Mean: </td><td>" + stats.arithmeticMean() + "</td></tr>" );
- w.write( SurfacingConstants.NL );
- if ( stats.getN() > 1 ) {
- w.write( "<tr><td>SD: </td><td>" + stats.sampleStandardDeviation() + "</td></tr>" );
- }
- else {
- w.write( "<tr><td>SD: </td><td>n/a</td></tr>" );
- }
- w.write( SurfacingConstants.NL );
- w.write( "<tr><td>Median: </td><td>" + stats.median() + "</td></tr>" );
- w.write( SurfacingConstants.NL );
- if ( stats.getN() > 1 ) {
- w.write( "<tr><td>Pearsonian skewness: </td><td>" + stats.pearsonianSkewness() + "</td></tr>" );
- }
- else {
- w.write( "<tr><td>Pearsonian skewness: </td><td>n/a</td></tr>" );
- }
- w.write( SurfacingConstants.NL );
- w.write( "</table>" );
- w.write( SurfacingConstants.NL );
- w.write( "<br>" );
- w.write( SurfacingConstants.NL );
- w.write( "<hr>" );
- w.write( SurfacingConstants.NL );
- w.write( "<br>" );
- w.write( SurfacingConstants.NL );
- w.write( "<table>" );
- w.write( SurfacingConstants.NL );
break;
}
- w.write( SurfacingConstants.NL );
+ for( final Writer w : split_writers.values() ) {
+ w.write( SurfacingConstants.NL );
+ }
for( final DomainSimilarity similarity : similarities ) {
if ( ( species_order != null ) && !species_order.isEmpty() ) {
( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order );
}
- w.write( similarity.toStringBuffer( print_option ).toString() );
- w.write( SurfacingConstants.NL );
+ if ( single_writer != null ) {
+ single_writer.write( similarity.toStringBuffer( print_option ).toString() );
+ }
+ else {
+ Writer local_writer = split_writers.get( ( similarity.getDomainId().getId().charAt( 0 ) + "" )
+ .toLowerCase().charAt( 0 ) );
+ if ( local_writer == null ) {
+ local_writer = split_writers.get( '0' );
+ }
+ local_writer.write( similarity.toStringBuffer( print_option ).toString() );
+ }
+ for( final Writer w : split_writers.values() ) {
+ w.write( SurfacingConstants.NL );
+ }
}
switch ( print_option ) {
case HTML:
- w.write( SurfacingConstants.NL );
- w.write( "</table>" );
- w.write( SurfacingConstants.NL );
- w.write( "</font>" );
- w.write( SurfacingConstants.NL );
- w.write( "</body>" );
- w.write( SurfacingConstants.NL );
- w.write( "</html>" );
- w.write( SurfacingConstants.NL );
+ for( final Writer w : split_writers.values() ) {
+ w.write( SurfacingConstants.NL );
+ w.write( "</table>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "</font>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "</body>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "</html>" );
+ w.write( SurfacingConstants.NL );
+ }
break;
}
- w.flush();
- w.close();
+ for( final Writer w : split_writers.values() ) {
+ w.close();
+ }
return stats;
}
w.write( ForesterUtil.LINE_SEPARATOR );
my_matrix.writeNexusTaxaBlock( w );
my_matrix.writeNexusBinaryChractersBlock( w );
- PhylogenyWriter.writeNexusTreesBlock( w, phylogenies );
+ PhylogenyWriter.writeNexusTreesBlock( w, phylogenies, NH_CONVERSION_SUPPORT_VALUE_STYLE.NONE );
w.flush();
w.close();
ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote Nexus file: \"" + outfile_name + "\"" );
domain_parsimony.createMatrixOfBinaryDomainCombinationPresenceOrAbsence(),
phylogeny );
}
+
+    /**
+     * Gathers domains-per-protein statistics for one genome and writes them as a
+     * single tab-separated line to {@code writer}.
+     * <p>
+     * The domain count of every protein is added to a per-genome statistic, to
+     * {@code all_genomes_domains_per_potein_stats} and to the histogram
+     * {@code all_genomes_domains_per_potein_histo} (domain count -> number of proteins).
+     * The domain ids of proteins with at least one domain are filed into the three
+     * sets passed in: an id moves to
+     * {@code domains_which_are_sometimes_single_sometimes_not} as soon as it has been
+     * seen both in a protein with exactly one domain and in a protein with more than
+     * one domain, and it stays there.
+     * <p>
+     * Columns written: genome, mean, sample standard deviation (left empty when
+     * there is only one protein), median, N, min, max. For an empty protein list
+     * only the genome and an N of 0 are filled in. The line ends with "\n".
+     * <p>
+     * An {@link IOException} raised by {@code writer} is reported as a warning and
+     * is not rethrown.
+     *
+     * @param genome label written as the first column
+     * @param protein_list proteins of this genome
+     * @param all_genomes_domains_per_potein_stats statistic shared by the caller; receives every domain count
+     * @param all_genomes_domains_per_potein_histo histogram shared by the caller; updated in place
+     * @param domains_which_are_always_single ids seen so far only in proteins with exactly one domain
+     * @param domains_which_are_sometimes_single_sometimes_not ids seen in both kinds of protein
+     * @param domains_which_never_single ids seen so far only in proteins with more than one domain
+     * @param writer destination of the statistics line; it is neither flushed nor closed here
+     */
+    public static void domainsPerProteinsStatistics( final String genome,
+                                                     final List<Protein> protein_list,
+                                                     final DescriptiveStatistics all_genomes_domains_per_potein_stats,
+                                                     final SortedMap<Integer, Integer> all_genomes_domains_per_potein_histo,
+                                                     final SortedSet<String> domains_which_are_always_single,
+                                                     final SortedSet<String> domains_which_are_sometimes_single_sometimes_not,
+                                                     final SortedSet<String> domains_which_never_single,
+                                                     final Writer writer ) {
+        final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
+        for( final Protein protein : protein_list ) {
+            final int domains = protein.getNumberOfProteinDomains();
+            stats.addValue( domains );
+            all_genomes_domains_per_potein_stats.addValue( domains );
+            if ( all_genomes_domains_per_potein_histo.containsKey( domains ) ) {
+                all_genomes_domains_per_potein_histo.put( domains,
+                                                          1 + all_genomes_domains_per_potein_histo.get( domains ) );
+            }
+            else {
+                all_genomes_domains_per_potein_histo.put( domains, 1 );
+            }
+            if ( domains == 1 ) {
+                recordDomainContext( protein.getProteinDomain( 0 ).getDomainId().getId(),
+                                     domains_which_are_always_single,
+                                     domains_which_never_single,
+                                     domains_which_are_sometimes_single_sometimes_not );
+            }
+            else if ( domains > 1 ) {
+                for( final Domain d : protein.getProteinDomains() ) {
+                    recordDomainContext( d.getDomainId().getId(),
+                                         domains_which_never_single,
+                                         domains_which_are_always_single,
+                                         domains_which_are_sometimes_single_sometimes_not );
+                }
+            }
+        }
+        try {
+            writer.write( genome );
+            writer.write( "\t" );
+            if ( stats.getN() >= 1 ) {
+                writer.write( stats.arithmeticMean() + "" );
+                writer.write( "\t" );
+                // A sample standard deviation needs at least two values; otherwise the column stays empty.
+                if ( stats.getN() >= 2 ) {
+                    writer.write( stats.sampleStandardDeviation() + "" );
+                }
+                writer.write( "\t" );
+                writer.write( stats.median() + "" );
+                writer.write( "\t" );
+                writer.write( stats.getN() + "" );
+                writer.write( "\t" );
+                writer.write( stats.getMin() + "" );
+                writer.write( "\t" );
+                writer.write( stats.getMax() + "" );
+            }
+            else {
+                // No proteins: keep the column layout, fill in only N = 0.
+                writer.write( "\t" );
+                writer.write( "\t" );
+                writer.write( "\t" );
+                writer.write( "0" );
+                writer.write( "\t" );
+                writer.write( "\t" );
+            }
+            writer.write( "\n" );
+        }
+        catch ( final IOException e ) {
+            // Report through the program's warning channel, as the other writers in this file do.
+            ForesterUtil.printWarningMessage( surfacing.PRG_NAME,
+                                              "Failure to write domains per protein statistics for [" + genome
+                                                      + "]: " + e );
+        }
+    }
+
+    /**
+     * Files {@code domain} according to the kind of protein it was just seen in.
+     * An id already in {@code mixed} is left alone; an id found in
+     * {@code opposite_context} is moved to {@code mixed}; any other id is added to
+     * {@code observed_context}.
+     */
+    private static void recordDomainContext( final String domain,
+                                             final SortedSet<String> observed_context,
+                                             final SortedSet<String> opposite_context,
+                                             final SortedSet<String> mixed ) {
+        if ( mixed.contains( domain ) ) {
+            return;
+        }
+        // remove() reports whether the id was present, so the move needs no separate lookup.
+        if ( opposite_context.remove( domain ) ) {
+            mixed.add( domain );
+        }
+        else {
+            observed_context.add( domain );
+        }
+    }
}