import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
catch ( final IOException e2 ) {
ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getMessage() );
}
+ final DescriptiveStatistics protein_coverage_stats = new BasicDescriptiveStatistics();
final DescriptiveStatistics all_genomes_domains_per_potein_stats = new BasicDescriptiveStatistics();
final SortedMap<Integer, Integer> all_genomes_domains_per_potein_histo = new TreeMap<Integer, Integer>();
final SortedSet<String> domains_which_are_always_single = new TreeSet<String>();
catch ( final IOException e3 ) {
e3.printStackTrace();
}
+ // Main loop:
for( int i = 0; i < number_of_genomes; ++i ) {
System.out.println();
System.out.println( ( i + 1 ) + "/" + number_of_genomes );
System.out.println( "Domains ignored due to virus like id: " );
ForesterUtil.printCountingMap( parser.getDomainsIgnoredDueToVirusLikeIdCountsMap() );
}
+ final double coverage = ( double ) protein_list.size() / parser.getProteinsEncountered();
+ protein_coverage_stats.addValue( coverage );
System.out.println( "Number of proteins encountered : " + parser.getProteinsEncountered() );
log( "Number of proteins encountered : " + parser.getProteinsEncountered(), log_writer );
System.out.println( "Number of proteins stored : " + protein_list.size() );
log( "Number of proteins stored : " + protein_list.size(), log_writer );
+ System.out.println( "Coverage : "
+ + ForesterUtil.roundToInt( 100.0 * coverage ) + "%" );
+ log( "Coverage : " + ForesterUtil.roundToInt( 100.0 * coverage )
+ + "%", log_writer );
System.out.println( "Domains encountered : " + parser.getDomainsEncountered() );
log( "Domains encountered : " + parser.getDomainsEncountered(), log_writer );
System.out.println( "Domains stored : " + parser.getDomainsStored() );
catch ( final IOException e ) {
ForesterUtil.fatalError( surfacing.PRG_NAME, e.toString() );
}
- SurfacingUtil.domainsPerProteinsStatistics( input_file_properties[ i ][ 0 ],
+ SurfacingUtil.domainsPerProteinsStatistics( input_file_properties[ i ][ 1 ],
protein_list,
all_genomes_domains_per_potein_stats,
all_genomes_domains_per_potein_histo,
}
System.gc();
} // for( int i = 0; i < number_of_genomes; ++i ) {
- try {
- per_genome_domain_promiscuity_statistics_writer.flush();
- per_genome_domain_promiscuity_statistics_writer.close();
- dc_data_writer.flush();
- dc_data_writer.close();
- log_writer.flush();
- log_writer.close();
- }
- catch ( final IOException e2 ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getLocalizedMessage() );
- }
ForesterUtil.programMessage( PRG_NAME, "Wrote domain promiscuities to: "
+ per_genome_domain_promiscuity_statistics_file );
//
domains_per_potein_stats_writer.write( "\t" );
domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.getMax() + "" );
domains_per_potein_stats_writer.write( "\n" );
- domains_per_potein_stats_writer.flush();
domains_per_potein_stats_writer.close();
+ printOutPercentageOfMultidomainProteins( all_genomes_domains_per_potein_histo, log_writer );
ForesterUtil.map2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
+ "__all_genomes_domains_per_potein_histo.txt" ), all_genomes_domains_per_potein_histo, "\t", "\n" );
ForesterUtil.collection2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
+ "__domains_single_or_combined.txt" ), domains_which_are_sometimes_single_sometimes_not, "\n" );
ForesterUtil.collection2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
+ "__domains_always_combined.txt" ), domains_which_never_single, "\n" );
+ ForesterUtil.programMessage( PRG_NAME,
+ "Average of proteins with a least one domain assigned: "
+ + ( 100 * protein_coverage_stats.arithmeticMean() ) + "% (+/-"
+ + ( 100 * protein_coverage_stats.sampleStandardDeviation() ) + "%)" );
+ ForesterUtil.programMessage( PRG_NAME, "Range of proteins with a least one domain assigned: " + 100
+ * protein_coverage_stats.getMin() + "%-" + 100 * protein_coverage_stats.getMax() + "%" );
+ log( "Average of prot with a least one dom assigned : " + ( 100 * protein_coverage_stats.arithmeticMean() )
+ + "% (+/-" + ( 100 * protein_coverage_stats.sampleStandardDeviation() ) + "%)", log_writer );
+ log( "Range of prot with a least one dom assigned : " + 100 * protein_coverage_stats.getMin() + "%-"
+ + 100 * protein_coverage_stats.getMax() + "%", log_writer );
}
catch ( final IOException e2 ) {
ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getLocalizedMessage() );
}
}
}
+ try {
+ per_genome_domain_promiscuity_statistics_writer.close();
+ dc_data_writer.close();
+ log_writer.close();
+ }
+ catch ( final IOException e2 ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getLocalizedMessage() );
+ }
if ( PERFORM_DOMAIN_LENGTH_ANALYSIS ) {
try {
SurfacingUtil.executeDomainLengthAnalysis( input_file_properties,
System.out.println();
}
+ private static void printOutPercentageOfMultidomainProteins( final SortedMap<Integer, Integer> all_genomes_domains_per_potein_histo,
+ final Writer log_writer ) {
+ int sum = 0;
+ for( final Entry<Integer, Integer> entry : all_genomes_domains_per_potein_histo.entrySet() ) {
+ sum += entry.getValue();
+ }
+ final double percentage = 100.0 * ( sum - all_genomes_domains_per_potein_histo.get( 1 ) ) / sum;
+ ForesterUtil.programMessage( PRG_NAME, "Percentage of multidomain proteins: " + percentage + "%" );
+ log( "Percentage of multidomain proteins: : " + percentage + "%", log_writer );
+ }
+
private static void preparePhylogenyForParsimonyAnalyses( final Phylogeny intree,
final String[][] input_file_properties ) {
final String[] genomes = new String[ input_file_properties.length ];
parser1.parse();
final HmmscanPerDomainTableParser parser2 = new HmmscanPerDomainTableParser( new File( test_dir
+ ForesterUtil.getFileSeparator() + "hmmscan30b3_output_2" ), "MONBR", INDIVIDUAL_SCORE_CUTOFF.NONE );
- final List<Protein> domain_collections = parser2.parse();
+ final List<Protein> proteins = parser2.parse();
if ( parser2.getProteinsEncountered() != 4 ) {
return false;
}
- if ( domain_collections.size() != 4 ) {
+ if ( proteins.size() != 4 ) {
return false;
}
if ( parser2.getDomainsEncountered() != 69 ) {
if ( parser2.getDomainsIgnoredDueToEval() != 0 ) {
return false;
}
- final Protein p1 = domain_collections.get( 0 );
+ final Protein p1 = proteins.get( 0 );
if ( p1.getNumberOfProteinDomains() != 15 ) {
return false;
}
- final Protein p4 = domain_collections.get( 3 );
+ final Protein p2 = proteins.get( 1 );
+ if ( p2.getNumberOfProteinDomains() != 51 ) {
+ return false;
+ }
+ final Protein p3 = proteins.get( 2 );
+ if ( p3.getNumberOfProteinDomains() != 2 ) {
+ return false;
+ }
+ final Protein p4 = proteins.get( 3 );
if ( p4.getNumberOfProteinDomains() != 1 ) {
return false;
}