import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Map.Entry;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.SortedMap;
import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
import org.forester.evoinference.matrix.character.CharacterStateMatrix.Format;
import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates;
+import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
import org.forester.evoinference.matrix.distance.DistanceMatrix;
import org.forester.go.GoId;
import org.forester.go.GoNameSpace;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyMethods;
import org.forester.phylogeny.PhylogenyNode;
-import org.forester.phylogeny.PhylogenyNodeI.NH_CONVERSION_SUPPORT_VALUE_STYLE;
+import org.forester.phylogeny.PhylogenyNode.NH_CONVERSION_SUPPORT_VALUE_STYLE;
import org.forester.phylogeny.data.BinaryCharacters;
import org.forester.phylogeny.data.Confidence;
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
+import org.forester.protein.BasicDomain;
+import org.forester.protein.BasicProtein;
+import org.forester.protein.BinaryDomainCombination;
+import org.forester.protein.Domain;
+import org.forester.protein.DomainId;
+import org.forester.protein.Protein;
+import org.forester.species.Species;
import org.forester.surfacing.DomainSimilarityCalculator.Detailedness;
import org.forester.surfacing.DomainSimilarityCalculator.GoAnnotationOutput;
import org.forester.surfacing.GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder;
}
};
public final static Pattern PATTERN_SP_STYLE_TAXONOMY = Pattern.compile( "^[A-Z0-9]{3,5}$" );
+ private static final boolean USE_LAST = true;
private SurfacingUtil() {
// Hidden constructor.
final String outfilename_for_dc_for_go_mapping,
final String outfilename_for_dc_for_go_mapping_unique,
final String outfilename_for_rank_counts,
- final String outfilename_for_ancestor_species_counts ) {
+ final String outfilename_for_ancestor_species_counts,
+ final String outfilename_for_protein_stats,
+ final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
+ final Map<String, DescriptiveStatistics> domain_number_stats_by_dc,
+ final Map<String, DescriptiveStatistics> domain_length_stats_by_domain ) {
try {
+ //
+ // if ( protein_length_stats_by_dc != null ) {
+ // for( final Entry<?, DescriptiveStatistics> entry : protein_length_stats_by_dc.entrySet() ) {
+ // System.out.print( entry.getKey().toString() );
+ // System.out.print( ": " );
+ // double[] a = entry.getValue().getDataAsDoubleArray();
+ // for( int i = 0; i < a.length; i++ ) {
+ // System.out.print( a[ i ] + " " );
+ // }
+ // System.out.println();
+ // }
+ // }
+ // if ( domain_number_stats_by_dc != null ) {
+ // for( final Entry<?, DescriptiveStatistics> entry : domain_number_stats_by_dc.entrySet() ) {
+ // System.out.print( entry.getKey().toString() );
+ // System.out.print( ": " );
+ // double[] a = entry.getValue().getDataAsDoubleArray();
+ // for( int i = 0; i < a.length; i++ ) {
+ // System.out.print( a[ i ] + " " );
+ // }
+ // System.out.println();
+ // }
+ // }
+ //
final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) );
final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) );
final BufferedWriter out_dc_for_go_mapping = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping ) );
}
final SortedMap<Integer, Integer> histogram = new TreeMap<Integer, Integer>();
final SortedMap<Integer, StringBuilder> domain_lists = new TreeMap<Integer, StringBuilder>();
+ final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_protein_length_stats = new TreeMap<Integer, DescriptiveStatistics>();
+ final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_domain_number_stats = new TreeMap<Integer, DescriptiveStatistics>();
+ final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_domain_lengths_stats = new TreeMap<Integer, DescriptiveStatistics>();
final SortedMap<Integer, PriorityQueue<String>> domain_lists_go = new TreeMap<Integer, PriorityQueue<String>>();
final SortedMap<Integer, SortedSet<String>> domain_lists_go_unique = new TreeMap<Integer, SortedSet<String>>();
final Set<String> dcs = dc_gain_counts.keySet();
final SortedSet<String> more_than_once = new TreeSet<String>();
+ final DescriptiveStatistics gained_once_lengths_stats = new BasicDescriptiveStatistics();
+ final DescriptiveStatistics gained_once_domain_count_stats = new BasicDescriptiveStatistics();
+ final DescriptiveStatistics gained_multiple_times_lengths_stats = new BasicDescriptiveStatistics();
+ final DescriptiveStatistics gained_multiple_times_domain_count_stats = new BasicDescriptiveStatistics();
+ long gained_multiple_times_domain_length_sum = 0;
+ long gained_once_domain_length_sum = 0;
+ long gained_multiple_times_domain_length_count = 0;
+ long gained_once_domain_length_count = 0;
for( final String dc : dcs ) {
final int count = dc_gain_counts.get( dc );
if ( histogram.containsKey( count ) ) {
set.addAll( splitDomainCombination( dc ) );
domain_lists_go_unique.put( count, set );
}
+ if ( protein_length_stats_by_dc != null ) {
+ if ( !dc_reapp_counts_to_protein_length_stats.containsKey( count ) ) {
+ dc_reapp_counts_to_protein_length_stats.put( count, new BasicDescriptiveStatistics() );
+ }
+ dc_reapp_counts_to_protein_length_stats.get( count ).addValue( protein_length_stats_by_dc.get( dc )
+ .arithmeticMean() );
+ }
+ if ( domain_number_stats_by_dc != null ) {
+ if ( !dc_reapp_counts_to_domain_number_stats.containsKey( count ) ) {
+ dc_reapp_counts_to_domain_number_stats.put( count, new BasicDescriptiveStatistics() );
+ }
+ dc_reapp_counts_to_domain_number_stats.get( count ).addValue( domain_number_stats_by_dc.get( dc )
+ .arithmeticMean() );
+ }
+ if ( domain_length_stats_by_domain != null ) {
+ if ( !dc_reapp_counts_to_domain_lengths_stats.containsKey( count ) ) {
+ dc_reapp_counts_to_domain_lengths_stats.put( count, new BasicDescriptiveStatistics() );
+ }
+ final String[] ds = dc.split( "=" );
+ dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain
+ .get( ds[ 0 ] ).arithmeticMean() );
+ dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain
+ .get( ds[ 1 ] ).arithmeticMean() );
+ }
if ( count > 1 ) {
more_than_once.add( dc );
+ if ( protein_length_stats_by_dc != null ) {
+ final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc );
+ for( final double element : s.getData() ) {
+ gained_multiple_times_lengths_stats.addValue( element );
+ }
+ }
+ if ( domain_number_stats_by_dc != null ) {
+ final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc );
+ for( final double element : s.getData() ) {
+ gained_multiple_times_domain_count_stats.addValue( element );
+ }
+ }
+ if ( domain_length_stats_by_domain != null ) {
+ final String[] ds = dc.split( "=" );
+ final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] );
+ final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] );
+ for( final double element : s0.getData() ) {
+ gained_multiple_times_domain_length_sum += element;
+ ++gained_multiple_times_domain_length_count;
+ }
+ for( final double element : s1.getData() ) {
+ gained_multiple_times_domain_length_sum += element;
+ ++gained_multiple_times_domain_length_count;
+ }
+ }
+ }
+ else {
+ if ( protein_length_stats_by_dc != null ) {
+ final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc );
+ for( final double element : s.getData() ) {
+ gained_once_lengths_stats.addValue( element );
+ }
+ }
+ if ( domain_number_stats_by_dc != null ) {
+ final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc );
+ for( final double element : s.getData() ) {
+ gained_once_domain_count_stats.addValue( element );
+ }
+ }
+ if ( domain_length_stats_by_domain != null ) {
+ final String[] ds = dc.split( "=" );
+ final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] );
+ final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] );
+ for( final double element : s0.getData() ) {
+ gained_once_domain_length_sum += element;
+ ++gained_once_domain_length_count;
+ }
+ for( final double element : s1.getData() ) {
+ gained_once_domain_length_sum += element;
+ ++gained_once_domain_length_count;
+ }
+ }
}
}
final Set<Integer> histogram_keys = histogram.keySet();
out_dc.close();
out_dc_for_go_mapping.close();
out_dc_for_go_mapping_unique.close();
- //
final SortedMap<String, Integer> lca_rank_counts = new TreeMap<String, Integer>();
final SortedMap<String, Integer> lca_ancestor_species_counts = new TreeMap<String, Integer>();
for( final String dc : more_than_once ) {
nodes.add( n );
}
}
- for( int i = 0; i < nodes.size() - 1; ++i ) {
+ for( int i = 0; i < ( nodes.size() - 1 ); ++i ) {
for( int j = i + 1; j < nodes.size(); ++j ) {
- final PhylogenyNode lca = PhylogenyMethods.getInstance().obtainLCA( nodes.get( i ),
- nodes.get( j ) );
+ final PhylogenyNode lca = PhylogenyMethods.calculateLCA( nodes.get( i ), nodes.get( j ) );
String rank = "unknown";
if ( lca.getNodeData().isHasTaxonomy()
&& !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getRank() ) ) {
ForesterUtil.LINE_SEPARATOR );
out_for_rank_counts.close();
out_for_ancestor_species_counts.close();
+ if ( !ForesterUtil.isEmpty( outfilename_for_protein_stats )
+ && ( ( domain_length_stats_by_domain != null ) || ( protein_length_stats_by_dc != null ) || ( domain_number_stats_by_dc != null ) ) ) {
+ final BufferedWriter w = new BufferedWriter( new FileWriter( outfilename_for_protein_stats ) );
+ w.write( "Domain Lengths: " );
+ w.write( "\n" );
+ if ( domain_length_stats_by_domain != null ) {
+ for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_domain_lengths_stats
+ .entrySet() ) {
+ w.write( entry.getKey().toString() );
+ w.write( "\t" + entry.getValue().arithmeticMean() );
+ w.write( "\t" + entry.getValue().median() );
+ w.write( "\n" );
+ }
+ }
+ w.flush();
+ w.write( "\n" );
+ w.write( "\n" );
+ w.write( "Protein Lengths: " );
+ w.write( "\n" );
+ if ( protein_length_stats_by_dc != null ) {
+ for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_protein_length_stats
+ .entrySet() ) {
+ w.write( entry.getKey().toString() );
+ w.write( "\t" + entry.getValue().arithmeticMean() );
+ w.write( "\t" + entry.getValue().median() );
+ w.write( "\n" );
+ }
+ }
+ w.flush();
+ w.write( "\n" );
+ w.write( "\n" );
+ w.write( "Number of domains: " );
+ w.write( "\n" );
+ if ( domain_number_stats_by_dc != null ) {
+ for( final Entry<Integer, DescriptiveStatistics> entry : dc_reapp_counts_to_domain_number_stats
+ .entrySet() ) {
+ w.write( entry.getKey().toString() );
+ w.write( "\t" + entry.getValue().arithmeticMean() );
+ w.write( "\t" + entry.getValue().median() );
+ w.write( "\n" );
+ }
+ }
+ w.flush();
+ w.write( "\n" );
+ w.write( "\n" );
+ w.write( "Gained once, domain lengths:" );
+ w.write( "\n" );
+ w.write( "N: " + gained_once_domain_length_count );
+ w.write( "\n" );
+ w.write( "Avg: " + ( ( double ) gained_once_domain_length_sum / gained_once_domain_length_count ) );
+ w.write( "\n" );
+ w.write( "\n" );
+ w.write( "Gained multiple times, domain lengths:" );
+ w.write( "\n" );
+ w.write( "N: " + gained_multiple_times_domain_length_count );
+ w.write( "\n" );
+ w.write( "Avg: "
+ + ( ( double ) gained_multiple_times_domain_length_sum / gained_multiple_times_domain_length_count ) );
+ w.write( "\n" );
+ w.write( "\n" );
+ w.write( "\n" );
+ w.write( "\n" );
+ w.write( "Gained once, protein lengths:" );
+ w.write( "\n" );
+ w.write( gained_once_lengths_stats.toString() );
+ w.write( "\n" );
+ w.write( "\n" );
+ w.write( "Gained once, domain counts:" );
+ w.write( "\n" );
+ w.write( gained_once_domain_count_stats.toString() );
+ w.write( "\n" );
+ w.write( "\n" );
+ w.write( "Gained multiple times, protein lengths:" );
+ w.write( "\n" );
+ w.write( gained_multiple_times_lengths_stats.toString() );
+ w.write( "\n" );
+ w.write( "\n" );
+ w.write( "Gained multiple times, domain counts:" );
+ w.write( "\n" );
+ w.write( gained_multiple_times_domain_count_stats.toString() );
+ w.flush();
+ w.close();
+ }
}
catch ( final IOException e ) {
ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
public static Phylogeny createNjTreeBasedOnMatrixToFile( final File nj_tree_outfile, final DistanceMatrix distance ) {
checkForOutputFileWriteability( nj_tree_outfile );
final NeighborJoining nj = NeighborJoining.createInstance();
- final Phylogeny phylogeny = nj.execute( distance );
+ final Phylogeny phylogeny = nj.execute( ( BasicSymmetricalDistanceMatrix ) distance );
phylogeny.setName( nj_tree_outfile.getName() );
writePhylogenyToFile( phylogeny, nj_tree_outfile.toString() );
return phylogeny;
final boolean output_binary_domain_combinations_for_graphs,
final List<BinaryDomainCombination> all_binary_domains_combination_gained_fitch,
final List<BinaryDomainCombination> all_binary_domains_combination_lost_fitch,
- final BinaryDomainCombination.DomainCombinationType dc_type ) {
+ final BinaryDomainCombination.DomainCombinationType dc_type,
+ final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
+ final Map<String, DescriptiveStatistics> domain_number_stats_by_dc,
+ final Map<String, DescriptiveStatistics> domain_length_stats_by_domain ) {
final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR;
final String date_time = ForesterUtil.getCurrentDateTime();
final SortedSet<String> all_pfams_encountered = new TreeSet<String>();
randomization = "yes, seed = " + random_number_seed_for_fitch_parsimony;
}
else {
- domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( true );
+ domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( USE_LAST );
}
SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name
+ surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER );
parameters_str );
SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name
+ surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH );
- calculateIndependentDomainCombinationGains( local_phylogeny_l, outfile_name
- + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX, outfile_name
- + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX, outfile_name
- + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX, outfile_name
- + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX, outfile_name
- + "_indep_dc_gains_fitch_lca_ranks.txt", outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt" );
+ calculateIndependentDomainCombinationGains( local_phylogeny_l,
+ outfile_name
+ + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX,
+ outfile_name
+ + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX,
+ outfile_name
+ + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX,
+ outfile_name
+ + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX,
+ outfile_name + "_indep_dc_gains_fitch_lca_ranks.txt",
+ outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt",
+ outfile_name + "_indep_dc_gains_fitch_protein_statistics.txt",
+ protein_length_stats_by_dc,
+ domain_number_stats_by_dc,
+ domain_length_stats_by_domain );
}
}
writeToNexus( outfile_name + surfacing.NEXUS_SECONDARY_FEATURES,
secondary_features_parsimony.createMatrixOfSecondaryFeaturePresenceOrAbsence( null ),
phylogeny );
- final Phylogeny local_phylogeny_copy = phylogeny.copy();
+ Phylogeny local_phylogeny_copy = phylogeny.copy();
secondary_features_parsimony.executeDolloParsimonyOnSecondaryFeatures( mapping_results_map );
SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossMatrix(), outfile_name
+ surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER );
parameters_str );
SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name
+ surfacing.SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO );
+ // FITCH DOMAIN COMBINATIONS
+ // -------------------------
+ local_phylogeny_copy = phylogeny.copy();
+ final String randomization = "no";
+ secondary_features_parsimony.executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( USE_LAST );
+ preparePhylogeny( local_phylogeny_copy,
+ secondary_features_parsimony,
+ date_time,
+ "Fitch parsimony on secondary binary domain combination presence/absence randomization: "
+ + randomization,
+ "fitch_on_binary_domain_combinations_" + outfile_name,
+ parameters_str );
+ SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name
+ + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED );
+ calculateIndependentDomainCombinationGains( local_phylogeny_copy, outfile_name
+ + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX, outfile_name
+ + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX, outfile_name
+ + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX, outfile_name
+ + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX, outfile_name
+ + "_MAPPED_indep_dc_gains_fitch_lca_ranks.txt", outfile_name
+ + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt", null, null, null, null );
}
- public static void extractProteinNames( final List<Protein> proteins,
- final List<DomainId> query_domain_ids_nc_order,
- final Writer out,
- final String separator ) throws IOException {
+ /**
+ * Writes one line to <code>out</code> for every protein in <code>proteins</code>
+ * for which <code>protein.contains( query_domain_ids_nc_order, true )</code> holds.
+ * <p>
+ * Line layout, fields joined by <code>separator</code>: species id, protein id,
+ * a bracketed list <code>[ID {totalCount} ID {totalCount} ...]</code> holding each
+ * distinct domain id of the protein once (in the iteration order of
+ * <code>getProteinDomains()</code>), the description, the accession, and finally
+ * <code>SurfacingConstants.NL</code>. Description and accession are left blank when
+ * they are empty or equal to <code>SurfacingConstants.NONE</code>.
+ * <p>
+ * NOTE(review): the body looks identical to the visible part of the
+ * <code>extractProteinNames( List, List, Writer, String, String )</code> overload in
+ * this file, and the name <code>doit</code> says nothing about the purpose -
+ * confirm whether this method is still needed.
+ *
+ * @param proteins the proteins to scan
+ * @param query_domain_ids_nc_order the domain ids handed to <code>Protein.contains</code>
+ * @param out the writer receiving the lines; it is flushed but not closed
+ * @param separator the field separator
+ * @param limit_to_species if not empty, only proteins whose species id equals this
+ *            value (ignoring case) are reported
+ * @param average_protein_lengths_by_dc not read anywhere in this method
+ * @throws IOException if writing to <code>out</code> fails
+ */ public static void doit( final List<Protein> proteins,
+ final List<DomainId> query_domain_ids_nc_order,
+ final Writer out,
+ final String separator,
+ final String limit_to_species,
+ final Map<String, List<Integer>> average_protein_lengths_by_dc ) throws IOException { // NOTE(review): average_protein_lengths_by_dc is never used below
for( final Protein protein : proteins ) {
- if ( protein.contains( query_domain_ids_nc_order, true ) ) {
- out.write( protein.getSpecies().getSpeciesId() );
- out.write( separator );
- out.write( protein.getProteinId().getId() );
- out.write( separator );
- out.write( "[" );
- final Set<DomainId> visited_domain_ids = new HashSet<DomainId>();
- boolean first = true;
- for( final Domain domain : protein.getProteinDomains() ) {
- if ( !visited_domain_ids.contains( domain.getDomainId() ) ) {
- visited_domain_ids.add( domain.getDomainId() );
- if ( first ) {
- first = false;
- }
- else {
- out.write( " " );
+ if ( ForesterUtil.isEmpty( limit_to_species ) // an empty limit disables the species filter
+ || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
+ if ( protein.contains( query_domain_ids_nc_order, true ) ) { // presumably 'true' asks for N-to-C order matching, going by the parameter name - TODO confirm
+ out.write( protein.getSpecies().getSpeciesId() );
+ out.write( separator );
+ out.write( protein.getProteinId().getId() );
+ out.write( separator );
+ out.write( "[" );
+ final Set<DomainId> visited_domain_ids = new HashSet<DomainId>(); // ids already written, so each id appears once per line
+ boolean first = true;
+ for( final Domain domain : protein.getProteinDomains() ) {
+ if ( !visited_domain_ids.contains( domain.getDomainId() ) ) {
+ visited_domain_ids.add( domain.getDomainId() );
+ if ( first ) {
+ first = false;
+ }
+ else {
+ out.write( " " ); // single blank between consecutive entries, none before the first
+ }
+ out.write( domain.getDomainId().getId() );
+ out.write( " {" );
+ out.write( "" + domain.getTotalCount() ); // string concatenation avoids Writer.write( int ), which would emit a single character
+ out.write( "}" );
}
- out.write( domain.getDomainId().getId() );
- out.write( " {" );
- out.write( "" + domain.getTotalCount() );
- out.write( "}" );
}
+ out.write( "]" );
+ out.write( separator );
+ if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
+ .equals( SurfacingConstants.NONE ) ) ) { // description column stays blank for empty/NONE
+ out.write( protein.getDescription() );
+ }
+ out.write( separator );
+ if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
+ .equals( SurfacingConstants.NONE ) ) ) { // accession column stays blank for empty/NONE
+ out.write( protein.getAccession() );
+ }
+ out.write( SurfacingConstants.NL );
}
- out.write( "]" );
- out.write( separator );
- if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
- .equals( SurfacingConstants.NONE ) ) ) {
- out.write( protein.getDescription() );
- }
- out.write( separator );
- if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
- .equals( SurfacingConstants.NONE ) ) ) {
- out.write( protein.getAccession() );
- }
- out.write( SurfacingConstants.NL );
}
}
out.flush();
}
- public static void extractProteinNames( final SortedMap<Species, List<Protein>> protein_lists_per_species,
- final DomainId domain_id,
+ public static void extractProteinNames( final List<Protein> proteins,
+ final List<DomainId> query_domain_ids_nc_order,
final Writer out,
- final String separator ) throws IOException {
- for( final Species species : protein_lists_per_species.keySet() ) {
- for( final Protein protein : protein_lists_per_species.get( species ) ) {
- final List<Domain> domains = protein.getProteinDomains( domain_id );
- if ( domains.size() > 0 ) {
- final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
- for( final Domain domain : domains ) {
- stats.addValue( domain.getPerSequenceEvalue() );
- }
+ final String separator,
+ final String limit_to_species ) throws IOException {
+ for( final Protein protein : proteins ) {
+ if ( ForesterUtil.isEmpty( limit_to_species )
+ || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
+ if ( protein.contains( query_domain_ids_nc_order, true ) ) {
out.write( protein.getSpecies().getSpeciesId() );
out.write( separator );
out.write( protein.getProteinId().getId() );
out.write( separator );
- out.write( "[" + FORMATTER.format( stats.median() ) + "]" );
+ out.write( "[" );
+ final Set<DomainId> visited_domain_ids = new HashSet<DomainId>();
+ boolean first = true;
+ for( final Domain domain : protein.getProteinDomains() ) {
+ if ( !visited_domain_ids.contains( domain.getDomainId() ) ) {
+ visited_domain_ids.add( domain.getDomainId() );
+ if ( first ) {
+ first = false;
+ }
+ else {
+ out.write( " " );
+ }
+ out.write( domain.getDomainId().getId() );
+ out.write( " {" );
+ out.write( "" + domain.getTotalCount() );
+ out.write( "}" );
+ }
+ }
+ out.write( "]" );
out.write( separator );
if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
.equals( SurfacingConstants.NONE ) ) ) {
out.flush();
}
+    /**
+     * Writes one line to <code>out</code> for every protein that carries at least one
+     * domain with id <code>domain_id</code>.
+     * <p>
+     * Line layout, fields joined by <code>separator</code>:
+     * <ol>
+     * <li>species id</li>
+     * <li>protein id</li>
+     * <li><code>domain_id</code></li>
+     * <li>the positions of the <code>domain_id</code> domains that pass the E-value
+     * cutoff, written as <code>/from-to/from-to/</code> (a lone <code>/</code> if none
+     * passes)</li>
+     * <li>all domains of the protein that pass the cutoff, written as
+     * <code>{id:from-to:evalue,...}</code> and ordered by
+     * <code>new DomainComparator( true )</code> (presumably positional order - confirm
+     * against <code>DomainComparator</code>)</li>
+     * <li>the description, blank if empty or <code>SurfacingConstants.NONE</code></li>
+     * <li>the accession, blank if empty or <code>SurfacingConstants.NONE</code></li>
+     * </ol>
+     * Each line is terminated by <code>SurfacingConstants.NL</code>. Nothing is written
+     * to standard output.
+     *
+     * @param protein_lists_per_species the proteins to scan, grouped by species
+     * @param domain_id the domain a protein must contain to be reported
+     * @param out the writer receiving the lines; it is flushed but not closed
+     * @param separator the field separator
+     * @param limit_to_species if not empty, only proteins whose species id equals this
+     *            value (ignoring case) are reported
+     * @param domain_e_cutoff the maximal per-domain E-value for a domain to be listed;
+     *            a negative value switches the filter off
+     * @throws IOException if writing to <code>out</code> fails
+     */
+    public static void extractProteinNames( final SortedMap<Species, List<Protein>> protein_lists_per_species,
+                                            final DomainId domain_id,
+                                            final Writer out,
+                                            final String separator,
+                                            final String limit_to_species,
+                                            final double domain_e_cutoff ) throws IOException {
+        final boolean all_species = ForesterUtil.isEmpty( limit_to_species );
+        // Kept as "< 0" (not ">= 0" negated) so that a NaN cutoff still rejects every domain.
+        final boolean no_cutoff = domain_e_cutoff < 0;
+        for( final Entry<Species, List<Protein>> entry : protein_lists_per_species.entrySet() ) {
+            for( final Protein protein : entry.getValue() ) {
+                if ( !all_species && !protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
+                    continue;
+                }
+                final List<Domain> domains = protein.getProteinDomains( domain_id );
+                if ( domains.isEmpty() ) {
+                    continue;
+                }
+                out.write( protein.getSpecies().getSpeciesId() );
+                out.write( separator );
+                out.write( protein.getProteinId().getId() );
+                out.write( separator );
+                out.write( domain_id.toString() );
+                out.write( separator );
+                // Positions of the query domain only.
+                for( final Domain domain : domains ) {
+                    if ( no_cutoff || ( domain.getPerDomainEvalue() <= domain_e_cutoff ) ) {
+                        out.write( "/" );
+                        out.write( domain.getFrom() + "-" + domain.getTo() );
+                    }
+                }
+                out.write( "/" );
+                out.write( separator );
+                // Full architecture: every domain of the protein that passes the cutoff.
+                final List<Domain> domain_list = new ArrayList<Domain>();
+                for( final Domain domain : protein.getProteinDomains() ) {
+                    if ( no_cutoff || ( domain.getPerDomainEvalue() <= domain_e_cutoff ) ) {
+                        domain_list.add( domain );
+                    }
+                }
+                final Domain[] domain_ary = domain_list.toArray( new Domain[ domain_list.size() ] );
+                Arrays.sort( domain_ary, new DomainComparator( true ) );
+                out.write( "{" );
+                boolean first = true;
+                for( final Domain domain : domain_ary ) {
+                    if ( first ) {
+                        first = false;
+                    }
+                    else {
+                        out.write( "," );
+                    }
+                    out.write( domain.getDomainId().toString() );
+                    out.write( ":" + domain.getFrom() + "-" + domain.getTo() );
+                    out.write( ":" + domain.getPerDomainEvalue() );
+                }
+                out.write( "}" );
+                // Without this separator the description would be glued onto the closing
+                // brace; the List-based overloads likewise separate "]" from the description.
+                out.write( separator );
+                if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
+                        .equals( SurfacingConstants.NONE ) ) ) {
+                    out.write( protein.getDescription() );
+                }
+                out.write( separator );
+                if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
+                        .equals( SurfacingConstants.NONE ) ) ) {
+                    out.write( protein.getAccession() );
+                }
+                out.write( SurfacingConstants.NL );
+            }
+        }
+        out.flush();
+    }
+
public static SortedSet<DomainId> getAllDomainIds( final List<GenomeWideCombinableDomains> gwcd_list ) {
final SortedSet<DomainId> all_domains_ids = new TreeSet<DomainId>();
for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
final boolean remove_engulfed_domains,
final Protein protein ) {
final Protein pruned_protein = new BasicProtein( protein.getProteinId().getId(), protein.getSpecies()
- .getSpeciesId() );
+ .getSpeciesId(), protein.getLength() );
final List<Domain> sorted = SurfacingUtil.sortDomainsWithAscendingConfidenceValues( protein );
final List<Boolean> covered_positions = new ArrayList<Boolean>();
for( final Domain domain : sorted ) {
+ all_pfams_encountered.size() );
ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without a mapping : "
+ pfams_without_mappings_counter + " ["
- + ( 100 * pfams_without_mappings_counter / all_pfams_encountered.size() ) + "%]" );
+ + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without mapping to proc. or func. : "
+ pfams_without_mappings_to_bp_or_mf_counter + " ["
- + ( 100 * pfams_without_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" );
- ForesterUtil.programMessage( surfacing.PRG_NAME,
- "Pfams with a mapping : " + pfams_with_mappings_counter
- + " ["
- + ( 100 * pfams_with_mappings_counter / all_pfams_encountered.size() )
- + "%]" );
+ + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping : "
+ + pfams_with_mappings_counter + " ["
+ + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping to proc. or func. : "
+ pfams_with_mappings_to_bp_or_mf_counter + " ["
- + ( 100 * pfams_with_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" );
- ForesterUtil.programMessage( surfacing.PRG_NAME,
- "Pfams with mapping to biological process: " + biological_process_counter
- + " ["
- + ( 100 * biological_process_counter / all_pfams_encountered.size() )
- + "%]" );
- ForesterUtil.programMessage( surfacing.PRG_NAME,
- "Pfams with mapping to molecular function: " + molecular_function_counter
- + " ["
- + ( 100 * molecular_function_counter / all_pfams_encountered.size() )
- + "%]" );
- ForesterUtil.programMessage( surfacing.PRG_NAME,
- "Pfams with mapping to cellular component: " + cellular_component_counter
- + " ["
- + ( 100 * cellular_component_counter / all_pfams_encountered.size() )
- + "%]" );
+ + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to biological process: "
+ + biological_process_counter + " ["
+ + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" );
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to molecular function: "
+ + molecular_function_counter + " ["
+ + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" );
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to cellular component: "
+ + cellular_component_counter + " ["
+ + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" );
summary_writer.write( ForesterUtil.LINE_SEPARATOR );
summary_writer.write( "# Sum of Pfams encountered : " + all_pfams_encountered.size() );
summary_writer.write( ForesterUtil.LINE_SEPARATOR );
summary_writer.write( "# Pfams without a mapping : " + pfams_without_mappings_counter
- + " [" + ( 100 * pfams_without_mappings_counter / all_pfams_encountered.size() ) + "%]" );
+ + " [" + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
summary_writer.write( ForesterUtil.LINE_SEPARATOR );
summary_writer.write( "# Pfams without mapping to proc. or func. : "
+ pfams_without_mappings_to_bp_or_mf_counter + " ["
- + ( 100 * pfams_without_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" );
+ + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
summary_writer.write( ForesterUtil.LINE_SEPARATOR );
summary_writer.write( "# Pfams with a mapping : " + pfams_with_mappings_counter + " ["
- + ( 100 * pfams_with_mappings_counter / all_pfams_encountered.size() ) + "%]" );
+ + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
summary_writer.write( ForesterUtil.LINE_SEPARATOR );
summary_writer.write( "# Pfams with a mapping to proc. or func. : "
+ pfams_with_mappings_to_bp_or_mf_counter + " ["
- + ( 100 * pfams_with_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" );
+ + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
summary_writer.write( ForesterUtil.LINE_SEPARATOR );
summary_writer.write( "# Pfams with mapping to biological process: " + biological_process_counter + " ["
- + ( 100 * biological_process_counter / all_pfams_encountered.size() ) + "%]" );
+ + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" );
summary_writer.write( ForesterUtil.LINE_SEPARATOR );
summary_writer.write( "# Pfams with mapping to molecular function: " + molecular_function_counter + " ["
- + ( 100 * molecular_function_counter / all_pfams_encountered.size() ) + "%]" );
+ + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" );
summary_writer.write( ForesterUtil.LINE_SEPARATOR );
summary_writer.write( "# Pfams with mapping to cellular component: " + cellular_component_counter + " ["
- + ( 100 * cellular_component_counter / all_pfams_encountered.size() ) + "%]" );
+ + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" );
summary_writer.write( ForesterUtil.LINE_SEPARATOR );
summary_writer.close();
}
ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters detailed HTML list: \"" + filename + "\"" );
}
- public static void writeBinaryStatesMatrixToListORIGIG( final Map<DomainId, List<GoId>> domain_id_to_go_ids_map, // Writes a detailed HTML list of the matrix characters (domains or domain combinations) for every identifier; this map must be non-null and non-empty.
- final Map<GoId, GoTerm> go_id_to_term_map, // must be non-null and non-empty, otherwise IllegalArgumentException
- final GoNameSpace go_namespace_limit, // may be null; forwarded to writeDomainData
- final boolean domain_combinations, // only forwarded to createBaseDirForPerNodeDomainFiles
- final CharacterStateMatrix<CharacterStateMatrix.GainLossStates> matrix, // source of identifiers, characters and states
- final CharacterStateMatrix.GainLossStates state, // state to report; null selects UNCHANGED_PRESENT or GAIN
- final String filename, // path of the HTML output file
- final String indentifier_characters_separator, // NOTE(review): not referenced anywhere in this method body
- final String character_separator, // forwarded to writeDomainData
- final String title_for_html, // written into the HTML head and the <h1> heading
- final String prefix_for_html, // forwarded to writeDomainData
- final Map<DomainId, Set<String>>[] domain_id_to_secondary_features_maps, // forwarded to writeDomainData
- final SortedSet<String> all_pfams_encountered, // every reported domain name is added to this set
- final SortedSet<String> pfams_gained_or_lost, // may be null; when non-null it receives the same domain names
- final String suffix_for_per_node_events_file ) { // appended to the identifier to form the per-node file name
- if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) { // argument validation: all three checks throw IllegalArgumentException
- throw new IllegalArgumentException( "attempt to use GO namespace limit without a GO-id to term map" );
- }
- else if ( ( ( domain_id_to_go_ids_map == null ) || ( domain_id_to_go_ids_map.size() < 1 ) ) ) {
- throw new IllegalArgumentException( "attempt to output detailed HTML without a Pfam to GO map" );
- }
- else if ( ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) {
- throw new IllegalArgumentException( "attempt to output detailed HTML without a GO-id to term map" );
- }
- final File outfile = new File( filename );
- checkForOutputFileWriteability( outfile );
- final SortedSet<String> sorted_ids = new TreeSet<String>(); // matrix identifiers in natural String order
- for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) {
- sorted_ids.add( matrix.getIdentifier( i ) );
- }
- try {
- final Writer out = new BufferedWriter( new FileWriter( outfile ) ); // NOTE(review): closed only on the success path (see out.close() below)
- final File per_node_go_mapped_domain_gain_loss_files_base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES,
- domain_combinations,
- state,
- filename );
- Writer per_node_go_mapped_domain_gain_loss_outfile_writer = null;
- File per_node_go_mapped_domain_gain_loss_outfile = null;
- int per_node_counter = 0; // number of entries written to the current per-node file
- out.write( "<html>" );
- out.write( SurfacingConstants.NL );
- addHtmlHead( out, title_for_html );
- out.write( SurfacingConstants.NL );
- out.write( "<body>" );
- out.write( SurfacingConstants.NL );
- out.write( "<h1>" );
- out.write( SurfacingConstants.NL );
- out.write( title_for_html );
- out.write( SurfacingConstants.NL );
- out.write( "</h1>" );
- out.write( SurfacingConstants.NL );
- out.write( "<table>" ); // index table: one row per identifier, linking to its section anchor
- out.write( SurfacingConstants.NL );
- for( final String id : sorted_ids ) {
- out.write( "<tr>" );
- out.write( "<td>" );
- out.write( "<a href=\"#" + id + "\">" + id + "</a>" );
- writeTaxonomyLinks( out, id );
- out.write( "</td>" );
- out.write( "</tr>" );
- out.write( SurfacingConstants.NL );
- }
- out.write( "</table>" );
- out.write( SurfacingConstants.NL );
- for( final String id : sorted_ids ) { // one section (heading + table) per identifier
- out.write( SurfacingConstants.NL );
- out.write( "<h2>" );
- out.write( "<a name=\"" + id + "\">" + id + "</a>" ); // anchor targeted by the index table above
- writeTaxonomyLinks( out, id );
- out.write( "</h2>" );
- out.write( SurfacingConstants.NL );
- out.write( "<table>" );
- out.write( SurfacingConstants.NL );
- out.write( "<tr>" ); // column header row
- out.write( "<td><b>" );
- out.write( "Pfam domain(s)" );
- out.write( "</b></td><td><b>" );
- out.write( "GO term acc" );
- out.write( "</b></td><td><b>" );
- out.write( "GO term" );
- out.write( "</b></td><td><b>" );
- out.write( "Penultimate GO term" );
- out.write( "</b></td><td><b>" );
- out.write( "GO namespace" );
- out.write( "</b></td>" );
- out.write( "</tr>" );
- out.write( SurfacingConstants.NL );
- out.write( "</tr>" ); // NOTE(review): second "</tr>" with no "<tr>" opened since the previous one
- out.write( SurfacingConstants.NL );
- per_node_counter = 0;
- if ( matrix.getNumberOfCharacters() > 0 ) { // a per-node file is only opened when the matrix has characters
- per_node_go_mapped_domain_gain_loss_outfile = new File( per_node_go_mapped_domain_gain_loss_files_base_dir
- + ForesterUtil.FILE_SEPARATOR + id + suffix_for_per_node_events_file );
- SurfacingUtil.checkForOutputFileWriteability( per_node_go_mapped_domain_gain_loss_outfile );
- per_node_go_mapped_domain_gain_loss_outfile_writer = ForesterUtil
- .createBufferedWriter( per_node_go_mapped_domain_gain_loss_outfile );
- }
- else {
- per_node_go_mapped_domain_gain_loss_outfile = null;
- per_node_go_mapped_domain_gain_loss_outfile_writer = null;
- }
- for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) {
- // Not nice:
- // using null to indicate either UNCHANGED_PRESENT or GAIN.
- if ( ( matrix.getState( id, c ) == state )
- || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) || ( matrix
- .getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) ) ) {
- final String character = matrix.getCharacter( c );
- String domain_0 = "";
- String domain_1 = ""; // stays empty when the character is a single domain
- if ( character.indexOf( BinaryDomainCombination.SEPARATOR ) > 0 ) { // character names a combination: split it into exactly two parts
- final String[] s = character.split( BinaryDomainCombination.SEPARATOR );
- if ( s.length != 2 ) {
- throw new AssertionError( "this should not have happened: unexpected format for domain combination: ["
- + character + "]" );
- }
- domain_0 = s[ 0 ];
- domain_1 = s[ 1 ];
- }
- else {
- domain_0 = character;
- }
- writeDomainData( domain_id_to_go_ids_map,
- go_id_to_term_map,
- go_namespace_limit,
- out,
- domain_0,
- domain_1,
- prefix_for_html,
- character_separator,
- domain_id_to_secondary_features_maps,
- null );
- all_pfams_encountered.add( domain_0 );
- if ( pfams_gained_or_lost != null ) {
- pfams_gained_or_lost.add( domain_0 );
- }
- if ( !ForesterUtil.isEmpty( domain_1 ) ) {
- all_pfams_encountered.add( domain_1 );
- if ( pfams_gained_or_lost != null ) {
- pfams_gained_or_lost.add( domain_1 );
- }
- }
- if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) {
- writeDomainsToIndividualFilePerTreeNode( per_node_go_mapped_domain_gain_loss_outfile_writer,
- domain_0,
- domain_1 );
- per_node_counter++;
- }
- }
- }
- if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) {
- per_node_go_mapped_domain_gain_loss_outfile_writer.close();
- if ( per_node_counter < 1 ) { // nothing was written for this identifier: remove the per-node file again
- per_node_go_mapped_domain_gain_loss_outfile.delete();
- }
- per_node_counter = 0;
- }
- out.write( "</table>" );
- out.write( SurfacingConstants.NL );
- out.write( "<hr>" );
- out.write( SurfacingConstants.NL );
- } // for( final String id : sorted_ids ) {
- out.write( "</body>" );
- out.write( SurfacingConstants.NL );
- out.write( "</html>" );
- out.write( SurfacingConstants.NL );
- out.flush();
- out.close();
- }
- catch ( final IOException e ) { // any I/O failure is handed to ForesterUtil.fatalError with the exception message only
- ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
- }
- ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters detailed HTML list: \"" + filename + "\"" );
- }
-
public static void writeDomainCombinationsCountsFile( final String[][] input_file_properties,
final File output_dir,
final Writer per_genome_domain_promiscuity_statistics_writer,
e.printStackTrace();
}
}
+
+ /**
+  * Compares two domains solely by the value returned from getFrom().
+  * Domains with equal getFrom() values compare as equal (0), whatever
+  * their other properties are.
+  */
+ final static class DomainComparator implements Comparator<Domain> {
+
+     final private boolean _ascending;
+
+     /**
+      * @param ascending
+      *            true to order smaller getFrom() values first, false to
+      *            order larger getFrom() values first
+      */
+     public DomainComparator( final boolean ascending ) {
+         _ascending = ascending;
+     }
+
+     /**
+      * @return 0 if both getFrom() values are equal; otherwise -1 or 1,
+      *         with the sign chosen according to the ordering direction
+      */
+     @Override
+     public final int compare( final Domain d0, final Domain d1 ) {
+         final int from_0 = d0.getFrom();
+         final int from_1 = d1.getFrom();
+         if ( from_0 == from_1 ) {
+             return 0;
+         }
+         // d0 sorts first exactly when "d0 has the smaller from" agrees
+         // with the requested direction.
+         final boolean d0_has_smaller_from = from_0 < from_1;
+         return ( d0_has_smaller_from == _ascending ) ? -1 : 1;
+     }
+ }
}