import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
import org.forester.evoinference.matrix.character.CharacterStateMatrix.Format;
import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates;
+import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
import org.forester.evoinference.matrix.distance.DistanceMatrix;
import org.forester.go.GoId;
import org.forester.go.GoNameSpace;
final DescriptiveStatistics gained_once_domain_count_stats = new BasicDescriptiveStatistics();
final DescriptiveStatistics gained_multiple_times_lengths_stats = new BasicDescriptiveStatistics();
final DescriptiveStatistics gained_multiple_times_domain_count_stats = new BasicDescriptiveStatistics();
- final DescriptiveStatistics gained_multiple_times_domain_length_stats = new BasicDescriptiveStatistics();
- final DescriptiveStatistics gained_once_domain_length_stats = new BasicDescriptiveStatistics();
+ long gained_multiple_times_domain_length_sum = 0;
+ long gained_once_domain_length_sum = 0;
+ long gained_multiple_times_domain_length_count = 0;
+ long gained_once_domain_length_count = 0;
for( final String dc : dcs ) {
final int count = dc_gain_counts.get( dc );
if ( histogram.containsKey( count ) ) {
more_than_once.add( dc );
if ( protein_length_stats_by_dc != null ) {
final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc );
- final double[] a = s.getDataAsDoubleArray();
- for( final double element : a ) {
+ for( final double element : s.getData() ) {
gained_multiple_times_lengths_stats.addValue( element );
}
}
if ( domain_number_stats_by_dc != null ) {
final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc );
- final double[] a = s.getDataAsDoubleArray();
- for( final double element : a ) {
+ for( final double element : s.getData() ) {
gained_multiple_times_domain_count_stats.addValue( element );
}
}
final String[] ds = dc.split( "=" );
final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] );
final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] );
- final double[] a0 = s0.getDataAsDoubleArray();
- final double[] a1 = s1.getDataAsDoubleArray();
- for( final double element : a0 ) {
- gained_multiple_times_domain_length_stats.addValue( element );
+ for( final double element : s0.getData() ) {
+ gained_multiple_times_domain_length_sum += element;
+ ++gained_multiple_times_domain_length_count;
}
- for( final double element : a1 ) {
- gained_multiple_times_domain_length_stats.addValue( element );
+ for( final double element : s1.getData() ) {
+ gained_multiple_times_domain_length_sum += element;
+ ++gained_multiple_times_domain_length_count;
}
}
}
else {
if ( protein_length_stats_by_dc != null ) {
final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc );
- final double[] a = s.getDataAsDoubleArray();
- for( final double element : a ) {
+ for( final double element : s.getData() ) {
gained_once_lengths_stats.addValue( element );
}
}
if ( domain_number_stats_by_dc != null ) {
final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc );
- final double[] a = s.getDataAsDoubleArray();
- for( final double element : a ) {
+ for( final double element : s.getData() ) {
gained_once_domain_count_stats.addValue( element );
}
}
final String[] ds = dc.split( "=" );
final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] );
final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] );
- final double[] a0 = s0.getDataAsDoubleArray();
- final double[] a1 = s1.getDataAsDoubleArray();
- for( final double element : a0 ) {
- gained_once_domain_length_stats.addValue( element );
+ for( final double element : s0.getData() ) {
+ gained_once_domain_length_sum += element;
+ ++gained_once_domain_length_count;
}
- for( final double element : a1 ) {
- gained_once_domain_length_stats.addValue( element );
+ for( final double element : s1.getData() ) {
+ gained_once_domain_length_sum += element;
+ ++gained_once_domain_length_count;
}
}
}
nodes.add( n );
}
}
- for( int i = 0; i < nodes.size() - 1; ++i ) {
+ for( int i = 0; i < ( nodes.size() - 1 ); ++i ) {
for( int j = i + 1; j < nodes.size(); ++j ) {
- final PhylogenyNode lca = PhylogenyMethods.getInstance().obtainLCA( nodes.get( i ),
- nodes.get( j ) );
+ final PhylogenyNode lca = PhylogenyMethods.calculateLCA( nodes.get( i ), nodes.get( j ) );
String rank = "unknown";
if ( lca.getNodeData().isHasTaxonomy()
&& !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getRank() ) ) {
out_for_rank_counts.close();
out_for_ancestor_species_counts.close();
if ( !ForesterUtil.isEmpty( outfilename_for_protein_stats )
- && ( ( protein_length_stats_by_dc != null ) || ( domain_number_stats_by_dc != null ) ) ) {
+ && ( ( domain_length_stats_by_domain != null ) || ( protein_length_stats_by_dc != null ) || ( domain_number_stats_by_dc != null ) ) ) {
final BufferedWriter w = new BufferedWriter( new FileWriter( outfilename_for_protein_stats ) );
w.write( "Domain Lengths: " );
w.write( "\n" );
w.write( "\n" );
w.write( "Gained once, domain lengths:" );
w.write( "\n" );
- w.write( gained_once_domain_length_stats.toString() );
+ w.write( "N: " + gained_once_domain_length_count );
+ w.write( "\n" );
+ w.write( "Avg: " + ( ( double ) gained_once_domain_length_sum / gained_once_domain_length_count ) );
w.write( "\n" );
w.write( "\n" );
w.write( "Gained multiple times, domain lengths:" );
w.write( "\n" );
- w.write( gained_multiple_times_domain_length_stats.toString() );
+ w.write( "N: " + gained_multiple_times_domain_length_count );
+ w.write( "\n" );
+ w.write( "Avg: "
+ + ( ( double ) gained_multiple_times_domain_length_sum / gained_multiple_times_domain_length_count ) );
w.write( "\n" );
w.write( "\n" );
w.write( "\n" );
public static Phylogeny createNjTreeBasedOnMatrixToFile( final File nj_tree_outfile, final DistanceMatrix distance ) {
checkForOutputFileWriteability( nj_tree_outfile );
final NeighborJoining nj = NeighborJoining.createInstance();
- final Phylogeny phylogeny = nj.execute( distance );
+ final Phylogeny phylogeny = nj.execute( ( BasicSymmetricalDistanceMatrix ) distance );
phylogeny.setName( nj_tree_outfile.getName() );
writePhylogenyToFile( phylogeny, nj_tree_outfile.toString() );
return phylogeny;
final DomainId domain_id,
final Writer out,
final String separator,
- final String limit_to_species ) throws IOException {
+ final String limit_to_species,
+ final double domain_e_cutoff ) throws IOException {
+ System.out.println( "Per domain E-value: " + domain_e_cutoff );
for( final Species species : protein_lists_per_species.keySet() ) {
+ System.out.println( species + ":" );
for( final Protein protein : protein_lists_per_species.get( species ) ) {
if ( ForesterUtil.isEmpty( limit_to_species )
|| protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
final List<Domain> domains = protein.getProteinDomains( domain_id );
if ( domains.size() > 0 ) {
- final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
- for( final Domain domain : domains ) {
- stats.addValue( domain.getPerSequenceEvalue() );
- }
out.write( protein.getSpecies().getSpeciesId() );
out.write( separator );
out.write( protein.getProteinId().getId() );
out.write( separator );
- out.write( "[" + FORMATTER.format( stats.median() ) + "]" );
+ out.write( domain_id.toString() );
+ out.write( separator );
+ int prev_to = -1;
+ for( final Domain domain : domains ) {
+ if ( ( domain_e_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= domain_e_cutoff ) ) {
+ out.write( "/" );
+ out.write( domain.getFrom() + "-" + domain.getTo() );
+ if ( prev_to >= 0 ) {
+ final int l = domain.getFrom() - prev_to;
+ System.out.println( l );
+ }
+ prev_to = domain.getTo();
+ }
+ }
+ out.write( "/" );
out.write( separator );
+ final List<Domain> domain_list = new ArrayList<Domain>();
+ for( final Domain domain : protein.getProteinDomains() ) {
+ if ( ( domain_e_cutoff < 0 ) || ( domain.getPerDomainEvalue() <= domain_e_cutoff ) ) {
+ domain_list.add( domain );
+ }
+ }
+ final Domain domain_ary[] = new Domain[ domain_list.size() ];
+ for( int i = 0; i < domain_list.size(); ++i ) {
+ domain_ary[ i ] = domain_list.get( i );
+ }
+ Arrays.sort( domain_ary, new DomainComparator( true ) );
+ out.write( "{" );
+ boolean first = true;
+ for( final Domain domain : domain_ary ) {
+ if ( first ) {
+ first = false;
+ }
+ else {
+ out.write( "," );
+ }
+ out.write( domain.getDomainId().toString() );
+ out.write( ":" + domain.getFrom() + "-" + domain.getTo() );
+ out.write( ":" + domain.getPerDomainEvalue() );
+ }
+ out.write( "}" );
if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
.equals( SurfacingConstants.NONE ) ) ) {
out.write( protein.getDescription() );
+ all_pfams_encountered.size() );
ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without a mapping : "
+ pfams_without_mappings_counter + " ["
- + ( 100 * pfams_without_mappings_counter / all_pfams_encountered.size() ) + "%]" );
+ + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without mapping to proc. or func. : "
+ pfams_without_mappings_to_bp_or_mf_counter + " ["
- + ( 100 * pfams_without_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" );
- ForesterUtil.programMessage( surfacing.PRG_NAME,
- "Pfams with a mapping : " + pfams_with_mappings_counter
- + " ["
- + ( 100 * pfams_with_mappings_counter / all_pfams_encountered.size() )
- + "%]" );
+ + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping : "
+ + pfams_with_mappings_counter + " ["
+ + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping to proc. or func. : "
+ pfams_with_mappings_to_bp_or_mf_counter + " ["
- + ( 100 * pfams_with_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" );
- ForesterUtil.programMessage( surfacing.PRG_NAME,
- "Pfams with mapping to biological process: " + biological_process_counter
- + " ["
- + ( 100 * biological_process_counter / all_pfams_encountered.size() )
- + "%]" );
- ForesterUtil.programMessage( surfacing.PRG_NAME,
- "Pfams with mapping to molecular function: " + molecular_function_counter
- + " ["
- + ( 100 * molecular_function_counter / all_pfams_encountered.size() )
- + "%]" );
- ForesterUtil.programMessage( surfacing.PRG_NAME,
- "Pfams with mapping to cellular component: " + cellular_component_counter
- + " ["
- + ( 100 * cellular_component_counter / all_pfams_encountered.size() )
- + "%]" );
+ + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to biological process: "
+ + biological_process_counter + " ["
+ + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" );
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to molecular function: "
+ + molecular_function_counter + " ["
+ + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" );
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to cellular component: "
+ + cellular_component_counter + " ["
+ + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" );
summary_writer.write( ForesterUtil.LINE_SEPARATOR );
summary_writer.write( "# Sum of Pfams encountered : " + all_pfams_encountered.size() );
summary_writer.write( ForesterUtil.LINE_SEPARATOR );
summary_writer.write( "# Pfams without a mapping : " + pfams_without_mappings_counter
- + " [" + ( 100 * pfams_without_mappings_counter / all_pfams_encountered.size() ) + "%]" );
+ + " [" + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
summary_writer.write( ForesterUtil.LINE_SEPARATOR );
summary_writer.write( "# Pfams without mapping to proc. or func. : "
+ pfams_without_mappings_to_bp_or_mf_counter + " ["
- + ( 100 * pfams_without_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" );
+ + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
summary_writer.write( ForesterUtil.LINE_SEPARATOR );
summary_writer.write( "# Pfams with a mapping : " + pfams_with_mappings_counter + " ["
- + ( 100 * pfams_with_mappings_counter / all_pfams_encountered.size() ) + "%]" );
+ + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" );
summary_writer.write( ForesterUtil.LINE_SEPARATOR );
summary_writer.write( "# Pfams with a mapping to proc. or func. : "
+ pfams_with_mappings_to_bp_or_mf_counter + " ["
- + ( 100 * pfams_with_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" );
+ + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" );
summary_writer.write( ForesterUtil.LINE_SEPARATOR );
summary_writer.write( "# Pfams with mapping to biological process: " + biological_process_counter + " ["
- + ( 100 * biological_process_counter / all_pfams_encountered.size() ) + "%]" );
+ + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" );
summary_writer.write( ForesterUtil.LINE_SEPARATOR );
summary_writer.write( "# Pfams with mapping to molecular function: " + molecular_function_counter + " ["
- + ( 100 * molecular_function_counter / all_pfams_encountered.size() ) + "%]" );
+ + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" );
summary_writer.write( ForesterUtil.LINE_SEPARATOR );
summary_writer.write( "# Pfams with mapping to cellular component: " + cellular_component_counter + " ["
- + ( 100 * cellular_component_counter / all_pfams_encountered.size() ) + "%]" );
+ + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" );
summary_writer.write( ForesterUtil.LINE_SEPARATOR );
summary_writer.close();
}
e.printStackTrace();
}
}
+
+ /**
+  * Comparator that orders {@link Domain} objects by the value returned
+  * from {@code getFrom()}. The direction (ascending or descending) is
+  * fixed at construction time; domains with equal {@code getFrom()}
+  * values compare as equal.
+  */
+ static final class DomainComparator implements Comparator<Domain> {
+
+     private final boolean _ascending;
+
+     /**
+      * @param ascending
+      *            {@code true} to place smaller {@code getFrom()} values
+      *            first, {@code false} to place larger values first
+      */
+     public DomainComparator( final boolean ascending ) {
+         _ascending = ascending;
+     }
+
+     /**
+      * @return a negative value, zero, or a positive value according to
+      *         the configured direction; zero when both domains report
+      *         the same {@code getFrom()} value
+      */
+     @Override
+     public final int compare( final Domain d0, final Domain d1 ) {
+         final int from0 = d0.getFrom();
+         final int from1 = d1.getFrom();
+         if ( from0 == from1 ) {
+             return 0;
+         }
+         // Sign for ascending order; flipped below when descending is requested.
+         final int natural_order = ( from0 < from1 ) ? -1 : 1;
+         return _ascending ? natural_order : -natural_order;
+     }
+ }
}