final static private String STEP_FOR_DIAGNOSTICS_OPTION = "sd";
final static private String MIN_LENGTH_OPTION = "ml";
final static private String GAP_RATIO_LENGTH_OPTION = "gr";
- final static private String REPORT_ALN_MEAN_IDENTITY = "q";
+ final static private String REPORT_ENTROPY = "e";
final static private String OUTPUT_FORMAT_PHYLIP_OPTION = "p";
final static private String OUTPUT_REMOVED_SEQS_OPTION = "ro";
final static private String MAFFT_OPTIONS = "mo";
int step_for_diagnostics = 1;
int min_length = -1;
double gap_ratio = -1;
- boolean report_aln_mean_identity = false;
+ boolean report_entropy = false;
MSA_FORMAT output_format = MSA_FORMAT.FASTA;
File removed_seqs_out_base = null;
String mafft_options = "--auto";
allowed_options.add( STEP_FOR_DIAGNOSTICS_OPTION );
allowed_options.add( MIN_LENGTH_OPTION );
allowed_options.add( GAP_RATIO_LENGTH_OPTION );
- allowed_options.add( REPORT_ALN_MEAN_IDENTITY );
+ allowed_options.add( REPORT_ENTROPY );
allowed_options.add( OUTPUT_FORMAT_PHYLIP_OPTION );
allowed_options.add( OUTPUT_REMOVED_SEQS_OPTION );
allowed_options.add( MAFFT_OPTIONS );
ForesterUtil.fatalError( PRG_NAME, "gap ratio is out of range: " + gap_ratio );
}
}
- if ( cla.isOptionSet( REPORT_ALN_MEAN_IDENTITY ) ) {
- report_aln_mean_identity = true;
+ if ( cla.isOptionSet( REPORT_ENTROPY ) ) {
+ report_entropy = true;
}
if ( cla.isOptionSet( OUTPUT_FORMAT_PHYLIP_OPTION ) ) {
output_format = MSA_FORMAT.PHYLIP;
}
}
System.out.println( "Step for diagnostics reports : " + step_for_diagnostics );
- System.out.println( "Calculate mean identity : " + report_aln_mean_identity );
+ System.out.println( "Calculate normalized Shannon Entropy : " + report_entropy );
if ( !norm ) {
System.out.println( "Normalize : " + norm );
}
}
mc.setStep( step );
mc.setStepForDiagnostics( step_for_diagnostics );
- mc.setReportAlnMeanIdentity( report_aln_mean_identity );
+ mc.setCalculateNormalizedShannonEntropy( report_entropy );
mc.setPeformPhylogenticInference( perform_phylogenetic_inference );
if ( ( worst_remove > 0 ) || ( av_gap > 0 ) || ( length > 0 ) ) {
mc.setOutputFormat( output_format );
else {
msa_props = mc.chart( step, realign, norm );
}
- Chart.display( msa_props, initial_number_of_seqs, report_aln_mean_identity, in.getName() );
+ Chart.display( msa_props, initial_number_of_seqs, report_entropy, in.getName() );
}
catch ( final IllegalArgumentException iae ) {
// iae.printStackTrace(); //TODO remove me
System.out.println( " -" + STEP_OPTION + "=<integer> step for output and re-aligning (default: 1)" );
System.out.println( " -" + STEP_FOR_DIAGNOSTICS_OPTION
+ "=<integer> step for diagnostics reports (default: 1)" );
- System.out
- .println( " -"
- + REPORT_ALN_MEAN_IDENTITY
- + " to calculate mean MSA column identity (\"MSA quality\") (not recommended for very large alignments)" );
+ System.out.println( " -" + REPORT_ENTROPY
+ + " to calculate normalized Shannon Entropy (not recommended for very large alignments)" );
System.out.println( " -" + OUTPUT_FORMAT_PHYLIP_OPTION
+ " to write output alignments in phylip format instead of fasta" );
System.out.println( " -" + OUTPUT_REMOVED_SEQS_OPTION + "=<file> to output the removed sequences" );
package org.forester.msa;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
private ArrayList<String> _ignored_seqs_ids;
- synchronized public ArrayList<String> getIgnoredSequenceIds() {
- return _ignored_seqs_ids;
- }
-
- synchronized public static MsaMethods createInstance() {
- return new MsaMethods();
- }
-
private MsaMethods() {
init();
}
- synchronized private void init() {
- _ignored_seqs_ids = new ArrayList<String>();
- }
-
@Override
public Object clone() {
throw new NoSuchMethodError();
}
- public static int calcGapSumPerColumn( final Msa msa, final int col ) {
- int gap_rows = 0;
- for( int j = 0; j < msa.getNumberOfSequences(); ++j ) {
- if ( msa.isGapAt( j, col ) ) {
- gap_rows++;
- }
- }
- return gap_rows;
- }
-
- final public static Msa removeSequence( final Msa msa, final String to_remove_id ) {
- final List<Sequence> seqs = new ArrayList<Sequence>();
- for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
- if ( !to_remove_id.equals( msa.getIdentifier( row ) ) ) {
- seqs.add( msa.getSequence( row ) );
- }
- }
- if ( seqs.size() < 1 ) {
- return null;
- }
- return BasicMsa.createInstance( seqs );
- }
-
- final public static Msa removeSequences( final Msa msa, final List<String> to_remove_ids ) {
- final List<Sequence> seqs = new ArrayList<Sequence>();
- for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
- if ( !to_remove_ids.contains( msa.getIdentifier( row ) ) ) {
- seqs.add( msa.getSequence( row ) );
- }
- }
- if ( seqs.size() < 1 ) {
- return null;
- }
- return BasicMsa.createInstance( seqs );
- }
-
- final public static Msa removeSequencesByRow( final Msa msa, final List<Integer> to_remove_rows ) {
- final List<Sequence> seqs = new ArrayList<Sequence>();
- for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
- if ( !to_remove_rows.contains( row ) ) {
- seqs.add( msa.getSequence( row ) );
- }
- }
- if ( seqs.size() < 1 ) {
- return null;
- }
- return BasicMsa.createInstance( seqs );
- }
-
synchronized final public Msa deleteGapColumns( final double max_allowed_gap_ratio,
final int min_allowed_length,
final Msa msa ) {
return BasicMsa.createInstance( seqs );
}
- final public static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) {
+ synchronized public ArrayList<String> getIgnoredSequenceIds() {
+ return _ignored_seqs_ids;
+ }
+
+ synchronized private void init() {
+ _ignored_seqs_ids = new ArrayList<String>();
+ }
+
+ public static DescriptiveStatistics calcBasicGapinessStatistics( final Msa msa ) {
final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
- for( int c = from; c <= to; ++c ) {
- stats.addValue( calculateIdentityRatio( msa, c ) );
+ for( int i = 0; i < msa.getLength(); ++i ) {
+ stats.addValue( ( double ) calcGapSumPerColumn( msa, i ) / msa.getNumberOfSequences() );
}
return stats;
}
+ public static double calcGapRatio( final Msa msa ) {
+ int gaps = 0;
+ for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) {
+ for( int i = 0; i < msa.getLength(); ++i ) {
+ if ( msa.getResidueAt( seq, i ) == Sequence.GAP ) {
+ gaps++;
+ }
+ }
+ }
+ return ( double ) gaps / ( msa.getLength() * msa.getNumberOfSequences() );
+ }
+
+ public static int calcGapSumPerColumn( final Msa msa, final int col ) {
+ int gap_rows = 0;
+ for( int j = 0; j < msa.getNumberOfSequences(); ++j ) {
+ if ( msa.isGapAt( j, col ) ) {
+ gap_rows++;
+ }
+ }
+ return gap_rows;
+ }
+
+ final public static double calcNormalizedShannonsEntropy( final int k, final Msa msa ) {
+ double s = 0;
+ for( int col = 0; col < msa.getLength(); ++col ) {
+ s += calcNormalizedShannonsEntropy( k, msa, col );
+ }
+ return s / msa.getLength();
+ }
+
+ final public static double calcNormalizedShannonsEntropy( final int k, final Msa msa, final int col ) {
+ // http://www.ebi.ac.uk/thornton-srv/databases/valdarprograms/scorecons_server_help.html
+ // n: number of residues in column
+ // k: number of residue types
+ // na: number of residues of type a
+ // pa = na/n
+ // S=-sum pa log2 pa
+ double s = 0;
+ final double n = msa.getNumberOfSequences();
+ HashMap<Character, Integer> dist = null;
+ if ( k == 6 ) {
+ dist = calcResidueDistribution6( msa, col );
+ }
+ else if ( k == 7 ) {
+ dist = calcResidueDistribution7( msa, col );
+ }
+ else if ( k == 20 ) {
+ dist = calcResidueDistribution20( msa, col );
+ }
+ else if ( k == 21 ) {
+ dist = calcResidueDistribution21( msa, col );
+ }
+ else {
+ throw new IllegalArgumentException( "illegal value for k: " + k );
+ }
+ if ( dist.size() == 1 ) {
+ return 0;
+ }
+ // if ( dist.size() == n ) {
+ // return 0;
+ // }
+ for( final int na : dist.values() ) {
+ final double pa = na / n;
+ s += pa * Math.log( pa );
+ }
+ if ( n < k ) {
+ return -( s / ( Math.log( n ) ) );
+ }
+ else {
+ return -( s / ( Math.log( k ) ) );
+ }
+ }
+
final public static DescriptiveStatistics calculateEffectiveLengthStatistics( final Msa msa ) {
final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
return stats;
}
+ final public static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) {
+ final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
+ for( int c = from; c <= to; ++c ) {
+ stats.addValue( calculateIdentityRatio( msa, c ) );
+ }
+ return stats;
+ }
+
final public static double calculateIdentityRatio( final Msa msa, final int column ) {
final SortedMap<Character, Integer> dist = calculateResidueDestributionPerColumn( msa, column );
int majority_count = 0;
return map;
}
- public static DescriptiveStatistics calcBasicGapinessStatistics( final Msa msa ) {
- final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
- for( int i = 0; i < msa.getLength(); ++i ) {
- stats.addValue( ( double ) calcGapSumPerColumn( msa, i ) / msa.getNumberOfSequences() );
+ synchronized public static MsaMethods createInstance() {
+ return new MsaMethods();
+ }
+
+ final public static Msa removeSequence( final Msa msa, final String to_remove_id ) {
+ final List<Sequence> seqs = new ArrayList<Sequence>();
+ for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+ if ( !to_remove_id.equals( msa.getIdentifier( row ) ) ) {
+ seqs.add( msa.getSequence( row ) );
+ }
}
- return stats;
+ if ( seqs.size() < 1 ) {
+ return null;
+ }
+ return BasicMsa.createInstance( seqs );
+ }
+
+ final public static Msa removeSequences( final Msa msa, final List<String> to_remove_ids ) {
+ final List<Sequence> seqs = new ArrayList<Sequence>();
+ for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+ if ( !to_remove_ids.contains( msa.getIdentifier( row ) ) ) {
+ seqs.add( msa.getSequence( row ) );
+ }
+ }
+ if ( seqs.size() < 1 ) {
+ return null;
+ }
+ return BasicMsa.createInstance( seqs );
}
public static Msa removeSequencesByMinimalLength( final Msa msa, final int min_effective_length ) {
return removeSequencesByRow( msa, to_remove_rows );
}
- public static double calcGapRatio( final Msa msa ) {
- int gaps = 0;
- for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) {
- for( int i = 0; i < msa.getLength(); ++i ) {
- if ( msa.getResidueAt( seq, i ) == Sequence.GAP ) {
- gaps++;
+ final public static Msa removeSequencesByRow( final Msa msa, final List<Integer> to_remove_rows ) {
+ final List<Sequence> seqs = new ArrayList<Sequence>();
+ for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+ if ( !to_remove_rows.contains( row ) ) {
+ seqs.add( msa.getSequence( row ) );
+ }
+ }
+ if ( seqs.size() < 1 ) {
+ return null;
+ }
+ return BasicMsa.createInstance( seqs );
+ }
+
+ final private static HashMap<Character, Integer> calcResidueDistribution20( final Msa msa, final int col ) {
+ final HashMap<Character, Integer> counts = new HashMap<Character, Integer>();
+ for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+ final char c = msa.getResidueAt( row, col );
+ if ( c != Sequence.GAP ) {
+ if ( !counts.containsKey( c ) ) {
+ counts.put( c, 1 );
+ }
+ else {
+ counts.put( c, 1 + counts.get( c ) );
}
}
}
- return ( double ) gaps / ( msa.getLength() * msa.getNumberOfSequences() );
+ return counts;
+ }
+
+ final private static HashMap<Character, Integer> calcResidueDistribution21( final Msa msa, final int col ) {
+ final HashMap<Character, Integer> counts = new HashMap<Character, Integer>();
+ for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+ final char c = msa.getResidueAt( row, col );
+ if ( !counts.containsKey( c ) ) {
+ counts.put( c, 1 );
+ }
+ else {
+ counts.put( c, 1 + counts.get( c ) );
+ }
+ }
+ return counts;
+ }
+
+ final private static HashMap<Character, Integer> calcResidueDistribution6( final Msa msa, final int col ) {
+ // Residues are classified into one of tex2html_wrap199 types:
+ // aliphatic [AVLIMC], aromatic [FWYH], polar [STNQ], positive [KR], negative [DE],
+ // special conformations [GP] and gaps. This convention follows that
+ // of Mirny & Shakhnovich (1999, J Mol Biol 291:177-196).
+ final HashMap<Character, Integer> counts = new HashMap<Character, Integer>();
+ for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+ final char c = msa.getResidueAt( row, col );
+ char x;
+ if ( ( c == 'A' ) || ( c == 'V' ) || ( c == 'L' ) || ( c == 'I' ) || ( c == 'M' ) || ( c == 'C' ) ) {
+ // aliphatic
+ x = 'a';
+ }
+ else if ( ( c == 'F' ) || ( c == 'W' ) || ( c == 'Y' ) || ( c == 'H' ) ) {
+ // aromatic
+ x = 'r';
+ }
+ else if ( ( c == 'S' ) || ( c == 'T' ) || ( c == 'N' ) || ( c == 'Q' ) ) {
+ // polar
+ x = 'p';
+ }
+ else if ( ( c == 'K' ) || ( c == 'R' ) ) {
+ // positive
+ x = 'o';
+ }
+ else if ( ( c == 'D' ) || ( c == 'E' ) ) {
+ // negative
+ x = 'e';
+ }
+ else if ( ( c == 'G' ) || ( c == 'P' ) ) {
+ // aliphatic - special conformation
+ x = 's';
+ }
+ else {
+ continue;
+ }
+ if ( !counts.containsKey( x ) ) {
+ counts.put( x, 1 );
+ }
+ else {
+ counts.put( x, 1 + counts.get( x ) );
+ }
+ }
+ return counts;
+ }
+
+ final private static HashMap<Character, Integer> calcResidueDistribution7( final Msa msa, final int col ) {
+ // Residues are classified into one of tex2html_wrap199 types:
+ // aliphatic [AVLIMC], aromatic [FWYH], polar [STNQ], positive [KR], negative [DE],
+ // special conformations [GP] and gaps. This convention follows that
+ // of Mirny & Shakhnovich (1999, J Mol Biol 291:177-196).
+ final HashMap<Character, Integer> counts = new HashMap<Character, Integer>();
+ for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+ final char c = msa.getResidueAt( row, col );
+ char x = '-';
+ if ( ( c == 'A' ) || ( c == 'V' ) || ( c == 'L' ) || ( c == 'I' ) || ( c == 'M' ) || ( c == 'C' ) ) {
+ // aliphatic
+ x = 'a';
+ }
+ else if ( ( c == 'F' ) || ( c == 'W' ) || ( c == 'Y' ) || ( c == 'H' ) ) {
+ // aromatic
+ x = 'r';
+ }
+ else if ( ( c == 'S' ) || ( c == 'T' ) || ( c == 'N' ) || ( c == 'Q' ) ) {
+ // polar
+ x = 'p';
+ }
+ else if ( ( c == 'K' ) || ( c == 'R' ) ) {
+ // positive
+ x = 'o';
+ }
+ else if ( ( c == 'D' ) || ( c == 'E' ) ) {
+ // negative
+ x = 'e';
+ }
+ else if ( ( c == 'G' ) || ( c == 'P' ) ) {
+ // aliphatic - special conformation
+ x = 's';
+ }
+ if ( !counts.containsKey( x ) ) {
+ counts.put( x, 1 );
+ }
+ else {
+ counts.put( x, 1 + counts.get( x ) );
+ }
+ }
+ return counts;
}
}
import java.awt.BorderLayout;
import java.awt.event.ActionListener;
-import java.util.ArrayList;
import java.util.List;
import javax.swing.JDialog;
public final class Chart extends JDialog implements ActionListener {
- private static final long serialVersionUID = -5292420246132943515L;
- private ChartPanel _chart_panel = null;
- private final JMenuItem _m_exit = new JMenuItem();
- private List<MsaProperties> _msa_props;
- private final boolean _show_msa_qual;
- private final int _initial_number_of_seqs;
- private final String _title;
+ private static final long serialVersionUID = -5292420246132943515L;
+ private ChartPanel _chart_panel = null;
+ private final int _initial_number_of_seqs;
+ private final JMenuItem _m_exit = new JMenuItem();
+ private final List<MsaProperties> _msa_props;
+ private final boolean _show_msa_qual;
+ private final String _title;
private Chart( final List<MsaProperties> msa_props,
final int initial_number_of_seqs,
private ChartPanel obtainChartPanel() {
if ( _chart_panel == null ) {
final MultiScatterDataModel model = new MultiScatterDataModel();
- if ( ( _msa_props == null ) || _msa_props.isEmpty() ) {
- _msa_props = new ArrayList<MsaProperties>();
- final MsaProperties p0 = new MsaProperties( 10, 200, 0.5, 0.1, "" );
- final MsaProperties p1 = new MsaProperties( 9, 190, 0.49, 0.2, "" );
- final MsaProperties p2 = new MsaProperties( 8, 150, 0.2, 0.3, "" );
- final MsaProperties p3 = new MsaProperties( 7, 145, 0.2, 0.4, "" );
- _msa_props.add( p0 );
- _msa_props.add( p1 );
- _msa_props.add( p2 );
- _msa_props.add( p3 );
- }
final double[][] seqs_length = new double[ _msa_props.size() ][ 2 ];
+ int max_length = -1;
for( int i = 0; i < _msa_props.size(); ++i ) {
seqs_length[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences();
seqs_length[ i ][ 1 ] = _msa_props.get( i ).getLength();
+ if ( _msa_props.get( i ).getLength() > max_length ) {
+ max_length = _msa_props.get( i ).getLength();
+ }
}
model.addData( seqs_length, "Length" );
model.setSeriesLine( "Series " + "Length", true );
model.setSeriesMarker( "Series " + "Length", false );
final double[][] seqs_gaps = new double[ _msa_props.size() ][ 2 ];
+ double max_gap_ratio = -1;
+ double max_ent7 = -1;
+ double max_ent21 = -1;
+ for( int i = 0; i < _msa_props.size(); ++i ) {
+ if ( _msa_props.get( i ).getGapRatio() > max_gap_ratio ) {
+ max_gap_ratio = _msa_props.get( i ).getGapRatio();
+ }
+ if ( _show_msa_qual ) {
+ if ( _msa_props.get( i ).getEntropy7() > max_ent7 ) {
+ max_ent7 = _msa_props.get( i ).getEntropy7();
+ }
+ if ( _msa_props.get( i ).getEntropy21() > max_ent21 ) {
+ max_ent21 = _msa_props.get( i ).getEntropy21();
+ }
+ }
+ }
+ final double gap_ratio_factor = ( max_length / 2.0 ) / max_gap_ratio;
+ final double ent7_factor = ( max_length / 2.0 ) / max_ent7;
+ final double ent21_factor = ( max_length / 2.0 ) / max_ent21;
for( int i = 0; i < _msa_props.size(); ++i ) {
seqs_gaps[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences();
- seqs_gaps[ i ][ 1 ] = ForesterUtil.roundToInt( _msa_props.get( i ).getGapRatio() * 200 );
+ seqs_gaps[ i ][ 1 ] = ForesterUtil.roundToInt( _msa_props.get( i ).getGapRatio() * gap_ratio_factor );
}
model.addData( seqs_gaps, "Gap ratio" );
model.setSeriesLine( "Series " + "Gap ratio", true );
model.setSeriesMarker( "Series " + "Gap ratio", false );
if ( _show_msa_qual ) {
- final double[][] seqs_identity = new double[ _msa_props.size() ][ 2 ];
+ final double[][] entropy7 = new double[ _msa_props.size() ][ 2 ];
+ for( int i = 0; i < _msa_props.size(); ++i ) {
+ entropy7[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences();
+ entropy7[ i ][ 1 ] = ForesterUtil.roundToInt( _msa_props.get( i ).getEntropy7() * ent7_factor );
+ }
+ model.addData( entropy7, "Entropy norm 7" );
+ model.setSeriesLine( "Series " + "Entropy norm 7", true );
+ model.setSeriesMarker( "Series " + "Entropy norm 7", false );
+ //
+ final double[][] entropy21 = new double[ _msa_props.size() ][ 2 ];
for( int i = 0; i < _msa_props.size(); ++i ) {
- seqs_identity[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences();
- seqs_identity[ i ][ 1 ] = ForesterUtil
- .roundToInt( _msa_props.get( i ).getAverageIdentityRatio() * 200 );
+ entropy21[ i ][ 0 ] = _initial_number_of_seqs - _msa_props.get( i ).getNumberOfSequences();
+ entropy21[ i ][ 1 ] = ForesterUtil.roundToInt( _msa_props.get( i ).getEntropy21() * ent21_factor );
}
- model.addData( seqs_identity, "mean MSA column identity" );
- model.setSeriesLine( "Series " + "mean MSA column identity", true );
- model.setSeriesMarker( "Series " + "mean MSA column identity", false );
+ model.addData( entropy21, "Entropy norm 21" );
+ model.setSeriesLine( "Series " + "Entropy norm 21", true );
+ model.setSeriesMarker( "Series " + "Entropy norm 21", false );
}
final BoxCoordSystem coord = new BoxCoordSystem( model );
coord.setUnitFont( coord.getUnitFont().deriveFont( 16.0f ) );
public class MsaCompactor {
- final private static NumberFormat NF_3 = new DecimalFormat( "#.###" );
- final private static NumberFormat NF_4 = new DecimalFormat( "#.####" );
- private double _gap_ratio = -1;
+ final private static NumberFormat NF_3 = new DecimalFormat( "#.###" );
+ final private static NumberFormat NF_4 = new DecimalFormat( "#.####" );
+ private boolean _calculate_shannon_entropy = false;
//
- private String _infile_name = null;
+ private String _infile_name = null;
private final short _longest_id_length;
//
- private String _maffts_opts = "--auto";
- private int _min_length = -1;
- private DeleteableMsa _msa = null;
- private boolean _norm = true;
- private File _out_file_base = null;
- private MSA_FORMAT _output_format = MSA_FORMAT.FASTA;
- private String _path_to_mafft = null;
- private boolean _phylogentic_inference = false;
+ private String _maffts_opts = "--auto";
+ private DeleteableMsa _msa = null;
+ private boolean _norm = true;
+ private File _out_file_base = null;
+ private MSA_FORMAT _output_format = MSA_FORMAT.FASTA;
+ private String _path_to_mafft = null;
+ private boolean _phylogentic_inference = false;
//
- private boolean _realign = false;
+ private boolean _realign = false;
private final SortedSet<String> _removed_seq_ids;
private final ArrayList<Sequence> _removed_seqs;
- private File _removed_seqs_out_base = null;
- private boolean _report_aln_mean_identity = false;
- private int _step = -1;
- private int _step_for_diagnostics = -1;
+ private File _removed_seqs_out_base = null;
+ private int _step = -1;
+ private int _step_for_diagnostics = -1;
static {
NF_4.setRoundingMode( RoundingMode.HALF_UP );
NF_3.setRoundingMode( RoundingMode.HALF_UP );
if ( !_realign ) {
_step = -1;
}
- int x = ForesterUtil.roundToInt( _msa.getNumberOfSequences() / 20.0 );
- if ( x < 1 ) {
- x = 1;
+ int x = ForesterUtil.roundToInt( _msa.getNumberOfSequences() / 10.0 );
+ if ( x < 2 ) {
+ x = 2;
}
- MsaProperties msa_prop = new MsaProperties( _msa, "", _report_aln_mean_identity );
+ MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy );
msa_props.add( msa_prop );
printTableHeader();
printMsaProperties( msa_prop );
if ( realign && isPrintMsaStatsWriteOutfileAndRealign( i ) ) {
removeGapColumns();
realignWithMafft();
- msa_prop = new MsaProperties( _msa, id, _report_aln_mean_identity );
+ msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy );
msa_props.add( msa_prop );
printMsaProperties( msa_prop );
System.out.print( "(realigned)" );
}
else if ( isPrintMsaStats( i ) ) {
removeGapColumns();
- msa_prop = new MsaProperties( _msa, id, _report_aln_mean_identity );
+ msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy );
msa_props.add( msa_prop );
printMsaProperties( msa_prop );
System.out.println();
}
public final void removeSequencesByMinimalLength( final int min_effective_length ) {
- printMsaProperties( new MsaProperties( _msa, "", _report_aln_mean_identity ) );
+ printMsaProperties( new MsaProperties( _msa, "", _calculate_shannon_entropy ) );
System.out.println();
_msa = DeleteableMsa.createInstance( MsaMethods.removeSequencesByMinimalLength( _msa, min_effective_length ) );
removeGapColumns();
- printMsaProperties( new MsaProperties( _msa, "", _report_aln_mean_identity ) );
+ printMsaProperties( new MsaProperties( _msa, "", _calculate_shannon_entropy ) );
System.out.println();
}
phy = calcTree();
}
printTableHeader();
- MsaProperties msa_prop = new MsaProperties( _msa, "", _report_aln_mean_identity );
+ MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy );
msa_props.add( msa_prop );
printMsaProperties( msa_prop );
System.out.println();
System.out.println();
}
else if ( isPrintMsaStats( i ) ) {
- msa_prop = new MsaProperties( _msa, id, _report_aln_mean_identity );
+ msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy );
msa_props.add( msa_prop );
printMsaProperties( msa_prop );
System.out.println();
phy = calcTree();
}
printTableHeader();
- MsaProperties msa_prop = new MsaProperties( _msa, "", _report_aln_mean_identity );
+ MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy );
msa_props.add( msa_prop );
printMsaProperties( msa_prop );
System.out.println();
System.out.println();
}
else if ( isPrintMsaStats( i ) ) {
- msa_prop = new MsaProperties( _msa, id, _report_aln_mean_identity );
+ msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy );
printMsaProperties( msa_prop );
msa_props.add( msa_prop );
System.out.println();
phy = calcTree();
}
printTableHeader();
- MsaProperties msa_prop = new MsaProperties( _msa, "", _report_aln_mean_identity );
+ MsaProperties msa_prop = new MsaProperties( _msa, "", _calculate_shannon_entropy );
msa_props.add( msa_prop );
printMsaProperties( msa_prop );
System.out.println();
System.out.println();
}
else if ( isPrintMsaStats( i ) ) {
- msa_prop = new MsaProperties( _msa, id, _report_aln_mean_identity );
+ msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy );
msa_props.add( msa_prop );
printMsaProperties( msa_prop );
System.out.println();
return msa_props;
}
- public final void setGapRatio( final double gap_ratio ) {
- _gap_ratio = gap_ratio;
+ public final void setCalculateNormalizedShannonEntropy( final boolean calculate_shannon_entropy ) {
+ _calculate_shannon_entropy = calculate_shannon_entropy;
}
public void setInfileName( final String infile_name ) {
_maffts_opts = maffts_opts;
}
- public final void setMinLength( final int min_length ) {
- _min_length = min_length;
- }
-
public final void setNorm( final boolean norm ) {
_norm = norm;
}
_removed_seqs_out_base = removed_seqs_out_base;
}
- public final void setReportAlnMeanIdentity( final boolean report_aln_mean_identity ) {
- _report_aln_mean_identity = report_aln_mean_identity;
- }
-
public final void setStep( final int step ) {
_step = step;
}
sb.append( msa_properties.getLength() );
sb.append( "\t" );
sb.append( NF_4.format( msa_properties.getGapRatio() ) );
- if ( _report_aln_mean_identity ) {
+ if ( _calculate_shannon_entropy ) {
+ sb.append( "\t" );
+ sb.append( NF_4.format( msa_properties.getEntropy7() ) );
sb.append( "\t" );
- sb.append( NF_4.format( msa_properties.getAverageIdentityRatio() ) );
+ sb.append( NF_4.format( msa_properties.getEntropy21() ) );
}
return sb;
}
if ( realign ) {
realignWithMafft();
}
- final MsaProperties msa_prop = new MsaProperties( _msa, id, _report_aln_mean_identity );
+ final MsaProperties msa_prop = new MsaProperties( _msa, id, _calculate_shannon_entropy );
printMsaProperties( msa_prop );
final String s = writeOutfile();
System.out.print( "-> " + s + ( realign ? "\t(realigned)" : "" ) );
System.out.print( "\t" );
System.out.print( "Gaps" );
System.out.print( "\t" );
- if ( _report_aln_mean_identity ) {
- System.out.print( "MSA qual" );
+ if ( _calculate_shannon_entropy ) {
+ System.out.print( "entn7" );
+ System.out.print( "\t" );
+ System.out.print( "entn21" );
System.out.print( "\t" );
}
System.out.println();
public final class MsaProperties {
- final private double _average_identity_ratio;
+ final private double _entropy21;
+ final private double _entropy7;
final private double _gap_ratio;
final private int _length;
final private int _number_of_sequences;
public MsaProperties( final int number_of_sequences,
final int length,
final double gap_ratio,
- final double average_identity_ratio,
+ final double entropy7,
+ final double entropy21,
final String removed_seq ) {
_number_of_sequences = number_of_sequences;
_length = length;
_gap_ratio = gap_ratio;
- _average_identity_ratio = average_identity_ratio;
+ _entropy7 = entropy7;
+ _entropy21 = entropy21;
_removed_seq = removed_seq;
}
- public MsaProperties( final Msa msa, final String removed_seq, final boolean calculate_aln_mean_identity ) {
+ public MsaProperties( final Msa msa, final String removed_seq, final boolean calculate_normalized_shannon_entropy ) {
_number_of_sequences = msa.getNumberOfSequences();
_length = msa.getLength();
_gap_ratio = MsaMethods.calcGapRatio( msa );
_removed_seq = removed_seq;
- if ( calculate_aln_mean_identity ) {
- _average_identity_ratio = MsaMethods.calculateIdentityRatio( 0, msa.getLength() - 1, msa ).arithmeticMean();
+ if ( calculate_normalized_shannon_entropy ) {
+ _entropy7 = MsaMethods.calcNormalizedShannonsEntropy( 7, msa );
+ _entropy21 = MsaMethods.calcNormalizedShannonsEntropy( 21, msa );
}
else {
- _average_identity_ratio = -1;
+ _entropy7 = -1;
+ _entropy21 = -1;
}
}
- public final double getAverageIdentityRatio() {
- return _average_identity_ratio;
+ public final double getEntropy21() {
+ return _entropy21;
+ }
+
+ public final double getEntropy7() {
+ return _entropy7;
}
public final double getGapRatio() {
System.exit( -1 );
}
final long start_time = new Date().getTime();
+ System.out.print( "MSA entropy: " );
+ if ( Test.testMsaEntropy() ) {
+ System.out.println( "OK." );
+ succeeded++;
+ }
+ else {
+ System.out.println( "failed." );
+ failed++;
+ }
+ System.exit( 0 );
System.out.print( "Basic node methods: " );
if ( Test.testBasicNodeMethods() ) {
System.out.println( "OK." );
return true;
}
+ private static boolean testMsaEntropy() {
+ try {
+ final Sequence s0 = BasicSequence.createAaSequence( "a", "AAAAAAA" );
+ final Sequence s1 = BasicSequence.createAaSequence( "b", "AAAIACC" );
+ final Sequence s2 = BasicSequence.createAaSequence( "c", "AAIIIIF" );
+ final Sequence s3 = BasicSequence.createAaSequence( "d", "AIIIVVW" );
+ final List<Sequence> l = new ArrayList<Sequence>();
+ l.add( s0 );
+ l.add( s1 );
+ l.add( s2 );
+ l.add( s3 );
+ final Msa msa = BasicMsa.createInstance( l );
+ System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 20, msa, 0 ) );
+ System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 20, msa, 1 ) );
+ System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 20, msa, 2 ) );
+ System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 20, msa, 3 ) );
+ System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 20, msa, 4 ) );
+ System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 20, msa, 5 ) );
+ System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 20, msa, 6 ) );
+ System.out.println();
+ System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 6, msa, 0 ) );
+ System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 6, msa, 1 ) );
+ System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 6, msa, 2 ) );
+ System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 6, msa, 3 ) );
+ System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 6, msa, 4 ) );
+ System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 6, msa, 5 ) );
+ System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 6, msa, 6 ) );
+ final List<Sequence> l2 = new ArrayList<Sequence>();
+ l2.add( BasicSequence.createAaSequence( "1", "AAAAAAA" ) );
+ l2.add( BasicSequence.createAaSequence( "2", "AAAIACC" ) );
+ l2.add( BasicSequence.createAaSequence( "3", "AAIIIIF" ) );
+ l2.add( BasicSequence.createAaSequence( "4", "AIIIVVW" ) );
+ l2.add( BasicSequence.createAaSequence( "5", "AAAAAAA" ) );
+ l2.add( BasicSequence.createAaSequence( "6", "AAAIACC" ) );
+ l2.add( BasicSequence.createAaSequence( "7", "AAIIIIF" ) );
+ l2.add( BasicSequence.createAaSequence( "8", "AIIIVVW" ) );
+ l2.add( BasicSequence.createAaSequence( "9", "AAAAAAA" ) );
+ l2.add( BasicSequence.createAaSequence( "10", "AAAIACC" ) );
+ l2.add( BasicSequence.createAaSequence( "11", "AAIIIIF" ) );
+ l2.add( BasicSequence.createAaSequence( "12", "AIIIVVW" ) );
+ l2.add( BasicSequence.createAaSequence( "13", "AAIIIIF" ) );
+ l2.add( BasicSequence.createAaSequence( "14", "AIIIVVW" ) );
+ l2.add( BasicSequence.createAaSequence( "15", "AAAAAAA" ) );
+ l2.add( BasicSequence.createAaSequence( "16", "AAAIACC" ) );
+ l2.add( BasicSequence.createAaSequence( "17", "AAIIIIF" ) );
+ l2.add( BasicSequence.createAaSequence( "18", "AIIIVVW" ) );
+ l2.add( BasicSequence.createAaSequence( "19", "AAAAAAA" ) );
+ l2.add( BasicSequence.createAaSequence( "20", "AAAIACC" ) );
+ l2.add( BasicSequence.createAaSequence( "21", "AAIIIIF" ) );
+ l2.add( BasicSequence.createAaSequence( "22", "AIIIVVW" ) );
+ final Msa msa2 = BasicMsa.createInstance( l2 );
+ System.out.println();
+ System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 20, msa2, 0 ) );
+ System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 20, msa2, 1 ) );
+ System.out.println( MsaMethods.calcNormalizedShannonsEntropy( 20, msa2, 2 ) );
+ }
+ catch ( final Exception e ) {
+ e.printStackTrace( System.out );
+ return false;
+ }
+ return true;
+ }
+
private static boolean testDeleteableMsa() {
try {
final Sequence s0 = BasicSequence.createAaSequence( "a", "AAAA" );