+ synchronized public ArrayList<String> getIgnoredSequenceIds() {
+ return _ignored_seqs_ids;
+ }
+
+ synchronized private void init() {
+ _ignored_seqs_ids = new ArrayList<String>();
+ }
+
+ public static final DescriptiveStatistics calcNumberOfGapsStats( final Msa msa ) {
+ final int[] gaps = calcNumberOfGapsInMsa( msa );
+ final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
+ for( final int gap : gaps ) {
+ stats.addValue( gap );
+ }
+ return stats;
+ }
+
+ public static final int[] calcNumberOfGapsInMsa( final Msa msa ) {
+ final int seqs = msa.getNumberOfSequences();
+ final int[] gaps= new int[ seqs ];
+ for( int i = 0; i < seqs; ++i ) {
+ gaps[ i ] = calcNumberOfGaps( msa.getSequence( i ) );
+ }
+ return gaps;
+ }
+
+
+
+ public final static int calcNumberOfGaps( final MolecularSequence seq ) {
+ int gaps = 0;
+ boolean was_gap = false;
+ for( int i = 0; i < seq.getLength(); ++i ) {
+ if ( seq.isGapAt( i ) ) {
+ if ( !was_gap ) {
+ ++gaps;
+ was_gap = true;
+ }
+ }
+ else {
+ was_gap = false;
+ }
+ }
+ return gaps;
+ }
+
+ public static DescriptiveStatistics calcBasicGapinessStatistics( final Msa msa ) {
+ final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
+ for( int i = 0; i < msa.getLength(); ++i ) {
+ stats.addValue( ( double ) calcGapSumPerColumn( msa, i ) / msa.getNumberOfSequences() );
+ }
+ return stats;
+ }
+
+ public static double calcGapRatio( final Msa msa ) {
+ int gaps = 0;
+ for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) {
+ for( int i = 0; i < msa.getLength(); ++i ) {
+ if ( msa.getResidueAt( seq, i ) == MolecularSequence.GAP ) {
+ gaps++;
+ }
+ }
+ }
+ return ( double ) gaps / ( msa.getLength() * msa.getNumberOfSequences() );
+ }
+
+ public static int calcGapSumPerColumn( final Msa msa, final int col ) {
+ int gap_rows = 0;
+ for( int j = 0; j < msa.getNumberOfSequences(); ++j ) {
+ if ( msa.isGapAt( j, col ) ) {
+ gap_rows++;
+ }
+ }
+ return gap_rows;
+ }
+
+ final public static double calcNormalizedShannonsEntropy( final int k, final Msa msa ) {
+ double s = 0;
+ for( int col = 0; col < msa.getLength(); ++col ) {
+ s += calcNormalizedShannonsEntropy( k, msa, col );
+ }
+ return s / msa.getLength();
+ }
+
+ final public static double calcNormalizedShannonsEntropy( final int k, final Msa msa, final int col ) {
+ // http://www.ebi.ac.uk/thornton-srv/databases/valdarprograms/scorecons_server_help.html
+ // n: number of residues in column
+ // k: number of residue types
+ // na: number of residues of type a
+ // pa = na/n
+ // S=-sum pa log2 pa
+ double s = 0;
+ final double n = msa.getNumberOfSequences();
+ HashMap<Character, Integer> dist = null;
+ if ( k == 6 ) {
+ dist = calcResidueDistribution6( msa, col );
+ }
+ else if ( k == 7 ) {
+ dist = calcResidueDistribution7( msa, col );
+ }
+ else if ( k == 20 ) {
+ dist = calcResidueDistribution20( msa, col );
+ }
+ else if ( k == 21 ) {
+ dist = calcResidueDistribution21( msa, col );
+ }
+ else {
+ throw new IllegalArgumentException( "illegal value for k: " + k );
+ }
+ if ( dist.size() == 1 ) {
+ return 0;
+ }
+ // if ( dist.size() == n ) {
+ // return 0;
+ // }
+ for( final int na : dist.values() ) {
+ final double pa = na / n;
+ s += pa * Math.log( pa );
+ }
+ if ( n < k ) {
+ return -( s / ( Math.log( n ) ) );
+ }
+ else {
+ return -( s / ( Math.log( k ) ) );
+ }
+ }
+
+ final public static DescriptiveStatistics calculateEffectiveLengthStatistics( final Msa msa ) {
+ final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
+ for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+ final MolecularSequence s = msa.getSequence( row );
+ stats.addValue( s.getLength() - s.getNumberOfGapResidues() );
+ }
+ return stats;
+ }
+
+ final public static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) {