2 // forester -- software libraries and applications
3 // for genomics and evolutionary biology research.
5 // Copyright (C) 2010 Christian M Zmasek
6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.msa;
28 import java.util.ArrayList;
29 import java.util.HashMap;
30 import java.util.Iterator;
31 import java.util.List;
33 import java.util.SortedMap;
34 import java.util.TreeMap;
36 import org.forester.sequence.BasicSequence;
37 import org.forester.sequence.MolecularSequence;
38 import org.forester.util.BasicDescriptiveStatistics;
39 import org.forester.util.DescriptiveStatistics;
41 public final class MsaMethods {
43 private ArrayList<String> _ignored_seqs_ids;
45 private MsaMethods() {
50 public Object clone() {
51 throw new NoSuchMethodError();
54 synchronized final public Msa deleteGapColumns( final double max_allowed_gap_ratio,
55 final int min_allowed_length,
58 if ( ( max_allowed_gap_ratio < 0 ) || ( max_allowed_gap_ratio > 1 ) ) {
59 throw new IllegalArgumentException( "max allowed gap ration is out of range: " + max_allowed_gap_ratio );
61 final boolean ignore_too_short_seqs = min_allowed_length > 0;
62 final boolean[] delete_cols = new boolean[ msa.getLength() ];
64 for( int col = 0; col < msa.getLength(); ++col ) {
65 delete_cols[ col ] = ( ( double ) calcGapSumPerColumn( msa, col ) / msa.getNumberOfSequences() ) > max_allowed_gap_ratio;
66 if ( !delete_cols[ col ] ) {
70 final List<MolecularSequence> seqs = new ArrayList<MolecularSequence>( msa.getNumberOfSequences() );
71 for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
72 final char[] mol_seq = new char[ new_length ];
74 int non_gap_cols_sum = 0;
75 for( int col = 0; col < msa.getLength(); ++col ) {
76 if ( !delete_cols[ col ] ) {
77 final char residue = msa.getResidueAt( row, col );
78 mol_seq[ new_col++ ] = ( residue );
79 if ( residue != MolecularSequence.GAP ) {
84 if ( ignore_too_short_seqs ) {
85 if ( non_gap_cols_sum >= min_allowed_length ) {
86 seqs.add( new BasicSequence( msa.getIdentifier( row ), mol_seq, msa.getType() ) );
89 _ignored_seqs_ids.add( msa.getIdentifier( row ).toString() );
93 seqs.add( new BasicSequence( msa.getIdentifier( row ), mol_seq, msa.getType() ) );
96 if ( seqs.size() < 1 ) {
99 return BasicMsa.createInstance( seqs );
102 synchronized public ArrayList<String> getIgnoredSequenceIds() {
103 return _ignored_seqs_ids;
106 synchronized private void init() {
107 _ignored_seqs_ids = new ArrayList<String>();
110 public static final DescriptiveStatistics calcNumberOfGapsPer100Stats( final Msa msa ) {
111 final int[] gaps = calcNumberOfGapsInMsa( msa );
112 final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
113 final double n = 100.0 / msa.getLength();
114 for( final int gap : gaps ) {
115 stats.addValue( n * gap );
120 public static final int[] calcNumberOfGapsInMsa( final Msa msa ) {
121 final int seqs = msa.getNumberOfSequences();
122 final int[] gaps= new int[ seqs ];
123 for( int i = 0; i < seqs; ++i ) {
124 gaps[ i ] = calcNumberOfGaps( msa.getSequence( i ) );
131 public final static int calcNumberOfGaps( final MolecularSequence seq ) {
133 boolean was_gap = false;
134 for( int i = 0; i < seq.getLength(); ++i ) {
135 if ( seq.isGapAt( i ) ) {
148 public static DescriptiveStatistics calcBasicGapinessStatistics( final Msa msa ) {
149 final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
150 for( int i = 0; i < msa.getLength(); ++i ) {
151 stats.addValue( ( double ) calcGapSumPerColumn( msa, i ) / msa.getNumberOfSequences() );
156 public static double calcGapRatio( final Msa msa ) {
158 for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) {
159 for( int i = 0; i < msa.getLength(); ++i ) {
160 if ( msa.getResidueAt( seq, i ) == MolecularSequence.GAP ) {
165 return ( double ) gaps / ( msa.getLength() * msa.getNumberOfSequences() );
168 public static int calcGapSumPerColumn( final Msa msa, final int col ) {
170 for( int j = 0; j < msa.getNumberOfSequences(); ++j ) {
171 if ( msa.isGapAt( j, col ) ) {
178 final public static double calcNormalizedShannonsEntropy( final int k, final Msa msa ) {
180 for( int col = 0; col < msa.getLength(); ++col ) {
181 s += calcNormalizedShannonsEntropy( k, msa, col );
183 return s / msa.getLength();
186 final public static double calcNormalizedShannonsEntropy( final int k, final Msa msa, final int col ) {
187 // http://www.ebi.ac.uk/thornton-srv/databases/valdarprograms/scorecons_server_help.html
188 // n: number of residues in column
189 // k: number of residue types
190 // na: number of residues of type a
194 final double n = msa.getNumberOfSequences();
195 HashMap<Character, Integer> dist = null;
197 dist = calcResidueDistribution6( msa, col );
200 dist = calcResidueDistribution7( msa, col );
202 else if ( k == 20 ) {
203 dist = calcResidueDistribution20( msa, col );
205 else if ( k == 21 ) {
206 dist = calcResidueDistribution21( msa, col );
209 throw new IllegalArgumentException( "illegal value for k: " + k );
211 if ( dist.size() == 1 ) {
214 // if ( dist.size() == n ) {
217 for( final int na : dist.values() ) {
218 final double pa = na / n;
219 s += pa * Math.log( pa );
222 return -( s / ( Math.log( n ) ) );
225 return -( s / ( Math.log( k ) ) );
229 final public static DescriptiveStatistics calculateEffectiveLengthStatistics( final Msa msa ) {
230 final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
231 for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
232 final MolecularSequence s = msa.getSequence( row );
233 stats.addValue( s.getLength() - s.getNumberOfGapResidues() );
238 final public static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) {
239 final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
240 for( int c = from; c <= to; ++c ) {
241 stats.addValue( calculateIdentityRatio( msa, c ) );
246 final public static double calculateIdentityRatio( final Msa msa, final int column ) {
247 final SortedMap<Character, Integer> dist = calculateResidueDestributionPerColumn( msa, column );
248 int majority_count = 0;
249 final Iterator<Map.Entry<Character, Integer>> it = dist.entrySet().iterator();
250 while ( it.hasNext() ) {
251 final Map.Entry<Character, Integer> pair = it.next();
252 if ( pair.getValue() > majority_count ) {
253 majority_count = pair.getValue();
256 return ( double ) majority_count / msa.getNumberOfSequences();
259 public static SortedMap<Character, Integer> calculateResidueDestributionPerColumn( final Msa msa, final int column ) {
260 final SortedMap<Character, Integer> map = new TreeMap<Character, Integer>();
261 for( final Character r : msa.getColumnAt( column ) ) {
262 if ( r != MolecularSequence.GAP ) {
263 if ( !map.containsKey( r ) ) {
267 map.put( r, map.get( r ) + 1 );
274 synchronized public static MsaMethods createInstance() {
275 return new MsaMethods();
278 final public static Msa removeSequence( final Msa msa, final String to_remove_id ) {
279 final List<MolecularSequence> seqs = new ArrayList<MolecularSequence>();
280 for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
281 if ( !to_remove_id.equals( msa.getIdentifier( row ) ) ) {
282 seqs.add( msa.getSequence( row ) );
285 if ( seqs.size() < 1 ) {
288 return BasicMsa.createInstance( seqs );
291 final public static Msa removeSequences( final Msa msa, final List<String> to_remove_ids ) {
292 final List<MolecularSequence> seqs = new ArrayList<MolecularSequence>();
293 for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
294 if ( !to_remove_ids.contains( msa.getIdentifier( row ) ) ) {
295 seqs.add( msa.getSequence( row ) );
298 if ( seqs.size() < 1 ) {
301 return BasicMsa.createInstance( seqs );
304 public static Msa removeSequencesByMinimalLength( final Msa msa, final int min_effective_length ) {
305 final List<Integer> to_remove_rows = new ArrayList<Integer>();
306 for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) {
308 for( int i = 0; i < msa.getLength(); ++i ) {
309 if ( msa.getResidueAt( seq, i ) != MolecularSequence.GAP ) {
313 if ( eff_length < min_effective_length ) {
314 to_remove_rows.add( seq );
317 return removeSequencesByRow( msa, to_remove_rows );
320 final public static Msa removeSequencesByRow( final Msa msa, final List<Integer> to_remove_rows ) {
321 final List<MolecularSequence> seqs = new ArrayList<MolecularSequence>();
322 for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
323 if ( !to_remove_rows.contains( row ) ) {
324 seqs.add( msa.getSequence( row ) );
327 if ( seqs.size() < 1 ) {
330 return BasicMsa.createInstance( seqs );
333 final private static HashMap<Character, Integer> calcResidueDistribution20( final Msa msa, final int col ) {
334 final HashMap<Character, Integer> counts = new HashMap<Character, Integer>();
335 for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
336 final char c = msa.getResidueAt( row, col );
337 if ( c != MolecularSequence.GAP ) {
338 if ( !counts.containsKey( c ) ) {
342 counts.put( c, 1 + counts.get( c ) );
349 final private static HashMap<Character, Integer> calcResidueDistribution21( final Msa msa, final int col ) {
350 final HashMap<Character, Integer> counts = new HashMap<Character, Integer>();
351 for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
352 final char c = msa.getResidueAt( row, col );
353 if ( !counts.containsKey( c ) ) {
357 counts.put( c, 1 + counts.get( c ) );
363 final private static HashMap<Character, Integer> calcResidueDistribution6( final Msa msa, final int col ) {
364 // Residues are classified into one of tex2html_wrap199 types:
365 // aliphatic [AVLIMC], aromatic [FWYH], polar [STNQ], positive [KR], negative [DE],
366 // special conformations [GP] and gaps. This convention follows that
367 // of Mirny & Shakhnovich (1999, J Mol Biol 291:177-196).
368 final HashMap<Character, Integer> counts = new HashMap<Character, Integer>();
369 for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
370 final char c = msa.getResidueAt( row, col );
372 if ( ( c == 'A' ) || ( c == 'V' ) || ( c == 'L' ) || ( c == 'I' ) || ( c == 'M' ) || ( c == 'C' ) ) {
376 else if ( ( c == 'F' ) || ( c == 'W' ) || ( c == 'Y' ) || ( c == 'H' ) ) {
380 else if ( ( c == 'S' ) || ( c == 'T' ) || ( c == 'N' ) || ( c == 'Q' ) ) {
384 else if ( ( c == 'K' ) || ( c == 'R' ) ) {
388 else if ( ( c == 'D' ) || ( c == 'E' ) ) {
392 else if ( ( c == 'G' ) || ( c == 'P' ) ) {
393 // aliphatic - special conformation
399 if ( !counts.containsKey( x ) ) {
403 counts.put( x, 1 + counts.get( x ) );
409 final private static HashMap<Character, Integer> calcResidueDistribution7( final Msa msa, final int col ) {
410 // Residues are classified into one of tex2html_wrap199 types:
411 // aliphatic [AVLIMC], aromatic [FWYH], polar [STNQ], positive [KR], negative [DE],
412 // special conformations [GP] and gaps. This convention follows that
413 // of Mirny & Shakhnovich (1999, J Mol Biol 291:177-196).
414 final HashMap<Character, Integer> counts = new HashMap<Character, Integer>();
415 for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
416 final char c = msa.getResidueAt( row, col );
418 if ( ( c == 'A' ) || ( c == 'V' ) || ( c == 'L' ) || ( c == 'I' ) || ( c == 'M' ) || ( c == 'C' ) ) {
422 else if ( ( c == 'F' ) || ( c == 'W' ) || ( c == 'Y' ) || ( c == 'H' ) ) {
426 else if ( ( c == 'S' ) || ( c == 'T' ) || ( c == 'N' ) || ( c == 'Q' ) ) {
430 else if ( ( c == 'K' ) || ( c == 'R' ) ) {
434 else if ( ( c == 'D' ) || ( c == 'E' ) ) {
438 else if ( ( c == 'G' ) || ( c == 'P' ) ) {
439 // aliphatic - special conformation
442 if ( !counts.containsKey( x ) ) {
446 counts.put( x, 1 + counts.get( x ) );