2 // forester -- software libraries and applications
3 // for genomics and evolutionary biology research.
5 // Copyright (C) 2010 Christian M Zmasek
6 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.msa;
28 import java.util.ArrayList;
29 import java.util.Iterator;
30 import java.util.List;
32 import java.util.SortedMap;
33 import java.util.TreeMap;
35 import org.forester.sequence.BasicSequence;
36 import org.forester.sequence.Sequence;
37 import org.forester.util.BasicDescriptiveStatistics;
38 import org.forester.util.DescriptiveStatistics;
40 public final class MsaMethods {
42 private ArrayList<String> _ignored_seqs_ids;
44 synchronized public ArrayList<String> getIgnoredSequenceIds() {
45 return _ignored_seqs_ids;
48 synchronized public static MsaMethods createInstance() {
49 return new MsaMethods();
52 private MsaMethods() {
56 synchronized private void init() {
57 _ignored_seqs_ids = new ArrayList<String>();
61 public Object clone() {
62 throw new NoSuchMethodError();
65 public static int calcGapSumPerColumn( final Msa msa, final int col ) {
67 for( int j = 0; j < msa.getNumberOfSequences(); ++j ) {
68 if ( msa.isGapAt( j, col ) ) {
75 final public static Msa removeSequence( final Msa msa, final String to_remove_id ) {
76 final List<Sequence> seqs = new ArrayList<Sequence>();
77 for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
78 if ( !to_remove_id.equals( msa.getIdentifier( row ) ) ) {
79 seqs.add( BasicSequence.copySequence( msa.getSequence( row ) ) );
82 if ( seqs.size() < 1 ) {
85 return BasicMsa.createInstance( seqs );
88 final public static Msa removeSequences( final Msa msa, final List<String> to_remove_ids ) {
89 final List<Sequence> seqs = new ArrayList<Sequence>();
90 for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
91 if ( !to_remove_ids.contains( msa.getIdentifier( row ) ) ) {
92 seqs.add( BasicSequence.copySequence( msa.getSequence( row ) ) );
95 if ( seqs.size() < 1 ) {
98 return BasicMsa.createInstance( seqs );
101 final public static Msa removeSequencesByRow( final Msa msa, final List<Integer> to_remove_rows ) {
102 final List<Sequence> seqs = new ArrayList<Sequence>();
103 for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
104 if ( !to_remove_rows.contains( row ) ) {
105 seqs.add( BasicSequence.copySequence( msa.getSequence( row ) ) );
108 if ( seqs.size() < 1 ) {
111 return BasicMsa.createInstance( seqs );
114 synchronized final public Msa removeGapColumns( final double max_allowed_gap_ratio,
115 final int min_allowed_length,
118 if ( ( max_allowed_gap_ratio < 0 ) || ( max_allowed_gap_ratio > 1 ) ) {
119 throw new IllegalArgumentException( "max allowed gap ration is out of range: " + max_allowed_gap_ratio );
121 final boolean ignore_too_short_seqs = min_allowed_length > 0;
122 final boolean[] delete_cols = new boolean[ msa.getLength() ];
124 for( int col = 0; col < msa.getLength(); ++col ) {
125 delete_cols[ col ] = ( ( double ) calcGapSumPerColumn( msa, col ) / msa.getNumberOfSequences() ) >= max_allowed_gap_ratio;
126 if ( !delete_cols[ col ] ) {
130 final List<Sequence> seqs = new ArrayList<Sequence>( msa.getNumberOfSequences() );
131 for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
132 final char[] mol_seq = new char[ new_length ];
134 int non_gap_cols_sum = 0;
135 for( int col = 0; col < msa.getLength(); ++col ) {
136 if ( !delete_cols[ col ] ) {
137 final char residue = msa.getResidueAt( row, col );
138 mol_seq[ new_col++ ] = ( residue );
139 if ( residue != Sequence.GAP ) {
144 if ( ignore_too_short_seqs ) {
145 if ( non_gap_cols_sum >= min_allowed_length ) {
146 seqs.add( new BasicSequence( msa.getIdentifier( row ), mol_seq, msa.getType() ) );
149 _ignored_seqs_ids.add( msa.getIdentifier( row ).toString() );
153 seqs.add( new BasicSequence( msa.getIdentifier( row ), mol_seq, msa.getType() ) );
156 if ( seqs.size() < 1 ) {
159 return BasicMsa.createInstance( seqs );
162 public static double calculateIdentityRatio( final Msa msa, final int column ) {
163 final SortedMap<Character, Integer> dist = calculateResidueDestributionPerColumn( msa, column );
164 int majority_count = 0;
165 final Iterator<Map.Entry<Character, Integer>> it = dist.entrySet().iterator();
166 while ( it.hasNext() ) {
167 final Map.Entry<Character, Integer> pair = it.next();
168 if ( pair.getValue() > majority_count ) {
169 majority_count = pair.getValue();
172 return ( double ) majority_count / msa.getNumberOfSequences();
175 public static SortedMap<Character, Integer> calculateResidueDestributionPerColumn( final Msa msa, final int column ) {
176 final SortedMap<Character, Integer> map = new TreeMap<Character, Integer>();
177 for( final Character r : msa.getColumnAt( column ) ) {
178 if ( !map.containsKey( r ) ) {
182 map.put( r, map.get( r ) + 1 );
188 public static DescriptiveStatistics calcBasicGapinessStatistics( final Msa msa ) {
189 final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
190 for( int i = 0; i < msa.getLength(); ++i ) {
191 stats.addValue( ( double ) calcGapSumPerColumn( msa, i ) / msa.getNumberOfSequences() );
196 public static Msa removeSequencesByMinimalLength( final Msa msa, final int min_effective_length ) {
197 final List<Integer> to_remove_rows = new ArrayList<Integer>();
198 for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) {
200 for( int i = 0; i < msa.getLength(); ++i ) {
201 if ( msa.getResidueAt( seq, i ) != Sequence.GAP ) {
205 if ( eff_length < min_effective_length ) {
206 to_remove_rows.add( seq );
209 return removeSequencesByRow( msa, to_remove_rows );
212 public static double calcGapRatio( final Msa msa ) {
214 for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) {
215 for( int i = 0; i < msa.getLength(); ++i ) {
216 if ( msa.getResidueAt( seq, i ) == Sequence.GAP ) {
221 return ( double ) gaps / ( msa.getLength() * msa.getNumberOfSequences() );