// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
//
// Contact: phylosoft @ gmail . com
-// WWW: www.phylosoft.org/forester
+// WWW: https://sites.google.com/site/cmzmasek/home/software/forester
package org.forester.msa;
import java.util.ArrayList;
+import java.util.Iterator;
import java.util.List;
+import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
/**
 * Counts how many rows of the alignment have a gap in the given column.
 *
 * @param msa the alignment to inspect
 * @param col the column index to examine
 * @return the number of rows, out of msa.getNumberOfSequences(), that are a gap at col
 */
public static int calcGapSumPerColumn( final Msa msa, final int col ) {
int gap_rows = 0;
for( int j = 0; j < msa.getNumberOfSequences(); ++j ) {
// This diff swaps the direct comparison against Sequence.GAP for the alignment's own isGapAt query.
- if ( msa.getResidueAt( j, col ) == Sequence.GAP ) {
+ if ( msa.isGapAt( j, col ) ) {
gap_rows++;
}
}
return gap_rows;
}
- synchronized public Msa removeGapColumns( final double max_allowed_gap_ratio,
- final int min_allowed_length,
- final Msa msa ) {
+ /**
+  * Builds a new alignment from copies of every sequence of msa whose identifier
+  * is not equal to to_remove_id.
+  *
+  * @param msa the alignment to filter
+  * @param to_remove_id the identifier of the sequence(s) to leave out
+  * @return a new Msa created from the remaining sequence copies, or null if no sequence remains
+  */
+ final public static Msa removeSequence( final Msa msa, final String to_remove_id ) {
+     final List<Sequence> kept = new ArrayList<Sequence>();
+     for( int r = 0; r < msa.getNumberOfSequences(); ++r ) {
+         if ( to_remove_id.equals( msa.getIdentifier( r ) ) ) {
+             // This row is the one being removed.
+             continue;
+         }
+         kept.add( BasicSequence.copySequence( msa.getSequence( r ) ) );
+     }
+     // Nothing left to build an alignment from.
+     return kept.isEmpty() ? null : BasicMsa.createInstance( kept );
+ }
+
+ /**
+  * Builds a new alignment from copies of every sequence of msa whose identifier
+  * is not contained in to_remove_ids.
+  *
+  * @param msa the alignment to filter
+  * @param to_remove_ids the identifiers of the sequences to leave out
+  * @return a new Msa created from the remaining sequence copies, or null if no sequence remains
+  */
+ final public static Msa removeSequences( final Msa msa, final List<String> to_remove_ids ) {
+     final List<Sequence> kept = new ArrayList<Sequence>();
+     for( int r = 0; r < msa.getNumberOfSequences(); ++r ) {
+         final boolean drop = to_remove_ids.contains( msa.getIdentifier( r ) );
+         if ( drop ) {
+             continue;
+         }
+         kept.add( BasicSequence.copySequence( msa.getSequence( r ) ) );
+     }
+     // Nothing left to build an alignment from.
+     return kept.isEmpty() ? null : BasicMsa.createInstance( kept );
+ }
+
+ /**
+  * Builds a new alignment from copies of every sequence of msa whose row index
+  * is not contained in to_remove_rows.
+  *
+  * @param msa the alignment to filter
+  * @param to_remove_rows the row indices of the sequences to leave out
+  * @return a new Msa created from the remaining sequence copies, or null if no sequence remains
+  */
+ final public static Msa removeSequencesByRow( final Msa msa, final List<Integer> to_remove_rows ) {
+     final List<Sequence> kept = new ArrayList<Sequence>();
+     for( int r = 0; r < msa.getNumberOfSequences(); ++r ) {
+         // Box explicitly so the lookup is by value (List.contains), never by position.
+         if ( to_remove_rows.contains( Integer.valueOf( r ) ) ) {
+             continue;
+         }
+         kept.add( BasicSequence.copySequence( msa.getSequence( r ) ) );
+     }
+     // Nothing left to build an alignment from.
+     return kept.isEmpty() ? null : BasicMsa.createInstance( kept );
+ }
+
/**
 * Marks alignment columns for deletion according to their gap ratio and returns a new alignment.
 *
 * NOTE(review): only part of this method is visible in this diff excerpt. The lines between
 * the range check and the column loop, and between the column loop and the final return, are
 * elided; the closing brace of the range check, any use of min_allowed_length, and the
 * construction of seqs are not shown here -- confirm against the full file.
 *
 * @param max_allowed_gap_ratio gap-ratio threshold; must lie in [0, 1]
 * @param min_allowed_length not used in the visible lines
 * @param msa the alignment whose columns are examined
 * @return the alignment created by BasicMsa.createInstance( seqs )
 * @throws IllegalArgumentException if max_allowed_gap_ratio is below 0 or above 1
 */
+ synchronized final public Msa removeGapColumns( final double max_allowed_gap_ratio,
+ final int min_allowed_length,
+ final Msa msa ) {
init();
if ( ( max_allowed_gap_ratio < 0 ) || ( max_allowed_gap_ratio > 1 ) ) {
throw new IllegalArgumentException( "max allowed gap ration is out of range: " + max_allowed_gap_ratio );
// One deletion flag per column; new_length counts the columns that are kept.
final boolean[] delete_cols = new boolean[ msa.getLength() ];
int new_length = 0;
for( int col = 0; col < msa.getLength(); ++col ) {
// The cast to double forces floating-point division of the gap count by the row count.
// NOTE(review): this diff changes the comparison from > to >=, so a column whose gap ratio
// equals the threshold is now deleted; with a threshold of 0 every column of a non-empty
// alignment is flagged, including columns without any gap -- confirm this is intended.
- delete_cols[ col ] = ( ( double ) calcGapSumPerColumn( msa, col ) / msa.getNumberOfSequences() ) > max_allowed_gap_ratio;
+ delete_cols[ col ] = ( ( double ) calcGapSumPerColumn( msa, col ) / msa.getNumberOfSequences() ) >= max_allowed_gap_ratio;
if ( !delete_cols[ col ] ) {
++new_length;
}
return BasicMsa.createInstance( seqs );
}
- public static SortedMap<Character, Integer> calculateResidueDestributionPerColumn( final Msa msa, final int c ) {
+ /**
+  * Computes, for one column, the share of sequences that carry the column's most frequent
+  * character, as reported by calculateResidueDestributionPerColumn.
+  *
+  * @param msa the alignment to inspect
+  * @param column the column index to examine
+  * @return the largest per-character count in the column divided by msa.getNumberOfSequences()
+  */
+ public static double calculateIdentityRatio( final Msa msa, final int column ) {
+     int top_count = 0;
+     // Only the counts matter here; which character they belong to is irrelevant.
+     for( final Integer count : calculateResidueDestributionPerColumn( msa, column ).values() ) {
+         top_count = Math.max( top_count, count );
+     }
+     return ( double ) top_count / msa.getNumberOfSequences();
+ }
+
// Collects the characters of one alignment column into a sorted map (character -> count).
// This diff renames the column parameter from c to column.
// NOTE(review): the body is only partly visible in this diff excerpt. No branch that
// increments the count of an already-seen character is shown, and the returned name
// stats is not declared in the visible lines -- the tail appears to come from a later
// hunk. Confirm against the full file.
+ public static SortedMap<Character, Integer> calculateResidueDestributionPerColumn( final Msa msa, final int column ) {
final SortedMap<Character, Integer> map = new TreeMap<Character, Integer>();
- for( final Character r : msa.getColumnAt( c ) ) {
+ for( final Character r : msa.getColumnAt( column ) ) {
// First occurrence of this character in the column: start its count at 1.
if ( !map.containsKey( r ) ) {
map.put( r, 1 );
}
}
return stats;
}
+
+ /**
+  * Removes every sequence whose number of non-gap positions is below min_effective_length.
+  *
+  * @param msa the alignment to filter
+  * @param min_effective_length the smallest number of non-gap positions a sequence must have to be kept
+  * @return the result of removeSequencesByRow for the rows that are too short
+  */
+ public static Msa removeSequencesByMinimalLength( final Msa msa, final int min_effective_length ) {
+     final List<Integer> short_rows = new ArrayList<Integer>();
+     for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+         int non_gap_count = 0;
+         for( int col = 0; col < msa.getLength(); ++col ) {
+             if ( msa.getResidueAt( row, col ) == Sequence.GAP ) {
+                 // Gaps do not contribute to the effective length.
+                 continue;
+             }
+             ++non_gap_count;
+         }
+         if ( non_gap_count < min_effective_length ) {
+             short_rows.add( row );
+         }
+     }
+     return removeSequencesByRow( msa, short_rows );
+ }
+
+ /**
+  * Computes the fraction of all alignment cells that are gaps.
+  *
+  * The cell count and the gap count are kept as long so that alignments with more than
+  * Integer.MAX_VALUE cells do not wrap the denominator.
+  *
+  * @param msa the alignment to inspect
+  * @return the number of cells equal to Sequence.GAP divided by (length * number of sequences);
+  *         NaN when the alignment has no cells, because that is a 0.0 / 0 floating-point division
+  */
+ public static double calcGapRatio( final Msa msa ) {
+     final int rows = msa.getNumberOfSequences();
+     final int cols = msa.getLength();
+     long gaps = 0;
+     for( int seq = 0; seq < rows; ++seq ) {
+         for( int i = 0; i < cols; ++i ) {
+             if ( msa.getResidueAt( seq, i ) == Sequence.GAP ) {
+                 gaps++;
+             }
+         }
+     }
+     // Widen before multiplying: an int product of cols * rows wraps past Integer.MAX_VALUE.
+     final long cells = ( long ) cols * rows;
+     return ( double ) gaps / cells;
+ }
}