// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
//
// Contact: phylosoft @ gmail . com
-// WWW: www.phylosoft.org/forester
+// WWW: https://sites.google.com/site/cmzmasek/home/software/forester
package org.forester.msa;
/**
 * Counts how many rows of the alignment have a gap in the given column.
 *
 * @param msa the alignment to inspect
 * @param col the index of the column to examine
 * @return the number of rows j (0 .. getNumberOfSequences() - 1) for which the gap test on ( j, col ) succeeds
 */
public static int calcGapSumPerColumn( final Msa msa, final int col ) {
int gap_rows = 0;
for( int j = 0; j < msa.getNumberOfSequences(); ++j ) {
// The removed line compared the residue with Sequence.GAP directly; the added line asks the Msa itself.
- if ( msa.getResidueAt( j, col ) == Sequence.GAP ) {
+ if ( msa.isGapAt( j, col ) ) {
gap_rows++;
}
}
return gap_rows;
}
- synchronized public Msa removeGapColumns( final double max_allowed_gap_ratio,
- final int min_allowed_length,
- final Msa msa ) {
+ /**
+  * Creates a new alignment containing every sequence of the given alignment
+  * whose identifier differs from the given one.
+  *
+  * @param msa the alignment to copy sequences from
+  * @param to_remove_id the identifier of the sequence(s) to leave out; compared with equals()
+  * @return a new Msa built from the remaining sequences, or null if no sequence remains
+  */
+ final public static Msa removeSequence( final Msa msa, final String to_remove_id ) {
+     final List<Sequence> kept = new ArrayList<Sequence>();
+     for( int i = 0; i < msa.getNumberOfSequences(); i++ ) {
+         if ( to_remove_id.equals( msa.getIdentifier( i ) ) ) {
+             // This row carries the unwanted identifier: leave it out.
+             continue;
+         }
+         kept.add( msa.getSequence( i ) );
+     }
+     // Nothing left to build an alignment from: signal this with null.
+     return kept.isEmpty() ? null : BasicMsa.createInstance( kept );
+ }
+
+ /**
+  * Creates a new alignment containing every sequence of the given alignment
+  * whose identifier is not listed in to_remove_ids.
+  *
+  * @param msa the alignment to copy sequences from
+  * @param to_remove_ids the identifiers of the sequences to leave out
+  * @return a new Msa built from the remaining sequences, or null if no sequence remains
+  * @throws NullPointerException if to_remove_ids is null
+  */
+ final public static Msa removeSequences( final Msa msa, final List<String> to_remove_ids ) {
+     // Copy the identifiers into a hash set once, so that the membership test in the
+     // loop is constant-time instead of a linear scan of the list for every row.
+     // Fully qualified names are used so that no additional import is required.
+     final java.util.Set<String> ids = new java.util.HashSet<String>( to_remove_ids );
+     final List<Sequence> seqs = new ArrayList<Sequence>();
+     for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+         if ( !ids.contains( msa.getIdentifier( row ) ) ) {
+             seqs.add( msa.getSequence( row ) );
+         }
+     }
+     if ( seqs.isEmpty() ) {
+         // Nothing left to build an alignment from.
+         return null;
+     }
+     return BasicMsa.createInstance( seqs );
+ }
+
+ /**
+  * Creates a new alignment containing every sequence of the given alignment
+  * whose row index is not listed in to_remove_rows.
+  *
+  * @param msa the alignment to copy sequences from
+  * @param to_remove_rows the row indices (0-based, as used by msa.getSequence()) to leave out
+  * @return a new Msa built from the remaining sequences, or null if no sequence remains
+  * @throws NullPointerException if to_remove_rows is null
+  */
+ final public static Msa removeSequencesByRow( final Msa msa, final List<Integer> to_remove_rows ) {
+     // Copy the indices into a hash set once, so that the membership test in the
+     // loop is constant-time instead of a linear scan of the list for every row.
+     // Fully qualified names are used so that no additional import is required.
+     final java.util.Set<Integer> rows = new java.util.HashSet<Integer>( to_remove_rows );
+     final List<Sequence> seqs = new ArrayList<Sequence>();
+     for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+         if ( !rows.contains( row ) ) {
+             seqs.add( msa.getSequence( row ) );
+         }
+     }
+     if ( seqs.isEmpty() ) {
+         // Nothing left to build an alignment from.
+         return null;
+     }
+     return BasicMsa.createInstance( seqs );
+ }
+
+ synchronized final public Msa deleteGapColumns( final double max_allowed_gap_ratio,
+ final int min_allowed_length,
+ final Msa msa ) {
init();
if ( ( max_allowed_gap_ratio < 0 ) || ( max_allowed_gap_ratio > 1 ) ) {
throw new IllegalArgumentException( "max allowed gap ration is out of range: " + max_allowed_gap_ratio );
return BasicMsa.createInstance( seqs );
}
- public static double calculateIdentityRatio( final Msa msa, final int column ) {
+ final public static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) {
+ final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
+ for( int c = from; c <= to; ++c ) {
+ stats.addValue( calculateIdentityRatio( msa, c ) );
+ }
+ return stats;
+ }
+
+ final public static DescriptiveStatistics calculateEffectiveLengthStatistics( final Msa msa ) {
+ final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
+ for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+ final Sequence s = msa.getSequence( row );
+ stats.addValue( s.getLength() - s.getNumberOfGapResidues() );
+ }
+ return stats;
+ }
+
+ final public static double calculateIdentityRatio( final Msa msa, final int column ) {
final SortedMap<Character, Integer> dist = calculateResidueDestributionPerColumn( msa, column );
int majority_count = 0;
final Iterator<Map.Entry<Character, Integer>> it = dist.entrySet().iterator();
/**
 * Tallies how often each residue character occurs in one column of the alignment.
 * In the added version of the body, characters equal to Sequence.GAP are not counted;
 * the removed version counted every character of the column.
 *
 * @param msa the alignment to inspect
 * @param column the index of the column to tally
 * @return a map, sorted by character, from residue to its number of occurrences in the column
 */
public static SortedMap<Character, Integer> calculateResidueDestributionPerColumn( final Msa msa, final int column ) {
final SortedMap<Character, Integer> map = new TreeMap<Character, Integer>();
for( final Character r : msa.getColumnAt( column ) ) {
- if ( !map.containsKey( r ) ) {
- map.put( r, 1 );
- }
- else {
- map.put( r, map.get( r ) + 1 );
// Gaps are skipped, so the resulting map only has entries for non-gap residues.
+ if ( r != Sequence.GAP ) {
// First occurrence starts the count at 1, later occurrences increment it.
+ if ( !map.containsKey( r ) ) {
+ map.put( r, 1 );
+ }
+ else {
+ map.put( r, map.get( r ) + 1 );
+ }
}
}
return map;
}
return stats;
}
+
+ /**
+  * Creates a new alignment without the sequences whose number of non-gap
+  * residues is below the given minimum.
+  *
+  * @param msa the alignment to filter
+  * @param min_effective_length the minimal number of non-gap residues a sequence needs to be kept
+  * @return the result of removeSequencesByRow() for the rows found to be too short
+  */
+ public static Msa removeSequencesByMinimalLength( final Msa msa, final int min_effective_length ) {
+     final List<Integer> too_short = new ArrayList<Integer>();
+     for( int row = 0; row < msa.getNumberOfSequences(); row++ ) {
+         // Count the positions of this row which are not gaps.
+         int residues = 0;
+         int col = 0;
+         while ( col < msa.getLength() ) {
+             residues += ( msa.getResidueAt( row, col ) != Sequence.GAP ) ? 1 : 0;
+             col++;
+         }
+         if ( residues < min_effective_length ) {
+             too_short.add( row );
+         }
+     }
+     return removeSequencesByRow( msa, too_short );
+ }
+
+ /**
+  * Calculates the fraction of all alignment positions which are gaps.
+  *
+  * @param msa the alignment to inspect
+  * @return the number of positions equal to Sequence.GAP divided by
+  *         ( length * number of sequences ); NaN if that product is zero
+  */
+ public static double calcGapRatio( final Msa msa ) {
+     // long counter: the number of gaps can exceed the int range for very large alignments.
+     long gaps = 0;
+     for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) {
+         for( int i = 0; i < msa.getLength(); ++i ) {
+             if ( msa.getResidueAt( seq, i ) == Sequence.GAP ) {
+                 gaps++;
+             }
+         }
+     }
+     // Widen before multiplying: an int product of length and sequence count can
+     // silently overflow and would then yield a wrong (possibly negative) ratio.
+     final long positions = ( long ) msa.getLength() * msa.getNumberOfSequences();
+     return ( double ) gaps / positions;
+ }
}