X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fmsa%2FMsaMethods.java;h=1a94b74f7ce0fa7d22ec49fe20d5ce39177ad899;hb=fda4b2cd36f7c4d7edf6381268ebcf5fbbc77297;hp=716c8431211cffb62118b3f183c7667bb61ac36d;hpb=7476d4a39fc5840955b57289b862bf08b8419fd7;p=jalview.git

diff --git a/forester/java/src/org/forester/msa/MsaMethods.java b/forester/java/src/org/forester/msa/MsaMethods.java
index 716c843..1a94b74 100644
--- a/forester/java/src/org/forester/msa/MsaMethods.java
+++ b/forester/java/src/org/forester/msa/MsaMethods.java
@@ -21,7 +21,7 @@
 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 //
 // Contact: phylosoft @ gmail . com
-// WWW: www.phylosoft.org/forester
+// WWW: https://sites.google.com/site/cmzmasek/home/software/forester
 
 package org.forester.msa;
 
@@ -65,16 +65,55 @@ public final class MsaMethods {
     public static int calcGapSumPerColumn( final Msa msa, final int col ) {
         int gap_rows = 0;
         for( int j = 0; j < msa.getNumberOfSequences(); ++j ) {
-            if ( msa.getResidueAt( j, col ) == Sequence.GAP ) {
+            if ( msa.isGapAt( j, col ) ) {
                 gap_rows++;
             }
         }
         return gap_rows;
     }
 
-    synchronized public Msa removeGapColumns( final double max_allowed_gap_ratio,
-                                              final int min_allowed_length,
-                                              final Msa msa ) {
+    final public static Msa removeSequence( final Msa msa, final String to_remove_id ) {
+        final List seqs = new ArrayList();
+        for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+            if ( !to_remove_id.equals( msa.getIdentifier( row ) ) ) {
+                seqs.add( msa.getSequence( row ) );
+            }
+        }
+        if ( seqs.size() < 1 ) {
+            return null;
+        }
+        return BasicMsa.createInstance( seqs );
+    }
+
+    final public static Msa removeSequences( final Msa msa, final List to_remove_ids ) {
+        final List seqs = new ArrayList();
+        for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+            if ( !to_remove_ids.contains( msa.getIdentifier( row ) ) ) {
+                seqs.add( msa.getSequence( row ) );
+            }
+        }
+        if ( seqs.size() < 1 ) {
+            return null;
+        }
+        return BasicMsa.createInstance( seqs );
+    }
+
+    final public static Msa removeSequencesByRow( final Msa msa, final List to_remove_rows ) {
+        final List seqs = new ArrayList();
+        for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+            if ( !to_remove_rows.contains( row ) ) {
+                seqs.add( msa.getSequence( row ) );
+            }
+        }
+        if ( seqs.size() < 1 ) {
+            return null;
+        }
+        return BasicMsa.createInstance( seqs );
+    }
+
+    synchronized final public Msa deleteGapColumns( final double max_allowed_gap_ratio,
+                                                    final int min_allowed_length,
+                                                    final Msa msa ) {
         init();
         if ( ( max_allowed_gap_ratio < 0 ) || ( max_allowed_gap_ratio > 1 ) ) {
             throw new IllegalArgumentException( "max allowed gap ration is out of range: " + max_allowed_gap_ratio );
@@ -120,7 +159,24 @@ public final class MsaMethods {
         return BasicMsa.createInstance( seqs );
     }
 
-    public static double calculateIdentityRatio( final Msa msa, final int column ) {
+    final public static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) {
+        final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
+        for( int c = from; c <= to; ++c ) {
+            stats.addValue( calculateIdentityRatio( msa, c ) );
+        }
+        return stats;
+    }
+
+    final public static DescriptiveStatistics calculateEffectiveLengthStatistics( final Msa msa ) {
+        final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
+        for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+            final Sequence s = msa.getSequence( row );
+            stats.addValue( s.getLength() - s.getNumberOfGapResidues() );
+        }
+        return stats;
+    }
+
+    final public static double calculateIdentityRatio( final Msa msa, final int column ) {
         final SortedMap dist = calculateResidueDestributionPerColumn( msa, column );
         int majority_count = 0;
         final Iterator> it = dist.entrySet().iterator();
@@ -136,11 +192,13 @@ public final class MsaMethods {
     public static SortedMap calculateResidueDestributionPerColumn( final Msa msa, final int column ) {
         final SortedMap map = new TreeMap();
         for( final Character r : msa.getColumnAt( column ) ) {
-            if ( !map.containsKey( r ) ) {
-                map.put( r, 1 );
-            }
-            else {
-                map.put( r, map.get( r ) + 1 );
+            if ( r != Sequence.GAP ) {
+                if ( !map.containsKey( r ) ) {
+                    map.put( r, 1 );
+                }
+                else {
+                    map.put( r, map.get( r ) + 1 );
+                }
             }
         }
         return map;
@@ -153,4 +211,32 @@ public final class MsaMethods {
         }
         return stats;
     }
+
+    public static Msa removeSequencesByMinimalLength( final Msa msa, final int min_effective_length ) {
+        final List to_remove_rows = new ArrayList();
+        for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) {
+            int eff_length = 0;
+            for( int i = 0; i < msa.getLength(); ++i ) {
+                if ( msa.getResidueAt( seq, i ) != Sequence.GAP ) {
+                    eff_length++;
+                }
+            }
+            if ( eff_length < min_effective_length ) {
+                to_remove_rows.add( seq );
+            }
+        }
+        return removeSequencesByRow( msa, to_remove_rows );
+    }
+
+    public static double calcGapRatio( final Msa msa ) {
+        int gaps = 0;
+        for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) {
+            for( int i = 0; i < msa.getLength(); ++i ) {
+                if ( msa.getResidueAt( seq, i ) == Sequence.GAP ) {
+                    gaps++;
+                }
+            }
+        }
+        return ( double ) gaps / ( msa.getLength() * msa.getNumberOfSequences() );
+    }
 }