X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fmsa%2FMsaMethods.java;h=2b470203d611294f1de19114e5226e0102d17208;hb=fbb7c0a322111e5221773fed19591da29296efb5;hp=6499d928235ffb1cffd83ba93b5f47d0e21b5d1c;hpb=db8b10baefc3e1b5fee9b8192657ecc97ca5b86d;p=jalview.git diff --git a/forester/java/src/org/forester/msa/MsaMethods.java b/forester/java/src/org/forester/msa/MsaMethods.java index 6499d92..2b47020 100644 --- a/forester/java/src/org/forester/msa/MsaMethods.java +++ b/forester/java/src/org/forester/msa/MsaMethods.java @@ -26,6 +26,7 @@ package org.forester.msa; import java.util.ArrayList; +import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -33,7 +34,7 @@ import java.util.SortedMap; import java.util.TreeMap; import org.forester.sequence.BasicSequence; -import org.forester.sequence.Sequence; +import org.forester.sequence.MolecularSequence; import org.forester.util.BasicDescriptiveStatistics; import org.forester.util.DescriptiveStatistics; @@ -41,76 +42,15 @@ public final class MsaMethods { private ArrayList _ignored_seqs_ids; - synchronized public ArrayList getIgnoredSequenceIds() { - return _ignored_seqs_ids; - } - - synchronized public static MsaMethods createInstance() { - return new MsaMethods(); - } - private MsaMethods() { init(); } - synchronized private void init() { - _ignored_seqs_ids = new ArrayList(); - } - @Override public Object clone() { throw new NoSuchMethodError(); } - public static int calcGapSumPerColumn( final Msa msa, final int col ) { - int gap_rows = 0; - for( int j = 0; j < msa.getNumberOfSequences(); ++j ) { - if ( msa.isGapAt( j, col ) ) { - gap_rows++; - } - } - return gap_rows; - } - - final public static Msa removeSequence( final Msa msa, final String to_remove_id ) { - final List seqs = new ArrayList(); - for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { - if ( !to_remove_id.equals( msa.getIdentifier( row ) ) ) { - seqs.add( msa.getSequence( row ) ); - } - } - if ( seqs.size() < 1 ) { - return null; - } - return BasicMsa.createInstance( seqs ); - } - - final public static Msa removeSequences( final Msa msa, final List to_remove_ids ) { - final List seqs = new ArrayList(); - for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { - if ( !to_remove_ids.contains( msa.getIdentifier( row ) ) ) { - seqs.add( msa.getSequence( row ) ); - } - } - if ( seqs.size() < 1 ) { - return null; - } - return BasicMsa.createInstance( seqs ); - } - - final public static Msa removeSequencesByRow( final Msa msa, final List to_remove_rows ) { - final List seqs = new ArrayList(); - for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { - if ( !to_remove_rows.contains( row ) ) { - seqs.add( msa.getSequence( row ) ); - } - } - if ( seqs.size() < 1 ) { - return null; - } - return BasicMsa.createInstance( seqs ); - } - synchronized final public Msa deleteGapColumns( final double max_allowed_gap_ratio, final int min_allowed_length, final Msa msa ) { @@ -127,7 +67,7 @@ public final class MsaMethods { ++new_length; } } - final List seqs = new ArrayList( msa.getNumberOfSequences() ); + final List seqs = new ArrayList( msa.getNumberOfSequences() ); for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { final char[] mol_seq = new char[ new_length ]; int new_col = 0; @@ -136,7 +76,7 @@ public final class MsaMethods { if ( !delete_cols[ col ] ) { final char residue = msa.getResidueAt( row, col ); mol_seq[ new_col++ ] = ( residue ); - if ( residue != Sequence.GAP ) { + if ( residue != MolecularSequence.GAP ) { ++non_gap_cols_sum; } } @@ -159,7 +99,142 @@ public final class MsaMethods { return BasicMsa.createInstance( seqs ); } - public static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) { + synchronized public ArrayList getIgnoredSequenceIds() { + return _ignored_seqs_ids; + } + + synchronized private void init() { + _ignored_seqs_ids = new ArrayList(); + } + + public static final DescriptiveStatistics calcNumberOfGapsStats( final Msa msa ) { + final int[] gaps = calcNumberOfGapsInMsa( msa ); + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( final int gap : gaps ) { + stats.addValue( gap ); + } + return stats; + } + + public static final int[] calcNumberOfGapsInMsa( final Msa msa ) { + final int seqs = msa.getNumberOfSequences(); + final int[] gaps= new int[ seqs ]; + for( int i = 0; i < seqs; ++i ) { + gaps[ i ] = calcNumberOfGaps( msa.getSequence( i ) ); + } + return gaps; + } + + + + public final static int calcNumberOfGaps( final MolecularSequence seq ) { + int gaps = 0; + boolean was_gap = false; + for( int i = 0; i < seq.getLength(); ++i ) { + if ( seq.isGapAt( i ) ) { + if ( !was_gap ) { + ++gaps; + was_gap = true; + } + } + else { + was_gap = false; + } + } + return gaps; + } + + public static DescriptiveStatistics calcBasicGapinessStatistics( final Msa msa ) { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( int i = 0; i < msa.getLength(); ++i ) { + stats.addValue( ( double ) calcGapSumPerColumn( msa, i ) / msa.getNumberOfSequences() ); + } + return stats; + } + + public static double calcGapRatio( final Msa msa ) { + int gaps = 0; + for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) { + for( int i = 0; i < msa.getLength(); ++i ) { + if ( msa.getResidueAt( seq, i ) == MolecularSequence.GAP ) { + gaps++; + } + } + } + return ( double ) gaps / ( msa.getLength() * msa.getNumberOfSequences() ); + } + + public static int calcGapSumPerColumn( final Msa msa, final int col ) { + int gap_rows = 0; + for( int j = 0; j < msa.getNumberOfSequences(); ++j ) { + if ( msa.isGapAt( j, col ) ) { + gap_rows++; + } + } + return gap_rows; + } + + final public static double calcNormalizedShannonsEntropy( final int k, final Msa msa ) { + double s = 0; + for( int col = 0; col < msa.getLength(); ++col ) { + s += calcNormalizedShannonsEntropy( k, msa, col ); + } + return s / msa.getLength(); + } + + final public static double calcNormalizedShannonsEntropy( final int k, final Msa msa, final int col ) { + // http://www.ebi.ac.uk/thornton-srv/databases/valdarprograms/scorecons_server_help.html + // n: number of residues in column + // k: number of residue types + // na: number of residues of type a + // pa = na/n + // S=-sum pa log2 pa + double s = 0; + final double n = msa.getNumberOfSequences(); + HashMap dist = null; + if ( k == 6 ) { + dist = calcResidueDistribution6( msa, col ); + } + else if ( k == 7 ) { + dist = calcResidueDistribution7( msa, col ); + } + else if ( k == 20 ) { + dist = calcResidueDistribution20( msa, col ); + } + else if ( k == 21 ) { + dist = calcResidueDistribution21( msa, col ); + } + else { + throw new IllegalArgumentException( "illegal value for k: " + k ); + } + if ( dist.size() == 1 ) { + return 0; + } + // if ( dist.size() == n ) { + // return 0; + // } + for( final int na : dist.values() ) { + final double pa = na / n; + s += pa * Math.log( pa ); + } + if ( n < k ) { + return -( s / ( Math.log( n ) ) ); + } + else { + return -( s / ( Math.log( k ) ) ); + } + } + + final public static DescriptiveStatistics calculateEffectiveLengthStatistics( final Msa msa ) { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { + final MolecularSequence s = msa.getSequence( row ); + stats.addValue( s.getLength() - s.getNumberOfGapResidues() ); + } + return stats; + } + + final public static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) { final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); for( int c = from; c <= to; ++c ) { stats.addValue( calculateIdentityRatio( msa, c ) ); @@ -167,7 +242,7 @@ public final class MsaMethods { return stats; } - public static double calculateIdentityRatio( final Msa msa, final int column ) { + final public static double calculateIdentityRatio( final Msa msa, final int column ) { final SortedMap dist = calculateResidueDestributionPerColumn( msa, column ); int majority_count = 0; final Iterator> it = dist.entrySet().iterator(); @@ -183,7 +258,7 @@ public final class MsaMethods { public static SortedMap calculateResidueDestributionPerColumn( final Msa msa, final int column ) { final SortedMap map = new TreeMap(); for( final Character r : msa.getColumnAt( column ) ) { - if ( r != Sequence.GAP ) { + if ( r != MolecularSequence.GAP ) { if ( !map.containsKey( r ) ) { map.put( r, 1 ); } @@ -195,12 +270,34 @@ public final class MsaMethods { return map; } - public static DescriptiveStatistics calcBasicGapinessStatistics( final Msa msa ) { - final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); - for( int i = 0; i < msa.getLength(); ++i ) { - stats.addValue( ( double ) calcGapSumPerColumn( msa, i ) / msa.getNumberOfSequences() ); + synchronized public static MsaMethods createInstance() { + return new MsaMethods(); + } + + final public static Msa removeSequence( final Msa msa, final String to_remove_id ) { + final List seqs = new ArrayList(); + for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { + if ( !to_remove_id.equals( msa.getIdentifier( row ) ) ) { + seqs.add( msa.getSequence( row ) ); + } } - return stats; + if ( seqs.size() < 1 ) { + return null; + } + return BasicMsa.createInstance( seqs ); + } + + final public static Msa removeSequences( final Msa msa, final List to_remove_ids ) { + final List seqs = new ArrayList(); + for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { + if ( !to_remove_ids.contains( msa.getIdentifier( row ) ) ) { + seqs.add( msa.getSequence( row ) ); + } + } + if ( seqs.size() < 1 ) { + return null; + } + return BasicMsa.createInstance( seqs ); } public static Msa removeSequencesByMinimalLength( final Msa msa, final int min_effective_length ) { @@ -208,7 +305,7 @@ public final class MsaMethods { for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) { int eff_length = 0; for( int i = 0; i < msa.getLength(); ++i ) { - if ( msa.getResidueAt( seq, i ) != Sequence.GAP ) { + if ( msa.getResidueAt( seq, i ) != MolecularSequence.GAP ) { eff_length++; } } @@ -219,15 +316,135 @@ public final class MsaMethods { return removeSequencesByRow( msa, to_remove_rows ); } - public static double calcGapRatio( final Msa msa ) { - int gaps = 0; - for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) { - for( int i = 0; i < msa.getLength(); ++i ) { - if ( msa.getResidueAt( seq, i ) == Sequence.GAP ) { - gaps++; + final public static Msa removeSequencesByRow( final Msa msa, final List to_remove_rows ) { + final List seqs = new ArrayList(); + for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { + if ( !to_remove_rows.contains( row ) ) { + seqs.add( msa.getSequence( row ) ); + } + } + if ( seqs.size() < 1 ) { + return null; + } + return BasicMsa.createInstance( seqs ); + } + + final private static HashMap calcResidueDistribution20( final Msa msa, final int col ) { + final HashMap counts = new HashMap(); + for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { + final char c = msa.getResidueAt( row, col ); + if ( c != MolecularSequence.GAP ) { + if ( !counts.containsKey( c ) ) { + counts.put( c, 1 ); + } + else { + counts.put( c, 1 + counts.get( c ) ); } } } - return ( double ) gaps / ( msa.getLength() * msa.getNumberOfSequences() ); + return counts; + } + + final private static HashMap calcResidueDistribution21( final Msa msa, final int col ) { + final HashMap counts = new HashMap(); + for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { + final char c = msa.getResidueAt( row, col ); + if ( !counts.containsKey( c ) ) { + counts.put( c, 1 ); + } + else { + counts.put( c, 1 + counts.get( c ) ); + } + } + return counts; + } + + final private static HashMap calcResidueDistribution6( final Msa msa, final int col ) { + // Residues are classified into one of tex2html_wrap199 types: + // aliphatic [AVLIMC], aromatic [FWYH], polar [STNQ], positive [KR], negative [DE], + // special conformations [GP] and gaps. This convention follows that + // of Mirny & Shakhnovich (1999, J Mol Biol 291:177-196). + final HashMap counts = new HashMap(); + for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { + final char c = msa.getResidueAt( row, col ); + char x; + if ( ( c == 'A' ) || ( c == 'V' ) || ( c == 'L' ) || ( c == 'I' ) || ( c == 'M' ) || ( c == 'C' ) ) { + // aliphatic + x = 'a'; + } + else if ( ( c == 'F' ) || ( c == 'W' ) || ( c == 'Y' ) || ( c == 'H' ) ) { + // aromatic + x = 'r'; + } + else if ( ( c == 'S' ) || ( c == 'T' ) || ( c == 'N' ) || ( c == 'Q' ) ) { + // polar + x = 'p'; + } + else if ( ( c == 'K' ) || ( c == 'R' ) ) { + // positive + x = 'o'; + } + else if ( ( c == 'D' ) || ( c == 'E' ) ) { + // negative + x = 'e'; + } + else if ( ( c == 'G' ) || ( c == 'P' ) ) { + // aliphatic - special conformation + x = 's'; + } + else { + continue; + } + if ( !counts.containsKey( x ) ) { + counts.put( x, 1 ); + } + else { + counts.put( x, 1 + counts.get( x ) ); + } + } + return counts; + } + + final private static HashMap calcResidueDistribution7( final Msa msa, final int col ) { + // Residues are classified into one of tex2html_wrap199 types: + // aliphatic [AVLIMC], aromatic [FWYH], polar [STNQ], positive [KR], negative [DE], + // special conformations [GP] and gaps. This convention follows that + // of Mirny & Shakhnovich (1999, J Mol Biol 291:177-196). + final HashMap counts = new HashMap(); + for( int row = 0; row < msa.getNumberOfSequences(); ++row ) { + final char c = msa.getResidueAt( row, col ); + char x = '-'; + if ( ( c == 'A' ) || ( c == 'V' ) || ( c == 'L' ) || ( c == 'I' ) || ( c == 'M' ) || ( c == 'C' ) ) { + // aliphatic + x = 'a'; + } + else if ( ( c == 'F' ) || ( c == 'W' ) || ( c == 'Y' ) || ( c == 'H' ) ) { + // aromatic + x = 'r'; + } + else if ( ( c == 'S' ) || ( c == 'T' ) || ( c == 'N' ) || ( c == 'Q' ) ) { + // polar + x = 'p'; + } + else if ( ( c == 'K' ) || ( c == 'R' ) ) { + // positive + x = 'o'; + } + else if ( ( c == 'D' ) || ( c == 'E' ) ) { + // negative + x = 'e'; + } + else if ( ( c == 'G' ) || ( c == 'P' ) ) { + // aliphatic - special conformation + x = 's'; + } + if ( !counts.containsKey( x ) ) { + counts.put( x, 1 ); + } + else { + counts.put( x, 1 + counts.get( x ) ); + } + } + return counts; } }