in progress...

[jalview.git] / forester / java / src / org / forester / msa / MsaMethods.java
diff --git a/forester/java/src/org/forester/msa/MsaMethods.java b/forester/java/src/org/forester/msa/MsaMethods.java

index 45c9697..2b47020 100644 (file)
--- a/forester/java/src/org/forester/msa/MsaMethods.java
+++ b/forester/java/src/org/forester/msa/MsaMethods.java
@@ -26,6 +26,7 @@
  package org.forester.msa;
  
  import java.util.ArrayList;
+import java.util.HashMap;
  import java.util.Iterator;
  import java.util.List;
  import java.util.Map;
@@ -33,7 +34,7 @@ import java.util.SortedMap;
  import java.util.TreeMap;
  
  import org.forester.sequence.BasicSequence;
-import org.forester.sequence.Sequence;
+import org.forester.sequence.MolecularSequence;
  import org.forester.util.BasicDescriptiveStatistics;
  import org.forester.util.DescriptiveStatistics;
  
@@ -41,77 +42,16 @@ public final class MsaMethods {
  
      private ArrayList<String> _ignored_seqs_ids;
  
-    synchronized public ArrayList<String> getIgnoredSequenceIds() {
-        return _ignored_seqs_ids;
-    }
-
-    synchronized public static MsaMethods createInstance() {
-        return new MsaMethods();
-    }
-
      private MsaMethods() {
          init();
      }
  
-    synchronized private void init() {
-        _ignored_seqs_ids = new ArrayList<String>();
-    }
-
      @Override
      public Object clone() {
          throw new NoSuchMethodError();
      }
  
-    public static int calcGapSumPerColumn( final Msa msa, final int col ) {
-        int gap_rows = 0;
-        for( int j = 0; j < msa.getNumberOfSequences(); ++j ) {
-            if ( msa.isGapAt( j, col ) ) {
-                gap_rows++;
-            }
-        }
-        return gap_rows;
-    }
-
-    final public static Msa removeSequence( final Msa msa, final String to_remove_id ) {
-        final List<Sequence> seqs = new ArrayList<Sequence>();
-        for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
-            if ( !to_remove_id.equals( msa.getIdentifier( row ) ) ) {
-                seqs.add( BasicSequence.copySequence( msa.getSequence( row ) ) );
-            }
-        }
-        if ( seqs.size() < 1 ) {
-            return null;
-        }
-        return BasicMsa.createInstance( seqs );
-    }
-
-    final public static Msa removeSequences( final Msa msa, final List<String> to_remove_ids ) {
-        final List<Sequence> seqs = new ArrayList<Sequence>();
-        for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
-            if ( !to_remove_ids.contains( msa.getIdentifier( row ) ) ) {
-                seqs.add( BasicSequence.copySequence( msa.getSequence( row ) ) );
-            }
-        }
-        if ( seqs.size() < 1 ) {
-            return null;
-        }
-        return BasicMsa.createInstance( seqs );
-    }
-
-    final public static Msa removeSequencesByRow( final Msa msa, final List<Integer> to_remove_rows ) {
-        final List<Sequence> seqs = new ArrayList<Sequence>();
-        for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
-            if ( !to_remove_rows.contains( row ) ) {
-                seqs.add( BasicSequence.copySequence( msa.getSequence( row ) ) );
-            }
-        }
-        if ( seqs.size() < 1 ) {
-            return null;
-        }
-        return BasicMsa.createInstance( seqs );
-    }
-
-    synchronized final public Msa removeGapColumns( final double max_allowed_gap_ratio,
+    synchronized final public Msa deleteGapColumns( final double max_allowed_gap_ratio,
                                                      final int min_allowed_length,
                                                      final Msa msa ) {
          init();
@@ -122,12 +62,12 @@ public final class MsaMethods {
          final boolean[] delete_cols = new boolean[ msa.getLength() ];
          int new_length = 0;
          for( int col = 0; col < msa.getLength(); ++col ) {
-            delete_cols[ col ] = ( ( double ) calcGapSumPerColumn( msa, col ) / msa.getNumberOfSequences() ) >= max_allowed_gap_ratio;
+            delete_cols[ col ] = ( ( double ) calcGapSumPerColumn( msa, col ) / msa.getNumberOfSequences() ) > max_allowed_gap_ratio;
              if ( !delete_cols[ col ] ) {
                  ++new_length;
              }
          }
-        final List<Sequence> seqs = new ArrayList<Sequence>( msa.getNumberOfSequences() );
+        final List<MolecularSequence> seqs = new ArrayList<MolecularSequence>( msa.getNumberOfSequences() );
          for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
              final char[] mol_seq = new char[ new_length ];
              int new_col = 0;
@@ -136,7 +76,7 @@ public final class MsaMethods {
                  if ( !delete_cols[ col ] ) {
                      final char residue = msa.getResidueAt( row, col );
                      mol_seq[ new_col++ ] = ( residue );
-                    if ( residue != Sequence.GAP ) {
+                    if ( residue != MolecularSequence.GAP ) {
                          ++non_gap_cols_sum;
                      }
                  }
@@ -159,7 +99,142 @@ public final class MsaMethods {
          return BasicMsa.createInstance( seqs );
      }
  
-    public static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) {
+    synchronized public ArrayList<String> getIgnoredSequenceIds() {
+        return _ignored_seqs_ids;
+    }
+
+    synchronized private void init() {
+        _ignored_seqs_ids = new ArrayList<String>();
+    }
+
+    public static final DescriptiveStatistics calcNumberOfGapsStats( final Msa msa ) {
+        final int[] gaps = calcNumberOfGapsInMsa( msa );
+        final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
+        for( final int gap : gaps ) {
+            stats.addValue(  gap );
+        }
+        return stats;
+    }
+
+    public static final int[] calcNumberOfGapsInMsa( final Msa msa ) {
+        final int seqs = msa.getNumberOfSequences();
+        final int[]  gaps= new int[ seqs ];
+        for( int i = 0; i < seqs; ++i ) {
+            gaps[ i ] =  calcNumberOfGaps( msa.getSequence( i ) );
+        }
+        return gaps;
+    }
+    
+    
+
+    public final static int calcNumberOfGaps( final MolecularSequence seq  ) {
+        int gaps = 0;
+        boolean was_gap = false;
+        for( int i = 0; i < seq.getLength(); ++i ) {
+            if ( seq.isGapAt( i ) ) {
+               if ( !was_gap ) {
+                   ++gaps;
+                   was_gap = true;
+               }
+            }
+            else {
+                was_gap = false;
+            }
+        }
+        return gaps;
+    }
+
+    public static DescriptiveStatistics calcBasicGapinessStatistics( final Msa msa ) {
+        final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
+        for( int i = 0; i < msa.getLength(); ++i ) {
+            stats.addValue( ( double ) calcGapSumPerColumn( msa, i ) / msa.getNumberOfSequences() );
+        }
+        return stats;
+    }
+
+    public static double calcGapRatio( final Msa msa ) {
+        int gaps = 0;
+        for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) {
+            for( int i = 0; i < msa.getLength(); ++i ) {
+                if ( msa.getResidueAt( seq, i ) == MolecularSequence.GAP ) {
+                    gaps++;
+                }
+            }
+        }
+        return ( double ) gaps / ( msa.getLength() * msa.getNumberOfSequences() );
+    }
+
+    public static int calcGapSumPerColumn( final Msa msa, final int col ) {
+        int gap_rows = 0;
+        for( int j = 0; j < msa.getNumberOfSequences(); ++j ) {
+            if ( msa.isGapAt( j, col ) ) {
+                gap_rows++;
+            }
+        }
+        return gap_rows;
+    }
+
+    final public static double calcNormalizedShannonsEntropy( final int k, final Msa msa ) {
+        double s = 0;
+        for( int col = 0; col < msa.getLength(); ++col ) {
+            s += calcNormalizedShannonsEntropy( k, msa, col );
+        }
+        return s / msa.getLength();
+    }
+
+    final public static double calcNormalizedShannonsEntropy( final int k, final Msa msa, final int col ) {
+        // http://www.ebi.ac.uk/thornton-srv/databases/valdarprograms/scorecons_server_help.html
+        // n: number of residues in column
+        // k: number of residue types
+        // na: number of residues of type a
+        // pa = na/n
+        // S=-sum pa log2 pa
+        double s = 0;
+        final double n = msa.getNumberOfSequences();
+        HashMap<Character, Integer> dist = null;
+        if ( k == 6 ) {
+            dist = calcResidueDistribution6( msa, col );
+        }
+        else if ( k == 7 ) {
+            dist = calcResidueDistribution7( msa, col );
+        }
+        else if ( k == 20 ) {
+            dist = calcResidueDistribution20( msa, col );
+        }
+        else if ( k == 21 ) {
+            dist = calcResidueDistribution21( msa, col );
+        }
+        else {
+            throw new IllegalArgumentException( "illegal value for k: " + k );
+        }
+        if ( dist.size() == 1 ) {
+            return 0;
+        }
+        //        if ( dist.size() == n ) {
+        //            return 0;
+        //        }
+        for( final int na : dist.values() ) {
+            final double pa = na / n;
+            s += pa * Math.log( pa );
+        }
+        if ( n < k ) {
+            return -( s / ( Math.log( n ) ) );
+        }
+        else {
+            return -( s / ( Math.log( k ) ) );
+        }
+    }
+
+    final public static DescriptiveStatistics calculateEffectiveLengthStatistics( final Msa msa ) {
+        final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
+        for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+            final MolecularSequence s = msa.getSequence( row );
+            stats.addValue( s.getLength() - s.getNumberOfGapResidues() );
+        }
+        return stats;
+    }
+
+    final public static DescriptiveStatistics calculateIdentityRatio( final int from, final int to, final Msa msa ) {
          final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
          for( int c = from; c <= to; ++c ) {
              stats.addValue( calculateIdentityRatio( msa, c ) );
@@ -167,7 +242,7 @@ public final class MsaMethods {
          return stats;
      }
  
-    public static double calculateIdentityRatio( final Msa msa, final int column ) {
+    final public static double calculateIdentityRatio( final Msa msa, final int column ) {
          final SortedMap<Character, Integer> dist = calculateResidueDestributionPerColumn( msa, column );
          int majority_count = 0;
          final Iterator<Map.Entry<Character, Integer>> it = dist.entrySet().iterator();
@@ -183,7 +258,7 @@ public final class MsaMethods {
      public static SortedMap<Character, Integer> calculateResidueDestributionPerColumn( final Msa msa, final int column ) {
          final SortedMap<Character, Integer> map = new TreeMap<Character, Integer>();
          for( final Character r : msa.getColumnAt( column ) ) {
-            if ( r != Sequence.GAP ) {
+            if ( r != MolecularSequence.GAP ) {
                  if ( !map.containsKey( r ) ) {
                      map.put( r, 1 );
                  }
@@ -195,12 +270,34 @@ public final class MsaMethods {
          return map;
      }
  
-    public static DescriptiveStatistics calcBasicGapinessStatistics( final Msa msa ) {
-        final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
-        for( int i = 0; i < msa.getLength(); ++i ) {
-            stats.addValue( ( double ) calcGapSumPerColumn( msa, i ) / msa.getNumberOfSequences() );
+    synchronized public static MsaMethods createInstance() {
+        return new MsaMethods();
+    }
+
+    final public static Msa removeSequence( final Msa msa, final String to_remove_id ) {
+        final List<MolecularSequence> seqs = new ArrayList<MolecularSequence>();
+        for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+            if ( !to_remove_id.equals( msa.getIdentifier( row ) ) ) {
+                seqs.add( msa.getSequence( row ) );
+            }
          }
-        return stats;
+        if ( seqs.size() < 1 ) {
+            return null;
+        }
+        return BasicMsa.createInstance( seqs );
+    }
+
+    final public static Msa removeSequences( final Msa msa, final List<String> to_remove_ids ) {
+        final List<MolecularSequence> seqs = new ArrayList<MolecularSequence>();
+        for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+            if ( !to_remove_ids.contains( msa.getIdentifier( row ) ) ) {
+                seqs.add( msa.getSequence( row ) );
+            }
+        }
+        if ( seqs.size() < 1 ) {
+            return null;
+        }
+        return BasicMsa.createInstance( seqs );
      }
  
      public static Msa removeSequencesByMinimalLength( final Msa msa, final int min_effective_length ) {
@@ -208,7 +305,7 @@ public final class MsaMethods {
          for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) {
              int eff_length = 0;
              for( int i = 0; i < msa.getLength(); ++i ) {
-                if ( msa.getResidueAt( seq, i ) != Sequence.GAP ) {
+                if ( msa.getResidueAt( seq, i ) != MolecularSequence.GAP ) {
                      eff_length++;
                  }
              }
@@ -219,15 +316,135 @@ public final class MsaMethods {
          return removeSequencesByRow( msa, to_remove_rows );
      }
  
-    public static double calcGapRatio( final Msa msa ) {
-        int gaps = 0;
-        for( int seq = 0; seq < msa.getNumberOfSequences(); ++seq ) {
-            for( int i = 0; i < msa.getLength(); ++i ) {
-                if ( msa.getResidueAt( seq, i ) == Sequence.GAP ) {
-                    gaps++;
+    final public static Msa removeSequencesByRow( final Msa msa, final List<Integer> to_remove_rows ) {
+        final List<MolecularSequence> seqs = new ArrayList<MolecularSequence>();
+        for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+            if ( !to_remove_rows.contains( row ) ) {
+                seqs.add( msa.getSequence( row ) );
+            }
+        }
+        if ( seqs.size() < 1 ) {
+            return null;
+        }
+        return BasicMsa.createInstance( seqs );
+    }
+
+    final private static HashMap<Character, Integer> calcResidueDistribution20( final Msa msa, final int col ) {
+        final HashMap<Character, Integer> counts = new HashMap<Character, Integer>();
+        for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+            final char c = msa.getResidueAt( row, col );
+            if ( c != MolecularSequence.GAP ) {
+                if ( !counts.containsKey( c ) ) {
+                    counts.put( c, 1 );
+                }
+                else {
+                    counts.put( c, 1 + counts.get( c ) );
                  }
              }
          }
-        return ( double ) gaps / ( msa.getLength() * msa.getNumberOfSequences() );
+        return counts;
+    }
+
+    final private static HashMap<Character, Integer> calcResidueDistribution21( final Msa msa, final int col ) {
+        final HashMap<Character, Integer> counts = new HashMap<Character, Integer>();
+        for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+            final char c = msa.getResidueAt( row, col );
+            if ( !counts.containsKey( c ) ) {
+                counts.put( c, 1 );
+            }
+            else {
+                counts.put( c, 1 + counts.get( c ) );
+            }
+        }
+        return counts;
+    }
+
+    final private static HashMap<Character, Integer> calcResidueDistribution6( final Msa msa, final int col ) {
+        // Residues are classified into one of tex2html_wrap199 types:
+        // aliphatic [AVLIMC], aromatic [FWYH], polar [STNQ], positive [KR], negative [DE],
+        // special conformations [GP] and gaps. This convention follows that
+        // of Mirny & Shakhnovich (1999, J Mol Biol 291:177-196).
+        final HashMap<Character, Integer> counts = new HashMap<Character, Integer>();
+        for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+            final char c = msa.getResidueAt( row, col );
+            char x;
+            if ( ( c == 'A' ) || ( c == 'V' ) || ( c == 'L' ) || ( c == 'I' ) || ( c == 'M' ) || ( c == 'C' ) ) {
+                // aliphatic
+                x = 'a';
+            }
+            else if ( ( c == 'F' ) || ( c == 'W' ) || ( c == 'Y' ) || ( c == 'H' ) ) {
+                // aromatic
+                x = 'r';
+            }
+            else if ( ( c == 'S' ) || ( c == 'T' ) || ( c == 'N' ) || ( c == 'Q' ) ) {
+                // polar
+                x = 'p';
+            }
+            else if ( ( c == 'K' ) || ( c == 'R' ) ) {
+                // positive
+                x = 'o';
+            }
+            else if ( ( c == 'D' ) || ( c == 'E' ) ) {
+                // negative
+                x = 'e';
+            }
+            else if ( ( c == 'G' ) || ( c == 'P' ) ) {
+                // aliphatic - special conformation
+                x = 's';
+            }
+            else {
+                continue;
+            }
+            if ( !counts.containsKey( x ) ) {
+                counts.put( x, 1 );
+            }
+            else {
+                counts.put( x, 1 + counts.get( x ) );
+            }
+        }
+        return counts;
+    }
+
+    final private static HashMap<Character, Integer> calcResidueDistribution7( final Msa msa, final int col ) {
+        // Residues are classified into one of tex2html_wrap199 types:
+        // aliphatic [AVLIMC], aromatic [FWYH], polar [STNQ], positive [KR], negative [DE],
+        // special conformations [GP] and gaps. This convention follows that
+        // of Mirny & Shakhnovich (1999, J Mol Biol 291:177-196).
+        final HashMap<Character, Integer> counts = new HashMap<Character, Integer>();
+        for( int row = 0; row < msa.getNumberOfSequences(); ++row ) {
+            final char c = msa.getResidueAt( row, col );
+            char x = '-';
+            if ( ( c == 'A' ) || ( c == 'V' ) || ( c == 'L' ) || ( c == 'I' ) || ( c == 'M' ) || ( c == 'C' ) ) {
+                // aliphatic
+                x = 'a';
+            }
+            else if ( ( c == 'F' ) || ( c == 'W' ) || ( c == 'Y' ) || ( c == 'H' ) ) {
+                // aromatic
+                x = 'r';
+            }
+            else if ( ( c == 'S' ) || ( c == 'T' ) || ( c == 'N' ) || ( c == 'Q' ) ) {
+                // polar
+                x = 'p';
+            }
+            else if ( ( c == 'K' ) || ( c == 'R' ) ) {
+                // positive
+                x = 'o';
+            }
+            else if ( ( c == 'D' ) || ( c == 'E' ) ) {
+                // negative
+                x = 'e';
+            }
+            else if ( ( c == 'G' ) || ( c == 'P' ) ) {
+                // aliphatic - special conformation
+                x = 's';
+            }
+            if ( !counts.containsKey( x ) ) {
+                counts.put( x, 1 );
+            }
+            else {
+                counts.put( x, 1 + counts.get( x ) );
+            }
+        }
+        return counts;
      }
  }