src/jalview/analysis/scoremodels/PIDModel.java

   1 package jalview.analysis.scoremodels;
   2
   3 import jalview.api.analysis.PairwiseScoreModelI;
   4 import jalview.api.analysis.SimilarityParamsI;
   5 import jalview.datamodel.AlignmentView;
   6 import jalview.math.Matrix;
   7 import jalview.math.MatrixI;
   8 import jalview.util.Comparison;
   9
  10 /**
  11  * A class to provide sequence pairwise similarity based on residue identity.
  12  * Instances of this class are immutable and thread-safe.
  13  */
  14 public class PIDModel extends SimilarityScoreModel implements
  15         PairwiseScoreModelI
  16 {
  17   private static final String NAME = "PID";
  18
  19   /**
  20    * Constructor
  21    */
  22   public PIDModel()
  23   {
  24   }
  25
  26   @Override
  27   public String getName()
  28   {
  29     return NAME;
  30   }
  31
  32   /**
  33    * Answers null for description. If a display name is needed, use getName() or
  34    * an internationalized string built from the name.
  35    */
  36   @Override
  37   public String getDescription()
  38   {
  39     return null;
  40   }
  41
  42   @Override
  43   public boolean isDNA()
  44   {
  45     return true;
  46   }
  47
  48   @Override
  49   public boolean isProtein()
  50   {
  51     return true;
  52   }
  53
  54   /**
  55    * Answers 1 if c and d are the same residue (ignoring case), and not gap
  56    * characters. Answers 0 for non-matching or gap characters.
  57    */
  58   @Override
  59   public float getPairwiseScore(char c, char d)
  60   {
  61     c = toUpper(c);
  62     d = toUpper(d);
  63     if (c == d && !Comparison.isGap(c))
  64     {
  65       return 1f;
  66     }
  67     return 0f;
  68   }
  69
  70   /**
  71    * @param c
  72    */
  73   protected static char toUpper(char c)
  74   {
  75     if ('a' <= c && c <= 'z')
  76     {
  77       c += 'A' - 'a';
  78     }
  79     return c;
  80   }
  81
  82   /**
  83    * Computes similarity scores based on pairwise percentage identity of
  84    * sequences. For consistency with Jalview 2.10.1's SeqSpace mode PCA
  85    * calculation, the percentage scores are rescaled to the width of the
  86    * sequences (as if counts of identical residues). This method is thread-safe.
  87    */
  88   @Override
  89   public MatrixI findSimilarities(AlignmentView seqData,
  90           SimilarityParamsI options)
  91   {
  92     String[] seqs = seqData.getSequenceStrings(Comparison.GAP_DASH);
  93
  94     MatrixI result = findSimilarities(seqs, options);
  95
  96     result.multiply(seqData.getWidth() / 100d);
  97
  98     return result;
  99   }
 100
 101   /**
 102    * A distance score is computed in the usual way (by reversing the range of
 103    * the similarity score results), and then rescaled to percentage values
 104    * (reversing the rescaling to count values done in findSimilarities). This
 105    * method is thread-safe.
 106    */
 107   @Override
 108   public MatrixI findDistances(AlignmentView seqData,
 109           SimilarityParamsI options)
 110   {
 111     MatrixI result = super.findDistances(seqData, options);
 112
 113     if (seqData.getWidth() != 0)
 114     {
 115       result.multiply(100d / seqData.getWidth());
 116     }
 117
 118     return result;
 119   }
 120
 121   /**
 122    * Compute percentage identity scores, using the gap treatment and
 123    * normalisation specified by the options parameter
 124    *
 125    * @param seqs
 126    * @param options
 127    * @return
 128    */
 129   protected MatrixI findSimilarities(String[] seqs,
 130           SimilarityParamsI options)
 131   {
 132     // TODO reuse code in ScoreMatrix instead somehow
 133     double[][] values = new double[seqs.length][];
 134     for (int row = 0; row < seqs.length; row++)
 135     {
 136       values[row] = new double[seqs.length];
 137       for (int col = 0; col < seqs.length; col++)
 138       {
 139         double total = computePID(seqs[row], seqs[col], options);
 140         values[row][col] = total;
 141       }
 142     }
 143     return new Matrix(values);
 144   }
 145
 146   /**
 147    * Computes a percentage identity for two sequences, using the algorithm
 148    * choices specified by the options parameter
 149    *
 150    * @param seq1
 151    * @param seq2
 152    * @param options
 153    * @return
 154    */
 155   public static double computePID(String seq1, String seq2,
 156           SimilarityParamsI options)
 157   {
 158     int len1 = seq1.length();
 159     int len2 = seq2.length();
 160     int width = Math.max(len1, len2);
 161     int total = 0;
 162     int divideBy = 0;
 163
 164     for (int i = 0; i < width; i++)
 165     {
 166       if (i >= len1 || i >= len2)
 167       {
 168         /*
 169          * off the end of one sequence; stop if we are only matching
 170          * on the shorter sequence length, else treat as trailing gap
 171          */
 172         if (options.denominateByShortestLength())
 173         {
 174           break;
 175         }
 176         if (options.includeGaps())
 177         {
 178           divideBy++;
 179         }
 180         if (options.matchGaps())
 181         {
 182           total++;
 183         }
 184         continue;
 185       }
 186       char c1 = seq1.charAt(i);
 187       char c2 = seq2.charAt(i);
 188       boolean gap1 = Comparison.isGap(c1);
 189       boolean gap2 = Comparison.isGap(c2);
 190
 191       if (gap1 && gap2)
 192       {
 193         /*
 194          * gap-gap: include if options say so, if so
 195          * have to score as identity; else ignore
 196          */
 197         if (options.includeGappedColumns())
 198         {
 199           divideBy++;
 200           total++;
 201         }
 202         continue;
 203       }
 204
 205       if (gap1 || gap2)
 206       {
 207         /*
 208          * gap-residue: include if options say so,
 209          * count as match if options say so
 210          */
 211         if (options.includeGaps())
 212         {
 213           divideBy++;
 214         }
 215         if (options.matchGaps())
 216         {
 217           total++;
 218         }
 219         continue;
 220       }
 221
 222       /*
 223        * remaining case is gap-residue
 224        */
 225       if (toUpper(c1) == toUpper(c2))
 226       {
 227         total++;
 228       }
 229       divideBy++;
 230     }
 231
 232     return divideBy == 0 ? 0D : 100D * total / divideBy;
 233   }
 234 }