src/jalview/analysis/scoremodels/PIDModel.java

   1 package jalview.analysis.scoremodels;
   2
   3 import jalview.api.analysis.PairwiseScoreModelI;
   4 import jalview.api.analysis.SimilarityParamsI;
   5 import jalview.datamodel.AlignmentView;
   6 import jalview.math.Matrix;
   7 import jalview.math.MatrixI;
   8 import jalview.util.Comparison;
   9
  10 /**
  11  * A class to provide sequence pairwise similarity based on residue identity
  12  */
  13 public class PIDModel extends SimilarityScoreModel implements
  14         PairwiseScoreModelI
  15 {
  16   private static final String NAME = "PID";
  17
  18   private String description;
  19
  20   /**
  21    * Constructor
  22    */
  23   public PIDModel()
  24   {
  25   }
  26
  27   @Override
  28   public String getName()
  29   {
  30     return NAME;
  31   }
  32
  33   @Override
  34   public String getDescription()
  35   {
  36     return description;
  37   }
  38
  39   @Override
  40   public boolean isDNA()
  41   {
  42     return true;
  43   }
  44
  45   @Override
  46   public boolean isProtein()
  47   {
  48     return true;
  49   }
  50
  51   /**
  52    * Answers 1 if c and d are the same residue (ignoring case), and not gap
  53    * characters. Answers 0 for non-matching or gap characters.
  54    */
  55   @Override
  56   public float getPairwiseScore(char c, char d)
  57   {
  58     c = toUpper(c);
  59     d = toUpper(d);
  60     if (c == d && !Comparison.isGap(c))
  61     {
  62       return 1f;
  63     }
  64     return 0f;
  65   }
  66
  67   /**
  68    * @param c
  69    */
  70   protected static char toUpper(char c)
  71   {
  72     if ('a' <= c && c <= 'z')
  73     {
  74       c += 'A' - 'a';
  75     }
  76     return c;
  77   }
  78
  79   /**
  80    * Computes similarity scores based on pairwise percentage identity of
  81    * sequences. For consistency with Jalview 2.10.1's SeqSpace mode PCA
  82    * calculation, the percentage scores are rescaled to the width of the
  83    * sequences (as if counts of identical residues).
  84    */
  85   @Override
  86   public MatrixI findSimilarities(AlignmentView seqData,
  87           SimilarityParamsI options)
  88   {
  89     String[] seqs = seqData.getSequenceStrings(Comparison.GAP_DASH);
  90
  91     MatrixI result = findSimilarities(seqs, options);
  92
  93     result.multiply(seqData.getWidth() / 100d);
  94
  95     return result;
  96   }
  97
  98   /**
  99    * A distance score is computed in the usual way (by reversing the range of
 100    * the similarity score results), and then rescaled to percentage values
 101    * (reversing the rescaling to count values done in findSimilarities)
 102    */
 103   @Override
 104   public MatrixI findDistances(AlignmentView seqData,
 105           SimilarityParamsI options)
 106   {
 107     MatrixI result = super.findDistances(seqData, options);
 108
 109     if (seqData.getWidth() != 0)
 110     {
 111       result.multiply(100d / seqData.getWidth());
 112     }
 113
 114     return result;
 115   }
 116
 117   /**
 118    * Compute percentage identity scores, using the gap treatment and
 119    * normalisation specified by the options parameter
 120    *
 121    * @param seqs
 122    * @param options
 123    * @return
 124    */
 125   protected MatrixI findSimilarities(String[] seqs,
 126           SimilarityParamsI options)
 127   {
 128     // TODO reuse code in ScoreMatrix instead somehow
 129     double[][] values = new double[seqs.length][];
 130     for (int row = 0; row < seqs.length; row++)
 131     {
 132       values[row] = new double[seqs.length];
 133       for (int col = 0; col < seqs.length; col++)
 134       {
 135         double total = computePID(seqs[row], seqs[col], options);
 136         values[row][col] = total;
 137       }
 138     }
 139     return new Matrix(values);
 140   }
 141
 142   /**
 143    * Computes a percentage identity for two sequences, using the algorithm
 144    * choices specified by the options parameter
 145    *
 146    * @param seq1
 147    * @param seq2
 148    * @param options
 149    * @return
 150    */
 151   public static double computePID(String seq1, String seq2,
 152           SimilarityParamsI options)
 153   {
 154     int len1 = seq1.length();
 155     int len2 = seq2.length();
 156     int width = Math.max(len1, len2);
 157     int total = 0;
 158     int divideBy = 0;
 159
 160     for (int i = 0; i < width; i++)
 161     {
 162       if (i >= len1 || i >= len2)
 163       {
 164         /*
 165          * off the end of one sequence; stop if we are only matching
 166          * on the shorter sequence length, else treat as trailing gap
 167          */
 168         if (options.denominateByShortestLength())
 169         {
 170           break;
 171         }
 172         if (options.includeGaps())
 173         {
 174           divideBy++;
 175         }
 176         if (options.matchGaps())
 177         {
 178           total++;
 179         }
 180         continue;
 181       }
 182       char c1 = seq1.charAt(i);
 183       char c2 = seq2.charAt(i);
 184       boolean gap1 = Comparison.isGap(c1);
 185       boolean gap2 = Comparison.isGap(c2);
 186
 187       if (gap1 && gap2)
 188       {
 189         /*
 190          * gap-gap: include if options say so, if so
 191          * have to score as identity; else ignore
 192          */
 193         if (options.includeGappedColumns())
 194         {
 195           divideBy++;
 196           total++;
 197         }
 198         continue;
 199       }
 200
 201       if (gap1 || gap2)
 202       {
 203         /*
 204          * gap-residue: include if options say so,
 205          * count as match if options say so
 206          */
 207         if (options.includeGaps())
 208         {
 209           divideBy++;
 210         }
 211         if (options.matchGaps())
 212         {
 213           total++;
 214         }
 215         continue;
 216       }
 217
 218       /*
 219        * remaining case is gap-residue
 220        */
 221       if (toUpper(c1) == toUpper(c2))
 222       {
 223         total++;
 224       }
 225       divideBy++;
 226     }
 227
 228     return divideBy == 0 ? 0D : 100D * total / divideBy;
 229   }
 230 }