src/jalview/util/Comparison.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.util;
  22
  23 import jalview.datamodel.SequenceI;
  24
  25 import java.util.ArrayList;
  26 import java.util.List;
  27
  28 /**
  29  * Assorted methods for analysing or comparing sequences.
  30  */
  31 public class Comparison
  32 {
  33   private static final int EIGHTY_FIVE = 85;
  34
  35   private static final int TO_UPPER_CASE = 'a' - 'A';
  36
  37   public static final char GAP_SPACE = ' ';
  38
  39   public static final char GAP_DOT = '.';
  40
  41   public static final char GAP_DASH = '-';
  42
  43   public static final char GAP_TILDE = '~';
  44
  45   public static final char GAP_PIPE = '|';
  46
  47   public static final char GAP_COLON = ':';
  48
  49   public static final char GAP_LPAREN = '(';
  50
  51   public static final char GAP_RPAREN = ')';
  52
  53   public static final char GAP_LSQBR = '[';
  54
  55   public static final char GAP_RSQBR = ']';
  56
  57   public static final char GAP_LBRACE = '{';
  58
  59   public static final char GAP_RBRACE = '}';
  60
  61   public static final String GapChars = new String(
  62           new char[]
  63           { GAP_SPACE, GAP_DOT, GAP_DASH, GAP_TILDE, GAP_PIPE, GAP_COLON,
  64               GAP_LPAREN,
  65               GAP_RPAREN, GAP_LSQBR, GAP_RSQBR, GAP_LBRACE, GAP_RBRACE });
  66
  67   /**
  68    * DOCUMENT ME!
  69    *
  70    * @param ii
  71    *          DOCUMENT ME!
  72    * @param jj
  73    *          DOCUMENT ME!
  74    *
  75    * @return DOCUMENT ME!
  76    */
  77   public static final float compare(SequenceI ii, SequenceI jj)
  78   {
  79     return Comparison.compare(ii, jj, 0, ii.getLength() - 1);
  80   }
  81
  82   /**
  83    * this was supposed to be an ungapped pid calculation
  84    *
  85    * @param ii
  86    *          SequenceI
  87    * @param jj
  88    *          SequenceI
  89    * @param start
  90    *          int
  91    * @param end
  92    *          int
  93    * @return float
  94    */
  95   public static float compare(SequenceI ii, SequenceI jj, int start,
  96           int end)
  97   {
  98     String si = ii.getSequenceAsString();
  99     String sj = jj.getSequenceAsString();
 100
 101     int ilen = si.length() - 1;
 102     int jlen = sj.length() - 1;
 103
 104     while (Comparison.isGap(si.charAt(start + ilen)))
 105     {
 106       ilen--;
 107     }
 108
 109     while (Comparison.isGap(sj.charAt(start + jlen)))
 110     {
 111       jlen--;
 112     }
 113
 114     int count = 0;
 115     int match = 0;
 116     float pid = -1;
 117
 118     if (ilen > jlen)
 119     {
 120       for (int j = 0; j < jlen; j++)
 121       {
 122         if (si.substring(start + j, start + j + 1)
 123                 .equals(sj.substring(start + j, start + j + 1)))
 124         {
 125           match++;
 126         }
 127
 128         count++;
 129       }
 130
 131       pid = (float) match / (float) ilen * 100;
 132     }
 133     else
 134     {
 135       for (int j = 0; j < jlen; j++)
 136       {
 137         if (si.substring(start + j, start + j + 1)
 138                 .equals(sj.substring(start + j, start + j + 1)))
 139         {
 140           match++;
 141         }
 142
 143         count++;
 144       }
 145
 146       pid = (float) match / (float) jlen * 100;
 147     }
 148
 149     return pid;
 150   }
 151
 152   /**
 153    * this is a gapped PID calculation
 154    *
 155    * @param s1
 156    *          SequenceI
 157    * @param s2
 158    *          SequenceI
 159    * @return float
 160    * @deprecated use PIDModel.computePID()
 161    */
 162   @Deprecated
 163   public final static float PID(String seq1, String seq2)
 164   {
 165     return PID(seq1, seq2, 0, seq1.length());
 166   }
 167
 168   static final int caseShift = 'a' - 'A';
 169
 170   // Another pid with region specification
 171   /**
 172    * @deprecated use PIDModel.computePID()
 173    */
 174   @Deprecated
 175   public final static float PID(String seq1, String seq2, int start,
 176           int end)
 177   {
 178     return PID(seq1, seq2, start, end, true, false);
 179   }
 180
 181   /**
 182    * Calculate percent identity for a pair of sequences over a particular range,
 183    * with different options for ignoring gaps.
 184    *
 185    * @param seq1
 186    * @param seq2
 187    * @param start
 188    *          - position in seqs
 189    * @param end
 190    *          - position in seqs
 191    * @param wcGaps
 192    *          - if true - gaps match any character, if false, do not match
 193    *          anything
 194    * @param ungappedOnly
 195    *          - if true - only count PID over ungapped columns
 196    * @return
 197    * @deprecated use PIDModel.computePID()
 198    */
 199   @Deprecated
 200   public final static float PID(String seq1, String seq2, int start,
 201           int end, boolean wcGaps, boolean ungappedOnly)
 202   {
 203     int s1len = seq1.length();
 204     int s2len = seq2.length();
 205
 206     int len = Math.min(s1len, s2len);
 207
 208     if (end < len)
 209     {
 210       len = end;
 211     }
 212
 213     if (len < start)
 214     {
 215       start = len - 1; // we just use a single residue for the difference
 216     }
 217
 218     int elen = len - start, bad = 0;
 219     char chr1;
 220     char chr2;
 221     boolean agap;
 222     for (int i = start; i < len; i++)
 223     {
 224       chr1 = seq1.charAt(i);
 225
 226       chr2 = seq2.charAt(i);
 227       agap = isGap(chr1) || isGap(chr2);
 228       if ('a' <= chr1 && chr1 <= 'z')
 229       {
 230         // TO UPPERCASE !!!
 231         // Faster than toUpperCase
 232         chr1 -= caseShift;
 233       }
 234       if ('a' <= chr2 && chr2 <= 'z')
 235       {
 236         // TO UPPERCASE !!!
 237         // Faster than toUpperCase
 238         chr2 -= caseShift;
 239       }
 240
 241       if (chr1 != chr2)
 242       {
 243         if (agap)
 244         {
 245           if (ungappedOnly)
 246           {
 247             elen--;
 248           }
 249           else if (!wcGaps)
 250           {
 251             bad++;
 252           }
 253         }
 254         else
 255         {
 256           bad++;
 257         }
 258       }
 259
 260     }
 261     if (elen < 1)
 262     {
 263       return 0f;
 264     }
 265     return ((float) 100 * (elen - bad)) / elen;
 266   }
 267
 268   /**
 269    * Answers true if the supplied character is a recognised gap character, else
 270    * false. Currently hard-coded to recognise '-', '-' or ' ' (hyphen / dot /
 271    * space).
 272    *
 273    * @param c
 274    *
 275    * @return
 276    */
 277   public static final boolean isGap(char c)
 278   {
 279     switch (c)
 280     {
 281     case GAP_SPACE:
 282     case GAP_DOT:
 283     case GAP_DASH:
 284     case GAP_TILDE:
 285     case GAP_PIPE:
 286     case GAP_COLON:
 287     case GAP_LPAREN:
 288     case GAP_RPAREN:
 289     case GAP_LSQBR:
 290     case GAP_RSQBR:
 291     case GAP_LBRACE:
 292     case GAP_RBRACE:
 293       return true;
 294     default:
 295       return false;
 296     }
 297   }
 298
 299   /**
 300    * Overloaded method signature to test whether a single sequence is nucleotide
 301    * (that is, more than 85% CGTA)
 302    *
 303    * @param seq
 304    * @return
 305    */
 306   public static final boolean isNucleotide(SequenceI seq)
 307   {
 308     return isNucleotide(new SequenceI[] { seq });
 309   }
 310
 311   /**
 312    * Answers true if more than 85% of the sequence residues (ignoring gaps) are
 313    * A, G, C, T or U, else false. This is just a heuristic guess and may give a
 314    * wrong answer (as AGCT are also amino acid codes).
 315    *
 316    * @param seqs
 317    * @return
 318    */
 319   public static final boolean isNucleotide(SequenceI[] seqs)
 320   {
 321     if (seqs == null)
 322     {
 323       return false;
 324     }
 325
 326     int ntCount = 0;
 327     int aaCount = 0;
 328     for (SequenceI seq : seqs)
 329     {
 330       if (seq == null)
 331       {
 332         continue;
 333       }
 334       // TODO could possibly make an informed guess just from the first sequence
 335       // to save a lengthy calculation
 336       int len = seq.getLength();
 337       for (int i = 0; i < len; i++)
 338       {
 339         char c = seq.getCharAt(i);
 340         if (isNucleotide(c))
 341         {
 342           ntCount++;
 343         }
 344         else if (!isGap(c))
 345         {
 346           aaCount++;
 347         }
 348       }
 349     }
 350
 351     /*
 352      * Check for nucleotide count > 85% of total count (in a form that evades
 353      * int / float conversion or divide by zero).
 354      */
 355     if (ntCount * 100 > EIGHTY_FIVE * (ntCount + aaCount))
 356     {
 357       return true;
 358     }
 359     else
 360     {
 361       return false;
 362     }
 363
 364   }
 365
 366   /**
 367    * Answers true if the character is one of aAcCgGtTuU
 368    *
 369    * @param c
 370    * @return
 371    */
 372   public static boolean isNucleotide(char c)
 373   {
 374     if ('a' <= c && c <= 'z')
 375     {
 376       c -= TO_UPPER_CASE;
 377     }
 378
 379     switch (c)
 380     {
 381     case 'A':
 382     case 'C':
 383     case 'G':
 384     case 'T':
 385     case 'U':
 386       return true;
 387     }
 388     return false;
 389   }
 390
 391   /**
 392    * Answers true if every character in the string is one of aAcCgGtTuU, or
 393    * (optionally) a gap character (dot, dash, space), else false
 394    *
 395    * @param s
 396    * @param allowGaps
 397    * @return
 398    */
 399   public static boolean isNucleotideSequence(String s, boolean allowGaps)
 400   {
 401     if (s == null)
 402     {
 403       return false;
 404     }
 405     for (int i = 0; i < s.length(); i++)
 406     {
 407       char c = s.charAt(i);
 408       if (!isNucleotide(c))
 409       {
 410         if (!allowGaps || !isGap(c))
 411         {
 412           return false;
 413         }
 414       }
 415     }
 416     return true;
 417   }
 418
 419   /**
 420    * Convenience overload of isNucleotide
 421    *
 422    * @param seqs
 423    * @return
 424    */
 425   public static boolean isNucleotide(SequenceI[][] seqs)
 426   {
 427     if (seqs == null)
 428     {
 429       return false;
 430     }
 431     List<SequenceI> flattened = new ArrayList<>();
 432     for (SequenceI[] ss : seqs)
 433     {
 434       for (SequenceI s : ss)
 435       {
 436         flattened.add(s);
 437       }
 438     }
 439     final SequenceI[] oneDArray = flattened
 440             .toArray(new SequenceI[flattened.size()]);
 441     return isNucleotide(oneDArray);
 442   }
 443
 444   /**
 445    * Compares two residues either case sensitively or case insensitively
 446    * depending on the caseSensitive flag
 447    *
 448    * @param c1
 449    *          first char
 450    * @param c2
 451    *          second char to compare with
 452    * @param caseSensitive
 453    *          if true comparison will be case sensitive otherwise its not
 454    * @return
 455    */
 456   public static boolean isSameResidue(char c1, char c2,
 457           boolean caseSensitive)
 458   {
 459     if (caseSensitive)
 460     {
 461       return (c1 == c2);
 462     }
 463     else
 464     {
 465       return Character.toUpperCase(c1) == Character.toUpperCase(c2);
 466     }
 467   }
 468 }