src/jalview/util/Comparison.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.util;
  22
  23 import java.util.ArrayList;
  24 import java.util.List;
  25
  26 import jalview.datamodel.SequenceI;
  27
  28 /**
  29  * Assorted methods for analysing or comparing sequences.
  30  */
  31 public class Comparison
  32 {
  33   private static final int EIGHTY_FIVE = 85;
  34
  35   private static final int NINETY_NINE = 99;
  36
  37   private static final int TO_UPPER_CASE = 'a' - 'A';
  38
  39   public static final char GAP_SPACE = ' ';
  40
  41   public static final char GAP_DOT = '.';
  42
  43   public static final char GAP_DASH = '-';
  44
  45   public static final String GapChars = new String(
  46           new char[]
  47           { GAP_SPACE, GAP_DOT, GAP_DASH });
  48
  49   /**
  50    * DOCUMENT ME!
  51    *
  52    * @param ii
  53    *          DOCUMENT ME!
  54    * @param jj
  55    *          DOCUMENT ME!
  56    *
  57    * @return DOCUMENT ME!
  58    */
  59   public static final float compare(SequenceI ii, SequenceI jj)
  60   {
  61     return Comparison.compare(ii, jj, 0, ii.getLength() - 1);
  62   }
  63
  64   /**
  65    * this was supposed to be an ungapped pid calculation
  66    *
  67    * @param ii
  68    *          SequenceI
  69    * @param jj
  70    *          SequenceI
  71    * @param start
  72    *          int
  73    * @param end
  74    *          int
  75    * @return float
  76    */
  77   public static float compare(SequenceI ii, SequenceI jj, int start,
  78           int end)
  79   {
  80     String si = ii.getSequenceAsString();
  81     String sj = jj.getSequenceAsString();
  82
  83     int ilen = si.length() - 1;
  84     int jlen = sj.length() - 1;
  85
  86     while (Comparison.isGap(si.charAt(start + ilen)))
  87     {
  88       ilen--;
  89     }
  90
  91     while (Comparison.isGap(sj.charAt(start + jlen)))
  92     {
  93       jlen--;
  94     }
  95
  96     int count = 0;
  97     int match = 0;
  98     float pid = -1;
  99
 100     if (ilen > jlen)
 101     {
 102       for (int j = 0; j < jlen; j++)
 103       {
 104         if (si.substring(start + j, start + j + 1)
 105                 .equals(sj.substring(start + j, start + j + 1)))
 106         {
 107           match++;
 108         }
 109
 110         count++;
 111       }
 112
 113       pid = (float) match / (float) ilen * 100;
 114     }
 115     else
 116     {
 117       for (int j = 0; j < jlen; j++)
 118       {
 119         if (si.substring(start + j, start + j + 1)
 120                 .equals(sj.substring(start + j, start + j + 1)))
 121         {
 122           match++;
 123         }
 124
 125         count++;
 126       }
 127
 128       pid = (float) match / (float) jlen * 100;
 129     }
 130
 131     return pid;
 132   }
 133
 134   /**
 135    * this is a gapped PID calculation
 136    *
 137    * @param s1
 138    *          SequenceI
 139    * @param s2
 140    *          SequenceI
 141    * @return float
 142    * @deprecated use PIDModel.computePID()
 143    */
 144   @Deprecated
 145   public final static float PID(String seq1, String seq2)
 146   {
 147     return PID(seq1, seq2, 0, seq1.length());
 148   }
 149
 150   static final int caseShift = 'a' - 'A';
 151
 152   // Another pid with region specification
 153   /**
 154    * @deprecated use PIDModel.computePID()
 155    */
 156   @Deprecated
 157   public final static float PID(String seq1, String seq2, int start,
 158           int end)
 159   {
 160     return PID(seq1, seq2, start, end, true, false);
 161   }
 162
 163   /**
 164    * Calculate percent identity for a pair of sequences over a particular range,
 165    * with different options for ignoring gaps.
 166    *
 167    * @param seq1
 168    * @param seq2
 169    * @param start
 170    *          - position in seqs
 171    * @param end
 172    *          - position in seqs
 173    * @param wcGaps
 174    *          - if true - gaps match any character, if false, do not match
 175    *          anything
 176    * @param ungappedOnly
 177    *          - if true - only count PID over ungapped columns
 178    * @return
 179    * @deprecated use PIDModel.computePID()
 180    */
 181   @Deprecated
 182   public final static float PID(String seq1, String seq2, int start,
 183           int end, boolean wcGaps, boolean ungappedOnly)
 184   {
 185     int s1len = seq1.length();
 186     int s2len = seq2.length();
 187
 188     int len = Math.min(s1len, s2len);
 189
 190     if (end < len)
 191     {
 192       len = end;
 193     }
 194
 195     if (len < start)
 196     {
 197       start = len - 1; // we just use a single residue for the difference
 198     }
 199
 200     int elen = len - start, bad = 0;
 201     char chr1;
 202     char chr2;
 203     boolean agap;
 204     for (int i = start; i < len; i++)
 205     {
 206       chr1 = seq1.charAt(i);
 207
 208       chr2 = seq2.charAt(i);
 209       agap = isGap(chr1) || isGap(chr2);
 210       if ('a' <= chr1 && chr1 <= 'z')
 211       {
 212         // TO UPPERCASE !!!
 213         // Faster than toUpperCase
 214         chr1 -= caseShift;
 215       }
 216       if ('a' <= chr2 && chr2 <= 'z')
 217       {
 218         // TO UPPERCASE !!!
 219         // Faster than toUpperCase
 220         chr2 -= caseShift;
 221       }
 222
 223       if (chr1 != chr2)
 224       {
 225         if (agap)
 226         {
 227           if (ungappedOnly)
 228           {
 229             elen--;
 230           }
 231           else if (!wcGaps)
 232           {
 233             bad++;
 234           }
 235         }
 236         else
 237         {
 238           bad++;
 239         }
 240       }
 241
 242     }
 243     if (elen < 1)
 244     {
 245       return 0f;
 246     }
 247     return ((float) 100 * (elen - bad)) / elen;
 248   }
 249
 250   /**
 251    * Answers true if the supplied character is a recognised gap character, else
 252    * false. Currently hard-coded to recognise '-', '-' or ' ' (hyphen / dot /
 253    * space).
 254    *
 255    * @param c
 256    *
 257    * @return
 258    */
 259   public static final boolean isGap(char c)
 260   {
 261     return c == GAP_DASH || c == GAP_DOT || c == GAP_SPACE;
 262   }
 263
 264   /**
 265    * Overloaded method signature to test whether a single sequence is nucleotide
 266    * (that is, more than 85% CGTAUNX)
 267    *
 268    * @param seq
 269    * @return
 270    */
 271   public static final boolean isNucleotide(SequenceI seq)
 272   {
 273     if (seq == null)
 274     {
 275       return false;
 276     }
 277     long ntCount = 0;
 278     long aaCount = 0;
 279     long nCount = 0;
 280     long ntaCount = 0;
 281
 282     int len = seq.getLength();
 283     for (int i = 0; i < len; i++)
 284     {
 285       char c = seq.getCharAt(i);
 286       if (isNucleotide(c))
 287       {
 288         ntCount++;
 289       }
 290       else if (!isGap(c))
 291       {
 292         aaCount++;
 293         if (isN(c))
 294         {
 295           nCount++;
 296         }
 297         else
 298         {
 299           if (isNucleotideAmbiguity(c))
 300           {
 301             ntaCount++;
 302           }
 303         }
 304       }
 305     }
 306     /*
 307      * Check for nucleotide count > 85% of total count (in a form that evades
 308      * int / float conversion or divide by zero).
 309      */
 310     if ((ntCount + nCount) * 100 > EIGHTY_FIVE * (ntCount + aaCount))
 311     {
 312       return ntCount > 0; // all N is considered protein. Could use a threshold
 313                           // here too
 314     }
 315     else
 316     {
 317       // check for very large proportion of nucleotide and all ambiguity codes
 318       if ((ntCount + nCount + ntaCount) * 100 >= NINETY_NINE
 319               * (ntCount + aaCount))
 320       {
 321         return ntCount > 0;
 322       }
 323       return false;
 324     }
 325   }
 326
 327   /**
 328    * Answers true if more than 85% of the sequence residues (ignoring gaps) are
 329    * A, G, C, T or U, else false. This is just a heuristic guess and may give a
 330    * wrong answer (as AGCT are also amino acid codes).
 331    *
 332    * @param seqs
 333    * @return
 334    */
 335   public static final boolean isNucleotide(SequenceI[] seqs)
 336   {
 337     if (seqs == null)
 338     {
 339       return false;
 340     }
 341     // true if we have seen a nucleotide sequence
 342     boolean na = false;
 343     for (SequenceI seq : seqs)
 344     {
 345       if (seq == null)
 346       {
 347         continue;
 348       }
 349       na = true;
 350       // TODO could possibly make an informed guess just from the first sequence
 351       // to save a lengthy calculation
 352       if (seq.isProtein())
 353       {
 354         // if even one looks like protein, the alignment is protein
 355         return false;
 356       }
 357     }
 358     return na;
 359   }
 360
 361   /**
 362    * Answers true if the character is one of aAcCgGtTuU
 363    *
 364    * @param c
 365    * @return
 366    */
 367   public static boolean isNucleotide(char c)
 368   {
 369     return isNucleotide(c, false);
 370   }
 371
 372   public static boolean isNucleotide(char c, boolean countAmbiguity)
 373   {
 374     char C = Character.toUpperCase(c);
 375     switch (C)
 376     {
 377     case 'A':
 378     case 'C':
 379     case 'G':
 380     case 'T':
 381     case 'U':
 382       return true;
 383     }
 384     if (countAmbiguity)
 385     {
 386       boolean ambiguity = isNucleotideAmbiguity(C);
 387       if (ambiguity)
 388         return true;
 389     }
 390     return false;
 391   }
 392
 393   public static boolean isNucleotideAmbiguity(char c)
 394   {
 395     switch (Character.toUpperCase(c))
 396     {
 397     case 'I':
 398     case 'X':
 399     case 'R':
 400     case 'Y':
 401     case 'W':
 402     case 'S':
 403     case 'M':
 404     case 'K':
 405     case 'B':
 406     case 'H':
 407     case 'D':
 408     case 'V':
 409       return true;
 410     case 'N': // not counting N as nucleotide
 411     }
 412     return false;
 413   }
 414
 415   public static boolean isN(char c)
 416   {
 417     return 'n' == Character.toLowerCase(c);
 418   }
 419
 420   public static boolean isX(char c)
 421   {
 422     return 'x' == Character.toLowerCase(c);
 423   }
 424
 425   /**
 426    * Answers true if every character in the string is one of aAcCgGtTuU, or
 427    * (optionally) a gap character (dot, dash, space), else false
 428    *
 429    * @param s
 430    * @param allowGaps
 431    * @return
 432    */
 433   public static boolean isNucleotideSequence(String s, boolean allowGaps)
 434   {
 435     if (s == null)
 436     {
 437       return false;
 438     }
 439     for (int i = 0; i < s.length(); i++)
 440     {
 441       char c = s.charAt(i);
 442       if (!isNucleotide(c))
 443       {
 444         if (!allowGaps || !isGap(c))
 445         {
 446           return false;
 447         }
 448       }
 449     }
 450     return true;
 451   }
 452
 453   /**
 454    * Convenience overload of isNucleotide
 455    *
 456    * @param seqs
 457    * @return
 458    */
 459   public static boolean isNucleotide(SequenceI[][] seqs)
 460   {
 461     if (seqs == null)
 462     {
 463       return false;
 464     }
 465     List<SequenceI> flattened = new ArrayList<SequenceI>();
 466     for (SequenceI[] ss : seqs)
 467     {
 468       for (SequenceI s : ss)
 469       {
 470         flattened.add(s);
 471       }
 472     }
 473     final SequenceI[] oneDArray = flattened
 474             .toArray(new SequenceI[flattened.size()]);
 475     return isNucleotide(oneDArray);
 476   }
 477
 478   /**
 479    * Compares two residues either case sensitively or case insensitively
 480    * depending on the caseSensitive flag
 481    *
 482    * @param c1
 483    *          first char
 484    * @param c2
 485    *          second char to compare with
 486    * @param caseSensitive
 487    *          if true comparison will be case sensitive otherwise its not
 488    * @return
 489    */
 490   public static boolean isSameResidue(char c1, char c2,
 491           boolean caseSensitive)
 492   {
 493     return caseSensitive ? c1 == c2
 494             : Character.toUpperCase(c1) == Character.toUpperCase(c2);
 495   }
 496 }