src/jalview/analysis/AlignmentUtils.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.2)
   3  * Copyright (C) 2014 The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.AlignmentAnnotation;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.SequenceI;
  27 import jalview.schemes.ResidueProperties;
  28 import jalview.util.MapList;
  29
  30 import java.util.ArrayList;
  31 import java.util.LinkedHashMap;
  32 import java.util.List;
  33 import java.util.Map;
  34
  35 /**
  36  * grab bag of useful alignment manipulation operations Expect these to be
  37  * refactored elsewhere at some point.
  38  *
  39  * @author jimp
  40  *
  41  */
  42 public class AlignmentUtils
  43 {
  44
  45   /**
  46    * Represents the 3 possible results of trying to map one alignment to
  47    * another.
  48    */
  49   public enum MappingResult
  50   {
  51     Mapped, NotMapped, AlreadyMapped
  52   }
  53
  54   /**
  55    * given an existing alignment, create a new alignment including all, or up to
  56    * flankSize additional symbols from each sequence's dataset sequence
  57    *
  58    * @param core
  59    * @param flankSize
  60    * @return AlignmentI
  61    */
  62   public static AlignmentI expandContext(AlignmentI core, int flankSize)
  63   {
  64     List<SequenceI> sq = new ArrayList<SequenceI>();
  65     int maxoffset = 0;
  66     for (SequenceI s : core.getSequences())
  67     {
  68       SequenceI newSeq = s.deriveSequence();
  69       if (newSeq.getStart() > maxoffset
  70               && newSeq.getDatasetSequence().getStart() < s.getStart())
  71       {
  72         maxoffset = newSeq.getStart();
  73       }
  74       sq.add(newSeq);
  75     }
  76     if (flankSize > -1)
  77     {
  78       maxoffset = flankSize;
  79     }
  80     // now add offset to create a new expanded alignment
  81     for (SequenceI s : sq)
  82     {
  83       SequenceI ds = s;
  84       while (ds.getDatasetSequence() != null)
  85       {
  86         ds = ds.getDatasetSequence();
  87       }
  88       int s_end = s.findPosition(s.getStart() + s.getLength());
  89       // find available flanking residues for sequence
  90       int ustream_ds = s.getStart() - ds.getStart(), dstream_ds = ds
  91               .getEnd() - s_end;
  92
  93       // build new flanked sequence
  94
  95       // compute gap padding to start of flanking sequence
  96       int offset = maxoffset - ustream_ds;
  97
  98       // padding is gapChar x ( maxoffset - min(ustream_ds, flank)
  99       if (flankSize >= 0)
 100       {
 101         if (flankSize < ustream_ds)
 102         {
 103           // take up to flankSize residues
 104           offset = maxoffset - flankSize;
 105           ustream_ds = flankSize;
 106         }
 107         if (flankSize < dstream_ds)
 108         {
 109           dstream_ds = flankSize;
 110         }
 111       }
 112       char[] upstream = new String(ds.getSequence(s.getStart() - 1
 113               - ustream_ds, s.getStart() - 1)).toLowerCase().toCharArray();
 114       char[] downstream = new String(ds.getSequence(s_end - 1, s_end + 1
 115               + dstream_ds)).toLowerCase().toCharArray();
 116       char[] coreseq = s.getSequence();
 117       char[] nseq = new char[offset + upstream.length + downstream.length
 118               + coreseq.length];
 119       char c = core.getGapCharacter();
 120       // TODO could lowercase the flanking regions
 121       int p = 0;
 122       for (; p < offset; p++)
 123       {
 124         nseq[p] = c;
 125       }
 126       // s.setSequence(new String(upstream).toLowerCase()+new String(coreseq) +
 127       // new String(downstream).toLowerCase());
 128       System.arraycopy(upstream, 0, nseq, p, upstream.length);
 129       System.arraycopy(coreseq, 0, nseq, p + upstream.length,
 130               coreseq.length);
 131       System.arraycopy(downstream, 0, nseq, p + coreseq.length
 132               + upstream.length, downstream.length);
 133       s.setSequence(new String(nseq));
 134       s.setStart(s.getStart() - ustream_ds);
 135       s.setEnd(s_end + downstream.length);
 136     }
 137     AlignmentI newAl = new jalview.datamodel.Alignment(
 138             sq.toArray(new SequenceI[0]));
 139     for (SequenceI s : sq)
 140     {
 141       if (s.getAnnotation() != null)
 142       {
 143         for (AlignmentAnnotation aa : s.getAnnotation())
 144         {
 145           newAl.addAnnotation(aa);
 146         }
 147       }
 148     }
 149     newAl.setDataset(core.getDataset());
 150     return newAl;
 151   }
 152
 153   /**
 154    * Returns the index (zero-based position) of a sequence in an alignment, or
 155    * -1 if not found.
 156    *
 157    * @param al
 158    * @param seq
 159    * @return
 160    */
 161   public static int getSequenceIndex(AlignmentI al, SequenceI seq)
 162   {
 163     int result = -1;
 164     int pos = 0;
 165     for (SequenceI alSeq : al.getSequences())
 166     {
 167       if (alSeq == seq)
 168       {
 169         result = pos;
 170         break;
 171       }
 172       pos++;
 173     }
 174     return result;
 175   }
 176
 177   /**
 178    * Returns a map of lists of sequences in the alignment, keyed by sequence
 179    * name. For use in mapping between different alignment views of the same
 180    * sequences.
 181    *
 182    * @see jalview.datamodel.AlignmentI#getSequencesByName()
 183    */
 184   public static Map<String, List<SequenceI>> getSequencesByName(
 185           AlignmentI al)
 186   {
 187     Map<String, List<SequenceI>> theMap = new LinkedHashMap<String, List<SequenceI>>();
 188     for (SequenceI seq : al.getSequences())
 189     {
 190       String name = seq.getName();
 191       if (name != null)
 192       {
 193         List<SequenceI> seqs = theMap.get(name);
 194         if (seqs == null)
 195         {
 196           seqs = new ArrayList<SequenceI>();
 197           theMap.put(name, seqs);
 198         }
 199         seqs.add(seq);
 200       }
 201     }
 202     return theMap;
 203   }
 204
 205   /**
 206    * Build mapping of protein to cDNA alignment. Mappings are made between
 207    * sequences which have the same name and compatible lengths. Has a 3-valued
 208    * result: either Mapped (at least one sequence mapping was created),
 209    * AlreadyMapped (all possible sequence mappings already exist), or NotMapped
 210    * (no possible sequence mappings exist).
 211    *
 212    * @param proteinAlignment
 213    * @param cdnaAlignment
 214    * @return
 215    */
 216   public static MappingResult mapProteinToCdna(
 217           final AlignmentI proteinAlignment,
 218           final AlignmentI cdnaAlignment)
 219   {
 220     boolean mappingPossible = false;
 221     boolean mappingPerformed = false;
 222
 223     List<SequenceI> thisSeqs = proteinAlignment.getSequences();
 224
 225     /*
 226      * Build a look-up of cDNA sequences by name, for matching purposes.
 227      */
 228     Map<String, List<SequenceI>> cdnaSeqs = cdnaAlignment
 229             .getSequencesByName();
 230
 231     for (SequenceI aaSeq : thisSeqs)
 232     {
 233       AlignedCodonFrame acf = new AlignedCodonFrame();
 234       List<SequenceI> candidates = cdnaSeqs.get(aaSeq.getName());
 235       if (candidates == null)
 236       {
 237         /*
 238          * No cDNA sequence with matching name, so no mapping possible for this
 239          * protein sequence
 240          */
 241         continue;
 242       }
 243       mappingPossible = true;
 244       for (SequenceI cdnaSeq : candidates)
 245       {
 246         if (!mappingExists(proteinAlignment.getCodonFrames(),
 247                 aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence()))
 248         {
 249           MapList map = mapProteinToCdna(aaSeq, cdnaSeq);
 250           if (map != null)
 251           {
 252             acf.addMap(cdnaSeq, aaSeq, map);
 253             mappingPerformed = true;
 254           }
 255         }
 256       }
 257       proteinAlignment.addCodonFrame(acf);
 258     }
 259
 260     /*
 261      * If at least one mapping was possible but none was done, then the
 262      * alignments are already as mapped as they can be.
 263      */
 264     if (mappingPossible && !mappingPerformed)
 265     {
 266       return MappingResult.AlreadyMapped;
 267     }
 268     else
 269     {
 270       return mappingPerformed ? MappingResult.Mapped
 271               : MappingResult.NotMapped;
 272     }
 273   }
 274
 275   /**
 276    * Answers true if the mappings include one between the given (dataset)
 277    * sequences.
 278    */
 279   public static boolean mappingExists(AlignedCodonFrame[] codonFrames,
 280           SequenceI aaSeq, SequenceI cdnaSeq)
 281   {
 282     if (codonFrames != null)
 283     {
 284       for (AlignedCodonFrame acf : codonFrames)
 285       {
 286         if (cdnaSeq == acf.getDnaForAaSeq(aaSeq))
 287         {
 288           return true;
 289         }
 290       }
 291     }
 292     return false;
 293   }
 294
 295   /**
 296    * Build a mapping (if possible) of a protein to a cDNA sequence. The cDNA
 297    * must be three times the length of the protein, possibly after ignoring
 298    * start and/or stop codons. Returns null if no mapping is determined.
 299    *
 300    * @param proteinSeqs
 301    * @param cdnaSeq
 302    * @return
 303    */
 304   public static MapList mapProteinToCdna(SequenceI proteinSeq,
 305           SequenceI cdnaSeq)
 306   {
 307     String aaSeqString = proteinSeq.getDatasetSequence()
 308             .getSequenceAsString();
 309     String cdnaSeqString = cdnaSeq.getDatasetSequence()
 310             .getSequenceAsString();
 311     if (aaSeqString == null || cdnaSeqString == null)
 312     {
 313       return null;
 314     }
 315
 316     final int mappedLength = 3 * aaSeqString.length();
 317     int cdnaLength = cdnaSeqString.length();
 318     int cdnaStart = 1;
 319     int cdnaEnd = cdnaLength;
 320     final int proteinStart = 1;
 321     final int proteinEnd = aaSeqString.length();
 322
 323     /*
 324      * If lengths don't match, try ignoring stop codon.
 325      */
 326     if (cdnaLength != mappedLength)
 327     {
 328       for (Object stop : ResidueProperties.STOP)
 329       {
 330         if (cdnaSeqString.toUpperCase().endsWith((String) stop))
 331         {
 332           cdnaEnd -= 3;
 333           cdnaLength -= 3;
 334           break;
 335         }
 336       }
 337     }
 338
 339     /*
 340      * If lengths still don't match, try ignoring start codon.
 341      */
 342     if (cdnaLength != mappedLength
 343             && cdnaSeqString.toUpperCase().startsWith(
 344                     ResidueProperties.START))
 345     {
 346       cdnaStart += 3;
 347       cdnaLength -= 3;
 348     }
 349
 350     if (cdnaLength == mappedLength)
 351     {
 352       MapList map = new MapList(new int[]
 353       { cdnaStart, cdnaEnd }, new int[]
 354       { proteinStart, proteinEnd }, 3, 1);
 355       return map;
 356     }
 357     else
 358     {
 359       return null;
 360     }
 361   }
 362
 363   /**
 364    * Align sequence 'seq' to match the alignment of a mapped sequence. Note this
 365    * currently assumes that we are aligning cDNA to match protein.
 366    *
 367    * @param seq
 368    *          the sequence to be realigned
 369    * @param al
 370    *          the alignment whose sequence alignment is to be 'copied'
 371    * @param gap
 372    *          character string represent a gap in the realigned sequence
 373    * @param preserveUnmappedGaps
 374    * @param preserveMappedGaps
 375    * @return true if the sequence was realigned, false if it could not be
 376    */
 377   public static boolean alignSequenceAs(SequenceI seq, AlignmentI al,
 378           String gap, boolean preserveMappedGaps,
 379           boolean preserveUnmappedGaps)
 380   {
 381     /*
 382      * Get any mappings from the source alignment to the target (dataset) sequence.
 383      */
 384     // TODO there may be one AlignedCodonFrame per dataset sequence, or one with
 385     // all mappings. Would it help to constrain this?
 386     AlignedCodonFrame[] mappings = al.getCodonFrame(seq);
 387     if (mappings == null)
 388     {
 389       return false;
 390     }
 391
 392     /*
 393      * Locate the aligned source sequence whose dataset sequence is mapped. We
 394      * just take the first match here (as we can't align cDNA like more than one
 395      * protein sequence).
 396      */
 397     SequenceI alignFrom = null;
 398     AlignedCodonFrame mapping = null;
 399     for (AlignedCodonFrame mp : mappings)
 400     {
 401       alignFrom = mp.findAlignedSequence(seq.getDatasetSequence(), al);
 402       if (alignFrom != null)
 403       {
 404         mapping = mp;
 405         break;
 406       }
 407     }
 408
 409     if (alignFrom == null)
 410     {
 411       return false;
 412     }
 413     alignSequenceAs(seq, alignFrom, mapping, gap, al.getGapCharacter(),
 414             preserveMappedGaps, preserveUnmappedGaps);
 415     return true;
 416   }
 417
 418   /**
 419    * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to
 420    * match residues and codons. Flags control whether existing gaps in unmapped
 421    * (intron) and mapped (exon) regions are preserved or not. Gaps linking intro
 422    * and exon are only retained if both flags are set.
 423    *
 424    * @param alignTo
 425    * @param alignFrom
 426    * @param mapping
 427    * @param myGap
 428    * @param sourceGap
 429    * @param preserveUnmappedGaps
 430    * @param preserveMappedGaps
 431    */
 432   public static void alignSequenceAs(SequenceI alignTo,
 433           SequenceI alignFrom,
 434           AlignedCodonFrame mapping, String myGap, char sourceGap,
 435           boolean preserveMappedGaps, boolean preserveUnmappedGaps)
 436   {
 437     // TODO generalise to work for Protein-Protein, dna-dna, dna-protein
 438     final char[] thisSeq = alignTo.getSequence();
 439     final char[] thatAligned = alignFrom.getSequence();
 440     StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length);
 441
 442     // aligned and dataset sequence positions, all base zero
 443     int thisSeqPos = 0;
 444     int sourceDsPos = 0;
 445
 446     int basesWritten = 0;
 447     char myGapChar = myGap.charAt(0);
 448     int ratio = myGap.length();
 449
 450     /*
 451      * Traverse the aligned protein sequence.
 452      */
 453     int sourceGapMappedLength = 0;
 454     boolean inExon = false;
 455     for (char sourceChar : thatAligned)
 456     {
 457       if (sourceChar == sourceGap)
 458       {
 459         sourceGapMappedLength += ratio;
 460         continue;
 461       }
 462
 463       /*
 464        * Found a residue. Locate its mapped codon (start) position.
 465        */
 466       sourceDsPos++;
 467       // Note mapping positions are base 1, our sequence positions base 0
 468       int[] mappedPos = mapping.getMappedRegion(alignTo, alignFrom,
 469               sourceDsPos);
 470       if (mappedPos == null)
 471       {
 472         /*
 473          * Abort realignment if unmapped protein. Or could ignore it??
 474          */
 475         System.err.println("Can't align: no codon mapping to residue "
 476                 + sourceDsPos + "(" + sourceChar + ")");
 477         return;
 478       }
 479
 480       int mappedCodonStart = mappedPos[0]; // position (1...) of codon start
 481       int mappedCodonEnd = mappedPos[mappedPos.length - 1]; // codon end pos
 482       StringBuilder trailingCopiedGap = new StringBuilder();
 483
 484       /*
 485        * Copy dna sequence up to and including this codon. Optionally, include
 486        * gaps before the codon starts (in introns) and/or after the codon starts
 487        * (in exons).
 488        *
 489        * Note this only works for 'linear' splicing, not reverse or interleaved.
 490        * But then 'align dna as protein' doesn't make much sense otherwise.
 491        */
 492       int intronLength = 0;
 493       while (basesWritten < mappedCodonEnd && thisSeqPos < thisSeq.length)
 494       {
 495         final char c = thisSeq[thisSeqPos++];
 496         if (c != myGapChar)
 497         {
 498           basesWritten++;
 499
 500           if (basesWritten < mappedCodonStart)
 501           {
 502             /*
 503              * Found an unmapped (intron) base. First add in any preceding gaps
 504              * (if wanted).
 505              */
 506             if (preserveUnmappedGaps && trailingCopiedGap.length() > 0)
 507             {
 508               thisAligned.append(trailingCopiedGap.toString());
 509               intronLength += trailingCopiedGap.length();
 510               trailingCopiedGap = new StringBuilder();
 511             }
 512             intronLength++;
 513             inExon = false;
 514           }
 515           else
 516           {
 517             final boolean startOfCodon = basesWritten == mappedCodonStart;
 518             int gapsToAdd = calculateGapsToInsert(preserveMappedGaps,
 519                     preserveUnmappedGaps, sourceGapMappedLength, inExon,
 520                     trailingCopiedGap.length(), intronLength, startOfCodon);
 521             for (int i = 0; i < gapsToAdd; i++)
 522             {
 523               thisAligned.append(myGapChar);
 524             }
 525             sourceGapMappedLength = 0;
 526             inExon = true;
 527           }
 528           thisAligned.append(c);
 529           trailingCopiedGap = new StringBuilder();
 530         }
 531         else
 532         {
 533           if (inExon && preserveMappedGaps)
 534           {
 535             trailingCopiedGap.append(myGapChar);
 536           }
 537           else if (!inExon && preserveUnmappedGaps)
 538           {
 539             trailingCopiedGap.append(myGapChar);
 540           }
 541         }
 542       }
 543     }
 544
 545     /*
 546      * At end of protein sequence. Copy any remaining dna sequence, optionally
 547      * including (intron) gaps. We do not copy trailing gaps in protein.
 548      */
 549     while (thisSeqPos < thisSeq.length)
 550     {
 551       final char c = thisSeq[thisSeqPos++];
 552       if (c != myGapChar || preserveUnmappedGaps)
 553       {
 554         thisAligned.append(c);
 555       }
 556     }
 557
 558     /*
 559      * All done aligning, set the aligned sequence.
 560      */
 561     alignTo.setSequence(new String(thisAligned));
 562   }
 563
 564   /**
 565    * Helper method to work out how many gaps to insert when realigning.
 566    *
 567    * @param preserveMappedGaps
 568    * @param preserveUnmappedGaps
 569    * @param sourceGapMappedLength
 570    * @param inExon
 571    * @param trailingCopiedGap
 572    * @param intronLength
 573    * @param startOfCodon
 574    * @return
 575    */
 576   protected static int calculateGapsToInsert(boolean preserveMappedGaps,
 577           boolean preserveUnmappedGaps, int sourceGapMappedLength,
 578           boolean inExon, int trailingGapLength,
 579           int intronLength, final boolean startOfCodon)
 580   {
 581     int gapsToAdd = 0;
 582     if (startOfCodon)
 583     {
 584       /*
 585        * Reached start of codon. Ignore trailing gaps in intron unless we are
 586        * preserving gaps in both exon and intron. Ignore them anyway if the
 587        * protein alignment introduces a gap at least as large as the intronic
 588        * region.
 589        */
 590       if (inExon && !preserveMappedGaps)
 591       {
 592         trailingGapLength = 0;
 593       }
 594       if (!inExon && !(preserveMappedGaps && preserveUnmappedGaps))
 595       {
 596         trailingGapLength = 0;
 597       }
 598       if (inExon)
 599       {
 600         gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
 601       }
 602       else
 603       {
 604         if (intronLength + trailingGapLength <= sourceGapMappedLength)
 605         {
 606           gapsToAdd = sourceGapMappedLength - intronLength;
 607         }
 608         else
 609         {
 610           gapsToAdd = Math.min(intronLength + trailingGapLength
 611                   - sourceGapMappedLength, trailingGapLength);
 612         }
 613       }
 614     }
 615     else
 616     {
 617       /*
 618        * second or third base of codon; check for any gaps in dna
 619        */
 620       if (!preserveMappedGaps)
 621       {
 622         trailingGapLength = 0;
 623       }
 624       gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
 625     }
 626     return gapsToAdd;
 627   }
 628 }