src/jalview/analysis/AlignmentUtils.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import jalview.datamodel.AlignedCodon;
  24 import jalview.datamodel.AlignedCodonFrame;
  25 import jalview.datamodel.Alignment;
  26 import jalview.datamodel.AlignmentAnnotation;
  27 import jalview.datamodel.AlignmentI;
  28 import jalview.datamodel.DBRefEntry;
  29 import jalview.datamodel.DBRefSource;
  30 import jalview.datamodel.FeatureProperties;
  31 import jalview.datamodel.Mapping;
  32 import jalview.datamodel.SearchResults;
  33 import jalview.datamodel.Sequence;
  34 import jalview.datamodel.SequenceGroup;
  35 import jalview.datamodel.SequenceI;
  36 import jalview.schemes.ResidueProperties;
  37 import jalview.util.DBRefUtils;
  38 import jalview.util.MapList;
  39 import jalview.util.MappingUtils;
  40
  41 import java.util.ArrayList;
  42 import java.util.Arrays;
  43 import java.util.Collection;
  44 import java.util.HashMap;
  45 import java.util.HashSet;
  46 import java.util.Iterator;
  47 import java.util.LinkedHashMap;
  48 import java.util.List;
  49 import java.util.Map;
  50 import java.util.Map.Entry;
  51 import java.util.Set;
  52 import java.util.TreeMap;
  53
  54 /**
  55  * grab bag of useful alignment manipulation operations Expect these to be
  56  * refactored elsewhere at some point.
  57  *
  58  * @author jimp
  59  *
  60  */
  61 public class AlignmentUtils
  62 {
  63
  64   /**
  65    * given an existing alignment, create a new alignment including all, or up to
  66    * flankSize additional symbols from each sequence's dataset sequence
  67    *
  68    * @param core
  69    * @param flankSize
  70    * @return AlignmentI
  71    */
  72   public static AlignmentI expandContext(AlignmentI core, int flankSize)
  73   {
  74     List<SequenceI> sq = new ArrayList<SequenceI>();
  75     int maxoffset = 0;
  76     for (SequenceI s : core.getSequences())
  77     {
  78       SequenceI newSeq = s.deriveSequence();
  79       final int newSeqStart = newSeq.getStart() - 1;
  80       if (newSeqStart > maxoffset
  81               && newSeq.getDatasetSequence().getStart() < s.getStart())
  82       {
  83         maxoffset = newSeqStart;
  84       }
  85       sq.add(newSeq);
  86     }
  87     if (flankSize > -1)
  88     {
  89       maxoffset = Math.min(maxoffset, flankSize);
  90     }
  91
  92     /*
  93      * now add offset left and right to create an expanded alignment
  94      */
  95     for (SequenceI s : sq)
  96     {
  97       SequenceI ds = s;
  98       while (ds.getDatasetSequence() != null)
  99       {
 100         ds = ds.getDatasetSequence();
 101       }
 102       int s_end = s.findPosition(s.getStart() + s.getLength());
 103       // find available flanking residues for sequence
 104       int ustream_ds = s.getStart() - ds.getStart();
 105       int dstream_ds = ds.getEnd() - s_end;
 106
 107       // build new flanked sequence
 108
 109       // compute gap padding to start of flanking sequence
 110       int offset = maxoffset - ustream_ds;
 111
 112       // padding is gapChar x ( maxoffset - min(ustream_ds, flank)
 113       if (flankSize >= 0)
 114       {
 115         if (flankSize < ustream_ds)
 116         {
 117           // take up to flankSize residues
 118           offset = maxoffset - flankSize;
 119           ustream_ds = flankSize;
 120         }
 121         if (flankSize <= dstream_ds)
 122         {
 123           dstream_ds = flankSize - 1;
 124         }
 125       }
 126       // TODO use Character.toLowerCase to avoid creating String objects?
 127       char[] upstream = new String(ds.getSequence(s.getStart() - 1
 128               - ustream_ds, s.getStart() - 1)).toLowerCase().toCharArray();
 129       char[] downstream = new String(ds.getSequence(s_end - 1, s_end
 130               + dstream_ds)).toLowerCase().toCharArray();
 131       char[] coreseq = s.getSequence();
 132       char[] nseq = new char[offset + upstream.length + downstream.length
 133               + coreseq.length];
 134       char c = core.getGapCharacter();
 135
 136       int p = 0;
 137       for (; p < offset; p++)
 138       {
 139         nseq[p] = c;
 140       }
 141
 142       System.arraycopy(upstream, 0, nseq, p, upstream.length);
 143       System.arraycopy(coreseq, 0, nseq, p + upstream.length,
 144               coreseq.length);
 145       System.arraycopy(downstream, 0, nseq, p + coreseq.length
 146               + upstream.length, downstream.length);
 147       s.setSequence(new String(nseq));
 148       s.setStart(s.getStart() - ustream_ds);
 149       s.setEnd(s_end + downstream.length);
 150     }
 151     AlignmentI newAl = new jalview.datamodel.Alignment(
 152             sq.toArray(new SequenceI[0]));
 153     for (SequenceI s : sq)
 154     {
 155       if (s.getAnnotation() != null)
 156       {
 157         for (AlignmentAnnotation aa : s.getAnnotation())
 158         {
 159           aa.adjustForAlignment(); // JAL-1712 fix
 160           newAl.addAnnotation(aa);
 161         }
 162       }
 163     }
 164     newAl.setDataset(core.getDataset());
 165     return newAl;
 166   }
 167
 168   /**
 169    * Returns the index (zero-based position) of a sequence in an alignment, or
 170    * -1 if not found.
 171    *
 172    * @param al
 173    * @param seq
 174    * @return
 175    */
 176   public static int getSequenceIndex(AlignmentI al, SequenceI seq)
 177   {
 178     int result = -1;
 179     int pos = 0;
 180     for (SequenceI alSeq : al.getSequences())
 181     {
 182       if (alSeq == seq)
 183       {
 184         result = pos;
 185         break;
 186       }
 187       pos++;
 188     }
 189     return result;
 190   }
 191
 192   /**
 193    * Returns a map of lists of sequences in the alignment, keyed by sequence
 194    * name. For use in mapping between different alignment views of the same
 195    * sequences.
 196    *
 197    * @see jalview.datamodel.AlignmentI#getSequencesByName()
 198    */
 199   public static Map<String, List<SequenceI>> getSequencesByName(
 200           AlignmentI al)
 201   {
 202     Map<String, List<SequenceI>> theMap = new LinkedHashMap<String, List<SequenceI>>();
 203     for (SequenceI seq : al.getSequences())
 204     {
 205       String name = seq.getName();
 206       if (name != null)
 207       {
 208         List<SequenceI> seqs = theMap.get(name);
 209         if (seqs == null)
 210         {
 211           seqs = new ArrayList<SequenceI>();
 212           theMap.put(name, seqs);
 213         }
 214         seqs.add(seq);
 215       }
 216     }
 217     return theMap;
 218   }
 219
 220   /**
 221    * Build mapping of protein to cDNA alignment. Mappings are made between
 222    * sequences where the cDNA translates to the protein sequence. Any new
 223    * mappings are added to the protein alignment. Returns true if any mappings
 224    * either already exist or were added, else false.
 225    *
 226    * @param proteinAlignment
 227    * @param cdnaAlignment
 228    * @return
 229    */
 230   public static boolean mapProteinAlignmentToCdna(
 231           final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment)
 232   {
 233     if (proteinAlignment == null || cdnaAlignment == null)
 234     {
 235       return false;
 236     }
 237
 238     Set<SequenceI> mappedDna = new HashSet<SequenceI>();
 239     Set<SequenceI> mappedProtein = new HashSet<SequenceI>();
 240
 241     /*
 242      * First pass - map sequences where cross-references exist. This include
 243      * 1-to-many mappings to support, for example, variant cDNA.
 244      */
 245     boolean mappingPerformed = mapProteinToCdna(proteinAlignment,
 246             cdnaAlignment, mappedDna, mappedProtein, true);
 247
 248     /*
 249      * Second pass - map sequences where no cross-references exist. This only
 250      * does 1-to-1 mappings and assumes corresponding sequences are in the same
 251      * order in the alignments.
 252      */
 253     mappingPerformed |= mapProteinToCdna(proteinAlignment, cdnaAlignment,
 254             mappedDna, mappedProtein, false);
 255     return mappingPerformed;
 256   }
 257
 258   /**
 259    * Make mappings between compatible sequences (where the cDNA translation
 260    * matches the protein).
 261    *
 262    * @param proteinAlignment
 263    * @param cdnaAlignment
 264    * @param mappedDna
 265    *          a set of mapped DNA sequences (to add to)
 266    * @param mappedProtein
 267    *          a set of mapped Protein sequences (to add to)
 268    * @param xrefsOnly
 269    *          if true, only map sequences where xrefs exist
 270    * @return
 271    */
 272   protected static boolean mapProteinToCdna(
 273           final AlignmentI proteinAlignment,
 274           final AlignmentI cdnaAlignment, Set<SequenceI> mappedDna,
 275           Set<SequenceI> mappedProtein, boolean xrefsOnly)
 276   {
 277     boolean mappingExistsOrAdded = false;
 278     List<SequenceI> thisSeqs = proteinAlignment.getSequences();
 279     for (SequenceI aaSeq : thisSeqs)
 280     {
 281       boolean proteinMapped = false;
 282       AlignedCodonFrame acf = new AlignedCodonFrame();
 283
 284       for (SequenceI cdnaSeq : cdnaAlignment.getSequences())
 285       {
 286         /*
 287          * Always try to map if sequences have xref to each other; this supports
 288          * variant cDNA or alternative splicing for a protein sequence.
 289          *
 290          * If no xrefs, try to map progressively, assuming that alignments have
 291          * mappable sequences in corresponding order. These are not
 292          * many-to-many, as that would risk mixing species with similar cDNA
 293          * sequences.
 294          */
 295         if (xrefsOnly && !AlignmentUtils.haveCrossRef(aaSeq, cdnaSeq))
 296         {
 297           continue;
 298         }
 299
 300         /*
 301          * Don't map non-xrefd sequences more than once each. This heuristic
 302          * allows us to pair up similar sequences in ordered alignments.
 303          */
 304         if (!xrefsOnly
 305                 && (mappedProtein.contains(aaSeq) || mappedDna
 306                         .contains(cdnaSeq)))
 307         {
 308           continue;
 309         }
 310         if (mappingExists(proteinAlignment.getCodonFrames(),
 311                 aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence()))
 312         {
 313           mappingExistsOrAdded = true;
 314         }
 315         else
 316         {
 317           MapList map = mapProteinSequenceToCdna(aaSeq, cdnaSeq);
 318           if (map != null)
 319           {
 320             acf.addMap(cdnaSeq, aaSeq, map);
 321             mappingExistsOrAdded = true;
 322             proteinMapped = true;
 323             mappedDna.add(cdnaSeq);
 324             mappedProtein.add(aaSeq);
 325           }
 326         }
 327       }
 328       if (proteinMapped)
 329       {
 330         proteinAlignment.addCodonFrame(acf);
 331       }
 332     }
 333     return mappingExistsOrAdded;
 334   }
 335
 336   /**
 337    * Answers true if the mappings include one between the given (dataset)
 338    * sequences.
 339    */
 340   public static boolean mappingExists(List<AlignedCodonFrame> mappings,
 341           SequenceI aaSeq, SequenceI cdnaSeq)
 342   {
 343     if (mappings != null)
 344     {
 345       for (AlignedCodonFrame acf : mappings)
 346       {
 347         if (cdnaSeq == acf.getDnaForAaSeq(aaSeq))
 348         {
 349           return true;
 350         }
 351       }
 352     }
 353     return false;
 354   }
 355
 356   /**
 357    * Build a mapping (if possible) of a protein to a cDNA sequence. The cDNA
 358    * must be three times the length of the protein, possibly after ignoring
 359    * start and/or stop codons, and must translate to the protein. Returns null
 360    * if no mapping is determined.
 361    *
 362    * @param proteinSeqs
 363    * @param cdnaSeq
 364    * @return
 365    */
 366   public static MapList mapProteinSequenceToCdna(SequenceI proteinSeq,
 367           SequenceI cdnaSeq)
 368   {
 369     /*
 370      * Here we handle either dataset sequence set (desktop) or absent (applet).
 371      * Use only the char[] form of the sequence to avoid creating possibly large
 372      * String objects.
 373      */
 374     final SequenceI proteinDataset = proteinSeq.getDatasetSequence();
 375     char[] aaSeqChars = proteinDataset != null ? proteinDataset
 376             .getSequence() : proteinSeq.getSequence();
 377     final SequenceI cdnaDataset = cdnaSeq.getDatasetSequence();
 378     char[] cdnaSeqChars = cdnaDataset != null ? cdnaDataset.getSequence()
 379             : cdnaSeq.getSequence();
 380     if (aaSeqChars == null || cdnaSeqChars == null)
 381     {
 382       return null;
 383     }
 384
 385     /*
 386      * cdnaStart/End, proteinStartEnd are base 1 (for dataset sequence mapping)
 387      */
 388     final int mappedLength = 3 * aaSeqChars.length;
 389     int cdnaLength = cdnaSeqChars.length;
 390     int cdnaStart = cdnaSeq.getStart();
 391     int cdnaEnd = cdnaSeq.getEnd();
 392     final int proteinStart = proteinSeq.getStart();
 393     final int proteinEnd = proteinSeq.getEnd();
 394
 395     /*
 396      * If lengths don't match, try ignoring stop codon.
 397      */
 398     if (cdnaLength != mappedLength && cdnaLength > 2)
 399     {
 400       String lastCodon = String.valueOf(cdnaSeqChars, cdnaLength - 3, 3)
 401               .toUpperCase();
 402       for (String stop : ResidueProperties.STOP)
 403       {
 404         if (lastCodon.equals(stop))
 405         {
 406           cdnaEnd -= 3;
 407           cdnaLength -= 3;
 408           break;
 409         }
 410       }
 411     }
 412
 413     /*
 414      * If lengths still don't match, try ignoring start codon.
 415      */
 416     int startOffset = 0;
 417     if (cdnaLength != mappedLength
 418             && cdnaLength > 2
 419             && String.valueOf(cdnaSeqChars, 0, 3).toUpperCase()
 420                     .equals(ResidueProperties.START))
 421     {
 422       startOffset += 3;
 423       cdnaStart += 3;
 424       cdnaLength -= 3;
 425     }
 426
 427     if (cdnaLength != mappedLength)
 428     {
 429       return null;
 430     }
 431     if (!translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
 432     {
 433       return null;
 434     }
 435     MapList map = new MapList(new int[] { cdnaStart, cdnaEnd }, new int[] {
 436         proteinStart, proteinEnd }, 3, 1);
 437     return map;
 438   }
 439
 440   /**
 441    * Test whether the given cdna sequence, starting at the given offset,
 442    * translates to the given amino acid sequence, using the standard translation
 443    * table. Designed to fail fast i.e. as soon as a mismatch position is found.
 444    *
 445    * @param cdnaSeqChars
 446    * @param cdnaStart
 447    * @param aaSeqChars
 448    * @return
 449    */
 450   protected static boolean translatesAs(char[] cdnaSeqChars, int cdnaStart,
 451           char[] aaSeqChars)
 452   {
 453     if (cdnaSeqChars == null || aaSeqChars == null)
 454     {
 455       return false;
 456     }
 457
 458     int aaResidue = 0;
 459     for (int i = cdnaStart; i < cdnaSeqChars.length - 2
 460             && aaResidue < aaSeqChars.length; i += 3, aaResidue++)
 461     {
 462       String codon = String.valueOf(cdnaSeqChars, i, 3);
 463       final String translated = ResidueProperties.codonTranslate(codon);
 464       /*
 465        * allow * in protein to match untranslatable in dna
 466        */
 467       final char aaRes = aaSeqChars[aaResidue];
 468       if ((translated == null || "STOP".equals(translated)) && aaRes == '*')
 469       {
 470         continue;
 471       }
 472       if (translated == null || !(aaRes == translated.charAt(0)))
 473       {
 474         // debug
 475         // System.out.println(("Mismatch at " + i + "/" + aaResidue + ": "
 476         // + codon + "(" + translated + ") != " + aaRes));
 477         return false;
 478       }
 479     }
 480     // fail if we didn't match all of the aa sequence
 481     return (aaResidue == aaSeqChars.length);
 482   }
 483
 484   /**
 485    * Align sequence 'seq' to match the alignment of a mapped sequence. Note this
 486    * currently assumes that we are aligning cDNA to match protein.
 487    *
 488    * @param seq
 489    *          the sequence to be realigned
 490    * @param al
 491    *          the alignment whose sequence alignment is to be 'copied'
 492    * @param gap
 493    *          character string represent a gap in the realigned sequence
 494    * @param preserveUnmappedGaps
 495    * @param preserveMappedGaps
 496    * @return true if the sequence was realigned, false if it could not be
 497    */
 498   public static boolean alignSequenceAs(SequenceI seq, AlignmentI al,
 499           String gap, boolean preserveMappedGaps,
 500           boolean preserveUnmappedGaps)
 501   {
 502     /*
 503      * Get any mappings from the source alignment to the target (dataset)
 504      * sequence.
 505      */
 506     // TODO there may be one AlignedCodonFrame per dataset sequence, or one with
 507     // all mappings. Would it help to constrain this?
 508     List<AlignedCodonFrame> mappings = al.getCodonFrame(seq);
 509     if (mappings == null || mappings.isEmpty())
 510     {
 511       return false;
 512     }
 513
 514     /*
 515      * Locate the aligned source sequence whose dataset sequence is mapped. We
 516      * just take the first match here (as we can't align like more than one
 517      * sequence).
 518      */
 519     SequenceI alignFrom = null;
 520     AlignedCodonFrame mapping = null;
 521     for (AlignedCodonFrame mp : mappings)
 522     {
 523       alignFrom = mp.findAlignedSequence(seq.getDatasetSequence(), al);
 524       if (alignFrom != null)
 525       {
 526         mapping = mp;
 527         break;
 528       }
 529     }
 530
 531     if (alignFrom == null)
 532     {
 533       return false;
 534     }
 535     alignSequenceAs(seq, alignFrom, mapping, gap, al.getGapCharacter(),
 536             preserveMappedGaps, preserveUnmappedGaps);
 537     return true;
 538   }
 539
 540   /**
 541    * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to
 542    * match residues and codons. Flags control whether existing gaps in unmapped
 543    * (intron) and mapped (exon) regions are preserved or not. Gaps between
 544    * intron and exon are only retained if both flags are set.
 545    *
 546    * @param alignTo
 547    * @param alignFrom
 548    * @param mapping
 549    * @param myGap
 550    * @param sourceGap
 551    * @param preserveUnmappedGaps
 552    * @param preserveMappedGaps
 553    */
 554   public static void alignSequenceAs(SequenceI alignTo,
 555           SequenceI alignFrom, AlignedCodonFrame mapping, String myGap,
 556           char sourceGap, boolean preserveMappedGaps,
 557           boolean preserveUnmappedGaps)
 558   {
 559     // TODO generalise to work for Protein-Protein, dna-dna, dna-protein
 560
 561     // aligned and dataset sequence positions, all base zero
 562     int thisSeqPos = 0;
 563     int sourceDsPos = 0;
 564
 565     int basesWritten = 0;
 566     char myGapChar = myGap.charAt(0);
 567     int ratio = myGap.length();
 568
 569     int fromOffset = alignFrom.getStart() - 1;
 570     int toOffset = alignTo.getStart() - 1;
 571     int sourceGapMappedLength = 0;
 572     boolean inExon = false;
 573     final char[] thisSeq = alignTo.getSequence();
 574     final char[] thatAligned = alignFrom.getSequence();
 575     StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length);
 576
 577     /*
 578      * Traverse the 'model' aligned sequence
 579      */
 580     for (char sourceChar : thatAligned)
 581     {
 582       if (sourceChar == sourceGap)
 583       {
 584         sourceGapMappedLength += ratio;
 585         continue;
 586       }
 587
 588       /*
 589        * Found a non-gap character. Locate its mapped region if any.
 590        */
 591       sourceDsPos++;
 592       // Note mapping positions are base 1, our sequence positions base 0
 593       int[] mappedPos = mapping.getMappedRegion(alignTo, alignFrom,
 594               sourceDsPos + fromOffset);
 595       if (mappedPos == null)
 596       {
 597         /*
 598          * unmapped position; treat like a gap
 599          */
 600         sourceGapMappedLength += ratio;
 601         // System.err.println("Can't align: no codon mapping to residue "
 602         // + sourceDsPos + "(" + sourceChar + ")");
 603         // return;
 604         continue;
 605       }
 606
 607       int mappedCodonStart = mappedPos[0]; // position (1...) of codon start
 608       int mappedCodonEnd = mappedPos[mappedPos.length - 1]; // codon end pos
 609       StringBuilder trailingCopiedGap = new StringBuilder();
 610
 611       /*
 612        * Copy dna sequence up to and including this codon. Optionally, include
 613        * gaps before the codon starts (in introns) and/or after the codon starts
 614        * (in exons).
 615        *
 616        * Note this only works for 'linear' splicing, not reverse or interleaved.
 617        * But then 'align dna as protein' doesn't make much sense otherwise.
 618        */
 619       int intronLength = 0;
 620       while (basesWritten + toOffset < mappedCodonEnd
 621               && thisSeqPos < thisSeq.length)
 622       {
 623         final char c = thisSeq[thisSeqPos++];
 624         if (c != myGapChar)
 625         {
 626           basesWritten++;
 627           int sourcePosition = basesWritten + toOffset;
 628           if (sourcePosition < mappedCodonStart)
 629           {
 630             /*
 631              * Found an unmapped (intron) base. First add in any preceding gaps
 632              * (if wanted).
 633              */
 634             if (preserveUnmappedGaps && trailingCopiedGap.length() > 0)
 635             {
 636               thisAligned.append(trailingCopiedGap.toString());
 637               intronLength += trailingCopiedGap.length();
 638               trailingCopiedGap = new StringBuilder();
 639             }
 640             intronLength++;
 641             inExon = false;
 642           }
 643           else
 644           {
 645             final boolean startOfCodon = sourcePosition == mappedCodonStart;
 646             int gapsToAdd = calculateGapsToInsert(preserveMappedGaps,
 647                     preserveUnmappedGaps, sourceGapMappedLength, inExon,
 648                     trailingCopiedGap.length(), intronLength, startOfCodon);
 649             for (int i = 0; i < gapsToAdd; i++)
 650             {
 651               thisAligned.append(myGapChar);
 652             }
 653             sourceGapMappedLength = 0;
 654             inExon = true;
 655           }
 656           thisAligned.append(c);
 657           trailingCopiedGap = new StringBuilder();
 658         }
 659         else
 660         {
 661           if (inExon && preserveMappedGaps)
 662           {
 663             trailingCopiedGap.append(myGapChar);
 664           }
 665           else if (!inExon && preserveUnmappedGaps)
 666           {
 667             trailingCopiedGap.append(myGapChar);
 668           }
 669         }
 670       }
 671     }
 672
 673     /*
 674      * At end of model aligned sequence. Copy any remaining target sequence, optionally
 675      * including (intron) gaps.
 676      */
 677     while (thisSeqPos < thisSeq.length)
 678     {
 679       final char c = thisSeq[thisSeqPos++];
 680       if (c != myGapChar || preserveUnmappedGaps)
 681       {
 682         thisAligned.append(c);
 683       }
 684       sourceGapMappedLength--;
 685     }
 686
 687     /*
 688      * finally add gaps to pad for any trailing source gaps or
 689      * unmapped characters
 690      */
 691     if (preserveUnmappedGaps)
 692     {
 693       while (sourceGapMappedLength > 0)
 694       {
 695         thisAligned.append(myGapChar);
 696         sourceGapMappedLength--;
 697       }
 698     }
 699
 700     /*
 701      * All done aligning, set the aligned sequence.
 702      */
 703     alignTo.setSequence(new String(thisAligned));
 704   }
 705
 706   /**
 707    * Helper method to work out how many gaps to insert when realigning.
 708    *
 709    * @param preserveMappedGaps
 710    * @param preserveUnmappedGaps
 711    * @param sourceGapMappedLength
 712    * @param inExon
 713    * @param trailingCopiedGap
 714    * @param intronLength
 715    * @param startOfCodon
 716    * @return
 717    */
 718   protected static int calculateGapsToInsert(boolean preserveMappedGaps,
 719           boolean preserveUnmappedGaps, int sourceGapMappedLength,
 720           boolean inExon, int trailingGapLength, int intronLength,
 721           final boolean startOfCodon)
 722   {
 723     int gapsToAdd = 0;
 724     if (startOfCodon)
 725     {
 726       /*
 727        * Reached start of codon. Ignore trailing gaps in intron unless we are
 728        * preserving gaps in both exon and intron. Ignore them anyway if the
 729        * protein alignment introduces a gap at least as large as the intronic
 730        * region.
 731        */
 732       if (inExon && !preserveMappedGaps)
 733       {
 734         trailingGapLength = 0;
 735       }
 736       if (!inExon && !(preserveMappedGaps && preserveUnmappedGaps))
 737       {
 738         trailingGapLength = 0;
 739       }
 740       if (inExon)
 741       {
 742         gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
 743       }
 744       else
 745       {
 746         if (intronLength + trailingGapLength <= sourceGapMappedLength)
 747         {
 748           gapsToAdd = sourceGapMappedLength - intronLength;
 749         }
 750         else
 751         {
 752           gapsToAdd = Math.min(intronLength + trailingGapLength
 753                   - sourceGapMappedLength, trailingGapLength);
 754         }
 755       }
 756     }
 757     else
 758     {
 759       /*
 760        * second or third base of codon; check for any gaps in dna
 761        */
 762       if (!preserveMappedGaps)
 763       {
 764         trailingGapLength = 0;
 765       }
 766       gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
 767     }
 768     return gapsToAdd;
 769   }
 770
 771   /**
 772    * Returns a list of sequences mapped from the given sequences and aligned
 773    * (gapped) in the same way. For example, the cDNA for aligned protein, where
 774    * a single gap in protein generates three gaps in cDNA.
 775    *
 776    * @param sequences
 777    * @param gapCharacter
 778    * @param mappings
 779    * @return
 780    */
 781   public static List<SequenceI> getAlignedTranslation(
 782           List<SequenceI> sequences, char gapCharacter,
 783           Set<AlignedCodonFrame> mappings)
 784   {
 785     List<SequenceI> alignedSeqs = new ArrayList<SequenceI>();
 786
 787     for (SequenceI seq : sequences)
 788     {
 789       List<SequenceI> mapped = getAlignedTranslation(seq, gapCharacter,
 790               mappings);
 791       alignedSeqs.addAll(mapped);
 792     }
 793     return alignedSeqs;
 794   }
 795
 796   /**
 797    * Returns sequences aligned 'like' the source sequence, as mapped by the
 798    * given mappings. Normally we expect zero or one 'mapped' sequences, but this
 799    * will support 1-to-many as well.
 800    *
 801    * @param seq
 802    * @param gapCharacter
 803    * @param mappings
 804    * @return
 805    */
 806   protected static List<SequenceI> getAlignedTranslation(SequenceI seq,
 807           char gapCharacter, Set<AlignedCodonFrame> mappings)
 808   {
 809     List<SequenceI> result = new ArrayList<SequenceI>();
 810     for (AlignedCodonFrame mapping : mappings)
 811     {
 812       if (mapping.involvesSequence(seq))
 813       {
 814         SequenceI mapped = getAlignedTranslation(seq, gapCharacter, mapping);
 815         if (mapped != null)
 816         {
 817           result.add(mapped);
 818         }
 819       }
 820     }
 821     return result;
 822   }
 823
 824   /**
 825    * Returns the translation of 'seq' (as held in the mapping) with
 826    * corresponding alignment (gaps).
 827    *
 828    * @param seq
 829    * @param gapCharacter
 830    * @param mapping
 831    * @return
 832    */
 833   protected static SequenceI getAlignedTranslation(SequenceI seq,
 834           char gapCharacter, AlignedCodonFrame mapping)
 835   {
 836     String gap = String.valueOf(gapCharacter);
 837     boolean toDna = false;
 838     int fromRatio = 1;
 839     SequenceI mapTo = mapping.getDnaForAaSeq(seq);
 840     if (mapTo != null)
 841     {
 842       // mapping is from protein to nucleotide
 843       toDna = true;
 844       // should ideally get gap count ratio from mapping
 845       gap = String.valueOf(new char[] { gapCharacter, gapCharacter,
 846           gapCharacter });
 847     }
 848     else
 849     {
 850       // mapping is from nucleotide to protein
 851       mapTo = mapping.getAaForDnaSeq(seq);
 852       fromRatio = 3;
 853     }
 854     StringBuilder newseq = new StringBuilder(seq.getLength()
 855             * (toDna ? 3 : 1));
 856
 857     int residueNo = 0; // in seq, base 1
 858     int[] phrase = new int[fromRatio];
 859     int phraseOffset = 0;
 860     int gapWidth = 0;
 861     boolean first = true;
 862     final Sequence alignedSeq = new Sequence("", "");
 863
 864     for (char c : seq.getSequence())
 865     {
 866       if (c == gapCharacter)
 867       {
 868         gapWidth++;
 869         if (gapWidth >= fromRatio)
 870         {
 871           newseq.append(gap);
 872           gapWidth = 0;
 873         }
 874       }
 875       else
 876       {
 877         phrase[phraseOffset++] = residueNo + 1;
 878         if (phraseOffset == fromRatio)
 879         {
 880           /*
 881            * Have read a whole codon (or protein residue), now translate: map
 882            * source phrase to positions in target sequence add characters at
 883            * these positions to newseq Note mapping positions are base 1, our
 884            * sequence positions base 0.
 885            */
 886           SearchResults sr = new SearchResults();
 887           for (int pos : phrase)
 888           {
 889             mapping.markMappedRegion(seq, pos, sr);
 890           }
 891           newseq.append(sr.getCharacters());
 892           if (first)
 893           {
 894             first = false;
 895             // Hack: Copy sequence dataset, name and description from
 896             // SearchResults.match[0].sequence
 897             // TODO? carry over sequence names from original 'complement'
 898             // alignment
 899             SequenceI mappedTo = sr.getResultSequence(0);
 900             alignedSeq.setName(mappedTo.getName());
 901             alignedSeq.setDescription(mappedTo.getDescription());
 902             alignedSeq.setDatasetSequence(mappedTo);
 903           }
 904           phraseOffset = 0;
 905         }
 906         residueNo++;
 907       }
 908     }
 909     alignedSeq.setSequence(newseq.toString());
 910     return alignedSeq;
 911   }
 912
 913   /**
 914    * Realigns the given protein to match the alignment of the dna, using codon
 915    * mappings to translate aligned codon positions to protein residues.
 916    *
 917    * @param protein
 918    *          the alignment whose sequences are realigned by this method
 919    * @param dna
 920    *          the dna alignment whose alignment we are 'copying'
 921    * @return the number of sequences that were realigned
 922    */
 923   public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna)
 924   {
 925     List<SequenceI> unmappedProtein = new ArrayList<SequenceI>();
 926     unmappedProtein.addAll(protein.getSequences());
 927
 928     List<AlignedCodonFrame> mappings = protein.getCodonFrames();
 929
 930     /*
 931      * Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of
 932      * {dnaSequence, {proteinSequence, codonProduct}} at that position. The
 933      * comparator keeps the codon positions ordered.
 934      */
 935     Map<AlignedCodon, Map<SequenceI, String>> alignedCodons = new TreeMap<AlignedCodon, Map<SequenceI, String>>(
 936             new CodonComparator());
 937     for (SequenceI dnaSeq : dna.getSequences())
 938     {
 939       for (AlignedCodonFrame mapping : mappings)
 940       {
 941         Mapping seqMap = mapping.getMappingForSequence(dnaSeq);
 942         SequenceI prot = mapping.findAlignedSequence(
 943                 dnaSeq.getDatasetSequence(), protein);
 944         if (prot != null)
 945         {
 946           addCodonPositions(dnaSeq, prot, protein.getGapCharacter(),
 947                   seqMap, alignedCodons);
 948           unmappedProtein.remove(prot);
 949         }
 950       }
 951     }
 952     return alignProteinAs(protein, alignedCodons, unmappedProtein);
 953   }
 954
 955   /**
 956    * Update the aligned protein sequences to match the codon alignments given in
 957    * the map.
 958    *
 959    * @param protein
 960    * @param alignedCodons
 961    *          an ordered map of codon positions (columns), with sequence/peptide
 962    *          values present in each column
 963    * @param unmappedProtein
 964    * @return
 965    */
 966   protected static int alignProteinAs(AlignmentI protein,
 967           Map<AlignedCodon, Map<SequenceI, String>> alignedCodons,
 968           List<SequenceI> unmappedProtein)
 969   {
 970     /*
 971      * Prefill aligned sequences with gaps before inserting aligned protein
 972      * residues.
 973      */
 974     int alignedWidth = alignedCodons.size();
 975     char[] gaps = new char[alignedWidth];
 976     Arrays.fill(gaps, protein.getGapCharacter());
 977     String allGaps = String.valueOf(gaps);
 978     for (SequenceI seq : protein.getSequences())
 979     {
 980       if (!unmappedProtein.contains(seq))
 981       {
 982         seq.setSequence(allGaps);
 983       }
 984     }
 985
 986     int column = 0;
 987     for (AlignedCodon codon : alignedCodons.keySet())
 988     {
 989       final Map<SequenceI, String> columnResidues = alignedCodons
 990               .get(codon);
 991       for (Entry<SequenceI, String> entry : columnResidues.entrySet())
 992       {
 993         // place translated codon at its column position in sequence
 994         entry.getKey().getSequence()[column] = entry.getValue().charAt(0);
 995       }
 996       column++;
 997     }
 998     return 0;
 999   }
1000
1001   /**
1002    * Populate the map of aligned codons by traversing the given sequence
1003    * mapping, locating the aligned positions of mapped codons, and adding those
1004    * positions and their translation products to the map.
1005    *
1006    * @param dna
1007    *          the aligned sequence we are mapping from
1008    * @param protein
1009    *          the sequence to be aligned to the codons
1010    * @param gapChar
1011    *          the gap character in the dna sequence
1012    * @param seqMap
1013    *          a mapping to a sequence translation
1014    * @param alignedCodons
1015    *          the map we are building up
1016    */
1017   static void addCodonPositions(SequenceI dna, SequenceI protein,
1018           char gapChar, Mapping seqMap,
1019           Map<AlignedCodon, Map<SequenceI, String>> alignedCodons)
1020   {
1021     Iterator<AlignedCodon> codons = seqMap.getCodonIterator(dna, gapChar);
1022     while (codons.hasNext())
1023     {
1024       AlignedCodon codon = codons.next();
1025       Map<SequenceI, String> seqProduct = alignedCodons.get(codon);
1026       if (seqProduct == null)
1027       {
1028         seqProduct = new HashMap<SequenceI, String>();
1029         alignedCodons.put(codon, seqProduct);
1030       }
1031       seqProduct.put(protein, codon.product);
1032     }
1033   }
1034
1035   /**
1036    * Returns true if a cDNA/Protein mapping either exists, or could be made,
1037    * between at least one pair of sequences in the two alignments. Currently,
1038    * the logic is:
1039    * <ul>
1040    * <li>One alignment must be nucleotide, and the other protein</li>
1041    * <li>At least one pair of sequences must be already mapped, or mappable</li>
1042    * <li>Mappable means the nucleotide translation matches the protein sequence</li>
1043    * <li>The translation may ignore start and stop codons if present in the
1044    * nucleotide</li>
1045    * </ul>
1046    *
1047    * @param al1
1048    * @param al2
1049    * @return
1050    */
1051   public static boolean isMappable(AlignmentI al1, AlignmentI al2)
1052   {
1053     if (al1 == null || al2 == null)
1054     {
1055       return false;
1056     }
1057
1058     /*
1059      * Require one nucleotide and one protein
1060      */
1061     if (al1.isNucleotide() == al2.isNucleotide())
1062     {
1063       return false;
1064     }
1065     AlignmentI dna = al1.isNucleotide() ? al1 : al2;
1066     AlignmentI protein = dna == al1 ? al2 : al1;
1067     List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1068     for (SequenceI dnaSeq : dna.getSequences())
1069     {
1070       for (SequenceI proteinSeq : protein.getSequences())
1071       {
1072         if (isMappable(dnaSeq, proteinSeq, mappings))
1073         {
1074           return true;
1075         }
1076       }
1077     }
1078     return false;
1079   }
1080
1081   /**
1082    * Returns true if the dna sequence is mapped, or could be mapped, to the
1083    * protein sequence.
1084    *
1085    * @param dnaSeq
1086    * @param proteinSeq
1087    * @param mappings
1088    * @return
1089    */
1090   protected static boolean isMappable(SequenceI dnaSeq,
1091           SequenceI proteinSeq, List<AlignedCodonFrame> mappings)
1092   {
1093     if (dnaSeq == null || proteinSeq == null)
1094     {
1095       return false;
1096     }
1097
1098     SequenceI dnaDs = dnaSeq.getDatasetSequence() == null ? dnaSeq : dnaSeq
1099             .getDatasetSequence();
1100     SequenceI proteinDs = proteinSeq.getDatasetSequence() == null ? proteinSeq
1101             : proteinSeq.getDatasetSequence();
1102
1103     for (AlignedCodonFrame mapping : mappings)
1104     {
1105       if (proteinDs == mapping.getAaForDnaSeq(dnaDs))
1106       {
1107         /*
1108          * already mapped
1109          */
1110         return true;
1111       }
1112     }
1113
1114     /*
1115      * Just try to make a mapping (it is not yet stored), test whether
1116      * successful.
1117      */
1118     return mapProteinSequenceToCdna(proteinDs, dnaDs) != null;
1119   }
1120
1121   /**
1122    * Finds any reference annotations associated with the sequences in
1123    * sequenceScope, that are not already added to the alignment, and adds them
1124    * to the 'candidates' map. Also populates a lookup table of annotation
1125    * labels, keyed by calcId, for use in constructing tooltips or the like.
1126    *
1127    * @param sequenceScope
1128    *          the sequences to scan for reference annotations
1129    * @param labelForCalcId
1130    *          (optional) map to populate with label for calcId
1131    * @param candidates
1132    *          map to populate with annotations for sequence
1133    * @param al
1134    *          the alignment to check for presence of annotations
1135    */
1136   public static void findAddableReferenceAnnotations(
1137           List<SequenceI> sequenceScope,
1138           Map<String, String> labelForCalcId,
1139           final Map<SequenceI, List<AlignmentAnnotation>> candidates,
1140           AlignmentI al)
1141   {
1142     if (sequenceScope == null)
1143     {
1144       return;
1145     }
1146
1147     /*
1148      * For each sequence in scope, make a list of any annotations on the
1149      * underlying dataset sequence which are not already on the alignment.
1150      *
1151      * Add to a map of { alignmentSequence, <List of annotations to add> }
1152      */
1153     for (SequenceI seq : sequenceScope)
1154     {
1155       SequenceI dataset = seq.getDatasetSequence();
1156       if (dataset == null)
1157       {
1158         continue;
1159       }
1160       AlignmentAnnotation[] datasetAnnotations = dataset.getAnnotation();
1161       if (datasetAnnotations == null)
1162       {
1163         continue;
1164       }
1165       final List<AlignmentAnnotation> result = new ArrayList<AlignmentAnnotation>();
1166       for (AlignmentAnnotation dsann : datasetAnnotations)
1167       {
1168         /*
1169          * Find matching annotations on the alignment. If none is found, then
1170          * add this annotation to the list of 'addable' annotations for this
1171          * sequence.
1172          */
1173         final Iterable<AlignmentAnnotation> matchedAlignmentAnnotations = al
1174                 .findAnnotations(seq, dsann.getCalcId(), dsann.label);
1175         if (!matchedAlignmentAnnotations.iterator().hasNext())
1176         {
1177           result.add(dsann);
1178           if (labelForCalcId != null)
1179           {
1180             labelForCalcId.put(dsann.getCalcId(), dsann.label);
1181           }
1182         }
1183       }
1184       /*
1185        * Save any addable annotations for this sequence
1186        */
1187       if (!result.isEmpty())
1188       {
1189         candidates.put(seq, result);
1190       }
1191     }
1192   }
1193
1194   /**
1195    * Adds annotations to the top of the alignment annotations, in the same order
1196    * as their related sequences.
1197    *
1198    * @param annotations
1199    *          the annotations to add
1200    * @param alignment
1201    *          the alignment to add them to
1202    * @param selectionGroup
1203    *          current selection group (or null if none)
1204    */
1205   public static void addReferenceAnnotations(
1206           Map<SequenceI, List<AlignmentAnnotation>> annotations,
1207           final AlignmentI alignment, final SequenceGroup selectionGroup)
1208   {
1209     for (SequenceI seq : annotations.keySet())
1210     {
1211       for (AlignmentAnnotation ann : annotations.get(seq))
1212       {
1213         AlignmentAnnotation copyAnn = new AlignmentAnnotation(ann);
1214         int startRes = 0;
1215         int endRes = ann.annotations.length;
1216         if (selectionGroup != null)
1217         {
1218           startRes = selectionGroup.getStartRes();
1219           endRes = selectionGroup.getEndRes();
1220         }
1221         copyAnn.restrict(startRes, endRes);
1222
1223         /*
1224          * Add to the sequence (sets copyAnn.datasetSequence), unless the
1225          * original annotation is already on the sequence.
1226          */
1227         if (!seq.hasAnnotation(ann))
1228         {
1229           seq.addAlignmentAnnotation(copyAnn);
1230         }
1231         // adjust for gaps
1232         copyAnn.adjustForAlignment();
1233         // add to the alignment and set visible
1234         alignment.addAnnotation(copyAnn);
1235         copyAnn.visible = true;
1236       }
1237     }
1238   }
1239
1240   /**
1241    * Set visibility of alignment annotations of specified types (labels), for
1242    * specified sequences. This supports controls like
1243    * "Show all secondary structure", "Hide all Temp factor", etc.
1244    *
1245    * @al the alignment to scan for annotations
1246    * @param types
1247    *          the types (labels) of annotations to be updated
1248    * @param forSequences
1249    *          if not null, only annotations linked to one of these sequences are
1250    *          in scope for update; if null, acts on all sequence annotations
1251    * @param anyType
1252    *          if this flag is true, 'types' is ignored (label not checked)
1253    * @param doShow
1254    *          if true, set visibility on, else set off
1255    */
1256   public static void showOrHideSequenceAnnotations(AlignmentI al,
1257           Collection<String> types, List<SequenceI> forSequences,
1258           boolean anyType, boolean doShow)
1259   {
1260     for (AlignmentAnnotation aa : al.getAlignmentAnnotation())
1261     {
1262       if (anyType || types.contains(aa.label))
1263       {
1264         if ((aa.sequenceRef != null)
1265                 && (forSequences == null || forSequences
1266                         .contains(aa.sequenceRef)))
1267         {
1268           aa.visible = doShow;
1269         }
1270       }
1271     }
1272   }
1273
1274   /**
1275    * Returns true if either sequence has a cross-reference to the other
1276    *
1277    * @param seq1
1278    * @param seq2
1279    * @return
1280    */
1281   public static boolean haveCrossRef(SequenceI seq1, SequenceI seq2)
1282   {
1283     // Note: moved here from class CrossRef as the latter class has dependencies
1284     // not availability to the applet's classpath
1285     return hasCrossRef(seq1, seq2) || hasCrossRef(seq2, seq1);
1286   }
1287
1288   /**
1289    * Returns true if seq1 has a cross-reference to seq2. Currently this assumes
1290    * that sequence name is structured as Source|AccessionId.
1291    *
1292    * @param seq1
1293    * @param seq2
1294    * @return
1295    */
1296   public static boolean hasCrossRef(SequenceI seq1, SequenceI seq2)
1297   {
1298     if (seq1 == null || seq2 == null)
1299     {
1300       return false;
1301     }
1302     String name = seq2.getName();
1303     final DBRefEntry[] xrefs = seq1.getDBRef();
1304     if (xrefs != null)
1305     {
1306       for (DBRefEntry xref : xrefs)
1307       {
1308         String xrefName = xref.getSource() + "|" + xref.getAccessionId();
1309         // case-insensitive test, consistent with DBRefEntry.equalRef()
1310         if (xrefName.equalsIgnoreCase(name))
1311         {
1312           return true;
1313         }
1314       }
1315     }
1316     return false;
1317   }
1318
1319   /**
1320    * Constructs an alignment consisting of the mapped exon regions in the given
1321    * nucleotide sequences, and updates mappings to match.
1322    *
1323    * @param dna
1324    *          aligned dna sequences
1325    * @param mappings
1326    *          from dna to protein; these are replaced with new mappings
1327    * @return an alignment whose sequences are the exon-only parts of the dna
1328    *         sequences (or null if no exons are found)
1329    */
1330   public static AlignmentI makeExonAlignment(SequenceI[] dna,
1331           List<AlignedCodonFrame> mappings)
1332   {
1333     List<AlignedCodonFrame> newMappings = new ArrayList<AlignedCodonFrame>();
1334     List<SequenceI> exonSequences = new ArrayList<SequenceI>();
1335
1336     for (SequenceI dnaSeq : dna)
1337     {
1338       final SequenceI ds = dnaSeq.getDatasetSequence();
1339       List<AlignedCodonFrame> seqMappings = MappingUtils
1340               .findMappingsForSequence(ds, mappings);
1341       for (AlignedCodonFrame acf : seqMappings)
1342       {
1343         AlignedCodonFrame newMapping = new AlignedCodonFrame();
1344         final List<SequenceI> mappedExons = makeExonSequences(ds, acf,
1345                 newMapping);
1346         if (!mappedExons.isEmpty())
1347         {
1348           exonSequences.addAll(mappedExons);
1349           newMappings.add(newMapping);
1350         }
1351       }
1352     }
1353     AlignmentI al = new Alignment(
1354             exonSequences.toArray(new SequenceI[exonSequences.size()]));
1355     al.setDataset(null);
1356
1357     /*
1358      * Replace the old mappings with the new ones
1359      */
1360     mappings.clear();
1361     mappings.addAll(newMappings);
1362
1363     return al;
1364   }
1365
1366   /**
1367    * Helper method to make exon-only sequences and populate their mappings to
1368    * protein products
1369    * <p>
1370    * For example, if ggCCaTTcGAg has mappings [3, 4, 6, 7, 9, 10] to protein
1371    * then generate a sequence CCTTGA with mapping [1, 6] to the same protein
1372    * residues
1373    * <p>
1374    * Typically eukaryotic dna will include exons encoding for a single peptide
1375    * sequence i.e. return a single result. Bacterial dna may have overlapping
1376    * exon mappings coding for multiple peptides so return multiple results
1377    * (example EMBL KF591215).
1378    *
1379    * @param dnaSeq
1380    *          a dna dataset sequence
1381    * @param mapping
1382    *          containing one or more mappings of the sequence to protein
1383    * @param newMapping
1384    *          the new mapping to populate, from the exon-only sequences to their
1385    *          mapped protein sequences
1386    * @return
1387    */
1388   protected static List<SequenceI> makeExonSequences(SequenceI dnaSeq,
1389           AlignedCodonFrame mapping, AlignedCodonFrame newMapping)
1390   {
1391     List<SequenceI> exonSequences = new ArrayList<SequenceI>();
1392     List<Mapping> seqMappings = mapping.getMappingsForSequence(dnaSeq);
1393     final char[] dna = dnaSeq.getSequence();
1394     for (Mapping seqMapping : seqMappings)
1395     {
1396       StringBuilder newSequence = new StringBuilder(dnaSeq.getLength());
1397
1398       /*
1399        * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc }
1400        */
1401       final List<int[]> dnaExonRanges = seqMapping.getMap().getFromRanges();
1402       for (int[] range : dnaExonRanges)
1403       {
1404         for (int pos = range[0]; pos <= range[1]; pos++)
1405         {
1406           newSequence.append(dna[pos - 1]);
1407         }
1408       }
1409
1410       SequenceI exon = new Sequence(dnaSeq.getName(),
1411               newSequence.toString());
1412
1413       /*
1414        * Locate any xrefs to CDS database on the protein product and attach to
1415        * the CDS sequence. Also add as a sub-token of the sequence name.
1416        */
1417       // default to "CDS" if we can't locate an actual gene id
1418       String cdsAccId = FeatureProperties
1419               .getCodingFeature(DBRefSource.EMBL);
1420       DBRefEntry[] cdsRefs = DBRefUtils.selectRefs(seqMapping.getTo()
1421               .getDBRef(), DBRefSource.CODINGDBS);
1422       if (cdsRefs != null)
1423       {
1424         for (DBRefEntry cdsRef : cdsRefs)
1425         {
1426           exon.addDBRef(new DBRefEntry(cdsRef));
1427           cdsAccId = cdsRef.getAccessionId();
1428         }
1429       }
1430       exon.setName(exon.getName() + "|" + cdsAccId);
1431       exon.createDatasetSequence();
1432
1433       /*
1434        * Build new mappings - from the same protein regions, but now to
1435        * contiguous exons
1436        */
1437       List<int[]> exonRange = new ArrayList<int[]>();
1438       exonRange.add(new int[] { 1, newSequence.length() });
1439       MapList map = new MapList(exonRange, seqMapping.getMap()
1440               .getToRanges(), 3, 1);
1441       newMapping.addMap(exon.getDatasetSequence(), seqMapping.getTo(), map);
1442       MapList cdsToDnaMap = new MapList(dnaExonRanges, exonRange, 1, 1);
1443       newMapping.addMap(dnaSeq, exon.getDatasetSequence(), cdsToDnaMap);
1444
1445       exonSequences.add(exon);
1446     }
1447     return exonSequences;
1448   }
1449 }