src/jalview/analysis/AlignmentUtils.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis;
  22
  23 import static jalview.io.gff.GffConstants.CLINICAL_SIGNIFICANCE;
  24
  25 import jalview.datamodel.AlignedCodon;
  26 import jalview.datamodel.AlignedCodonFrame;
  27 import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
  28 import jalview.datamodel.Alignment;
  29 import jalview.datamodel.AlignmentAnnotation;
  30 import jalview.datamodel.AlignmentI;
  31 import jalview.datamodel.DBRefEntry;
  32 import jalview.datamodel.GeneLociI;
  33 import jalview.datamodel.IncompleteCodonException;
  34 import jalview.datamodel.Mapping;
  35 import jalview.datamodel.Sequence;
  36 import jalview.datamodel.SequenceFeature;
  37 import jalview.datamodel.SequenceGroup;
  38 import jalview.datamodel.SequenceI;
  39 import jalview.datamodel.features.SequenceFeatures;
  40 import jalview.io.gff.Gff3Helper;
  41 import jalview.io.gff.SequenceOntologyI;
  42 import jalview.schemes.ResidueProperties;
  43 import jalview.util.Comparison;
  44 import jalview.util.DBRefUtils;
  45 import jalview.util.IntRangeComparator;
  46 import jalview.util.MapList;
  47 import jalview.util.MappingUtils;
  48 import jalview.util.StringUtils;
  49
  50 import java.io.UnsupportedEncodingException;
  51 import java.net.URLEncoder;
  52 import java.util.ArrayList;
  53 import java.util.Arrays;
  54 import java.util.Collection;
  55 import java.util.Collections;
  56 import java.util.HashMap;
  57 import java.util.HashSet;
  58 import java.util.Iterator;
  59 import java.util.LinkedHashMap;
  60 import java.util.List;
  61 import java.util.Map;
  62 import java.util.Map.Entry;
  63 import java.util.NoSuchElementException;
  64 import java.util.Set;
  65 import java.util.SortedMap;
  66 import java.util.TreeMap;
  67
  68 /**
  69  * grab bag of useful alignment manipulation operations Expect these to be
  70  * refactored elsewhere at some point.
  71  *
  72  * @author jimp
  73  *
  74  */
  75 public class AlignmentUtils
  76 {
  77   private static final int CODON_LENGTH = 3;
  78
  79   private static final String SEQUENCE_VARIANT = "sequence_variant:";
  80
  81   /*
  82    * the 'id' attribute is provided for variant features fetched from
  83    * Ensembl using its REST service with JSON format
  84    */
  85   public static final String VARIANT_ID = "id";
  86
  87   /**
  88    * A data model to hold the 'normal' base value at a position, and an optional
  89    * sequence variant feature
  90    */
  91   static final class DnaVariant
  92   {
  93     final String base;
  94
  95     SequenceFeature variant;
  96
  97     DnaVariant(String nuc)
  98     {
  99       base = nuc;
 100       variant = null;
 101     }
 102
 103     DnaVariant(String nuc, SequenceFeature var)
 104     {
 105       base = nuc;
 106       variant = var;
 107     }
 108
 109     public String getSource()
 110     {
 111       return variant == null ? null : variant.getFeatureGroup();
 112     }
 113
 114     /**
 115      * toString for aid in the debugger only
 116      */
 117     @Override
 118     public String toString()
 119     {
 120       return base + ":" + (variant == null ? "" : variant.getDescription());
 121     }
 122   }
 123
 124   /**
 125    * given an existing alignment, create a new alignment including all, or up to
 126    * flankSize additional symbols from each sequence's dataset sequence
 127    *
 128    * @param core
 129    * @param flankSize
 130    * @return AlignmentI
 131    */
 132   public static AlignmentI expandContext(AlignmentI core, int flankSize)
 133   {
 134     List<SequenceI> sq = new ArrayList<>();
 135     int maxoffset = 0;
 136     for (SequenceI s : core.getSequences())
 137     {
 138       SequenceI newSeq = s.deriveSequence();
 139       final int newSeqStart = newSeq.getStart() - 1;
 140       if (newSeqStart > maxoffset
 141               && newSeq.getDatasetSequence().getStart() < s.getStart())
 142       {
 143         maxoffset = newSeqStart;
 144       }
 145       sq.add(newSeq);
 146     }
 147     if (flankSize > -1)
 148     {
 149       maxoffset = Math.min(maxoffset, flankSize);
 150     }
 151
 152     /*
 153      * now add offset left and right to create an expanded alignment
 154      */
 155     for (SequenceI s : sq)
 156     {
 157       SequenceI ds = s;
 158       while (ds.getDatasetSequence() != null)
 159       {
 160         ds = ds.getDatasetSequence();
 161       }
 162       int s_end = s.findPosition(s.getStart() + s.getLength());
 163       // find available flanking residues for sequence
 164       int ustream_ds = s.getStart() - ds.getStart();
 165       int dstream_ds = ds.getEnd() - s_end;
 166
 167       // build new flanked sequence
 168
 169       // compute gap padding to start of flanking sequence
 170       int offset = maxoffset - ustream_ds;
 171
 172       // padding is gapChar x ( maxoffset - min(ustream_ds, flank)
 173       if (flankSize >= 0)
 174       {
 175         if (flankSize < ustream_ds)
 176         {
 177           // take up to flankSize residues
 178           offset = maxoffset - flankSize;
 179           ustream_ds = flankSize;
 180         }
 181         if (flankSize <= dstream_ds)
 182         {
 183           dstream_ds = flankSize - 1;
 184         }
 185       }
 186       // TODO use Character.toLowerCase to avoid creating String objects?
 187       char[] upstream = new String(ds
 188               .getSequence(s.getStart() - 1 - ustream_ds, s.getStart() - 1))
 189                       .toLowerCase().toCharArray();
 190       char[] downstream = new String(
 191               ds.getSequence(s_end - 1, s_end + dstream_ds)).toLowerCase()
 192                       .toCharArray();
 193       char[] coreseq = s.getSequence();
 194       char[] nseq = new char[offset + upstream.length + downstream.length
 195               + coreseq.length];
 196       char c = core.getGapCharacter();
 197
 198       int p = 0;
 199       for (; p < offset; p++)
 200       {
 201         nseq[p] = c;
 202       }
 203
 204       System.arraycopy(upstream, 0, nseq, p, upstream.length);
 205       System.arraycopy(coreseq, 0, nseq, p + upstream.length,
 206               coreseq.length);
 207       System.arraycopy(downstream, 0, nseq,
 208               p + coreseq.length + upstream.length, downstream.length);
 209       s.setSequence(new String(nseq));
 210       s.setStart(s.getStart() - ustream_ds);
 211       s.setEnd(s_end + downstream.length);
 212     }
 213     AlignmentI newAl = new jalview.datamodel.Alignment(
 214             sq.toArray(new SequenceI[0]));
 215     for (SequenceI s : sq)
 216     {
 217       if (s.getAnnotation() != null)
 218       {
 219         for (AlignmentAnnotation aa : s.getAnnotation())
 220         {
 221           aa.adjustForAlignment(); // JAL-1712 fix
 222           newAl.addAnnotation(aa);
 223         }
 224       }
 225     }
 226     newAl.setDataset(core.getDataset());
 227     return newAl;
 228   }
 229
 230   /**
 231    * Returns a map of lists of sequences in the alignment, keyed by sequence
 232    * name. For use in mapping between different alignment views of the same
 233    * sequences.
 234    *
 235    * @see jalview.datamodel.AlignmentI#getSequencesByName()
 236    */
 237   public static Map<String, List<SequenceI>> getSequencesByName(
 238           AlignmentI al)
 239   {
 240     Map<String, List<SequenceI>> theMap = new LinkedHashMap<>();
 241     for (SequenceI seq : al.getSequences())
 242     {
 243       String name = seq.getName();
 244       if (name != null)
 245       {
 246         List<SequenceI> seqs = theMap.get(name);
 247         if (seqs == null)
 248         {
 249           seqs = new ArrayList<>();
 250           theMap.put(name, seqs);
 251         }
 252         seqs.add(seq);
 253       }
 254     }
 255     return theMap;
 256   }
 257
 258   /**
 259    * Build mapping of protein to cDNA alignment. Mappings are made between
 260    * sequences where the cDNA translates to the protein sequence. Any new
 261    * mappings are added to the protein alignment. Returns true if any mappings
 262    * either already exist or were added, else false.
 263    *
 264    * @param proteinAlignment
 265    * @param cdnaAlignment
 266    * @return
 267    */
 268   public static boolean mapProteinAlignmentToCdna(
 269           final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment)
 270   {
 271     if (proteinAlignment == null || cdnaAlignment == null)
 272     {
 273       return false;
 274     }
 275
 276     Set<SequenceI> mappedDna = new HashSet<>();
 277     Set<SequenceI> mappedProtein = new HashSet<>();
 278
 279     /*
 280      * First pass - map sequences where cross-references exist. This include
 281      * 1-to-many mappings to support, for example, variant cDNA.
 282      */
 283     boolean mappingPerformed = mapProteinToCdna(proteinAlignment,
 284             cdnaAlignment, mappedDna, mappedProtein, true);
 285
 286     /*
 287      * Second pass - map sequences where no cross-references exist. This only
 288      * does 1-to-1 mappings and assumes corresponding sequences are in the same
 289      * order in the alignments.
 290      */
 291     mappingPerformed |= mapProteinToCdna(proteinAlignment, cdnaAlignment,
 292             mappedDna, mappedProtein, false);
 293     return mappingPerformed;
 294   }
 295
 296   /**
 297    * Make mappings between compatible sequences (where the cDNA translation
 298    * matches the protein).
 299    *
 300    * @param proteinAlignment
 301    * @param cdnaAlignment
 302    * @param mappedDna
 303    *          a set of mapped DNA sequences (to add to)
 304    * @param mappedProtein
 305    *          a set of mapped Protein sequences (to add to)
 306    * @param xrefsOnly
 307    *          if true, only map sequences where xrefs exist
 308    * @return
 309    */
 310   protected static boolean mapProteinToCdna(
 311           final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment,
 312           Set<SequenceI> mappedDna, Set<SequenceI> mappedProtein,
 313           boolean xrefsOnly)
 314   {
 315     boolean mappingExistsOrAdded = false;
 316     List<SequenceI> thisSeqs = proteinAlignment.getSequences();
 317     for (SequenceI aaSeq : thisSeqs)
 318     {
 319       boolean proteinMapped = false;
 320       AlignedCodonFrame acf = new AlignedCodonFrame();
 321
 322       for (SequenceI cdnaSeq : cdnaAlignment.getSequences())
 323       {
 324         /*
 325          * Always try to map if sequences have xref to each other; this supports
 326          * variant cDNA or alternative splicing for a protein sequence.
 327          *
 328          * If no xrefs, try to map progressively, assuming that alignments have
 329          * mappable sequences in corresponding order. These are not
 330          * many-to-many, as that would risk mixing species with similar cDNA
 331          * sequences.
 332          */
 333         if (xrefsOnly && !AlignmentUtils.haveCrossRef(aaSeq, cdnaSeq))
 334         {
 335           continue;
 336         }
 337
 338         /*
 339          * Don't map non-xrefd sequences more than once each. This heuristic
 340          * allows us to pair up similar sequences in ordered alignments.
 341          */
 342         if (!xrefsOnly && (mappedProtein.contains(aaSeq)
 343                 || mappedDna.contains(cdnaSeq)))
 344         {
 345           continue;
 346         }
 347         if (mappingExists(proteinAlignment.getCodonFrames(),
 348                 aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence()))
 349         {
 350           mappingExistsOrAdded = true;
 351         }
 352         else
 353         {
 354           MapList map = mapCdnaToProtein(aaSeq, cdnaSeq);
 355           if (map != null)
 356           {
 357             acf.addMap(cdnaSeq, aaSeq, map);
 358             mappingExistsOrAdded = true;
 359             proteinMapped = true;
 360             mappedDna.add(cdnaSeq);
 361             mappedProtein.add(aaSeq);
 362           }
 363         }
 364       }
 365       if (proteinMapped)
 366       {
 367         proteinAlignment.addCodonFrame(acf);
 368       }
 369     }
 370     return mappingExistsOrAdded;
 371   }
 372
 373   /**
 374    * Answers true if the mappings include one between the given (dataset)
 375    * sequences.
 376    */
 377   protected static boolean mappingExists(List<AlignedCodonFrame> mappings,
 378           SequenceI aaSeq, SequenceI cdnaSeq)
 379   {
 380     if (mappings != null)
 381     {
 382       for (AlignedCodonFrame acf : mappings)
 383       {
 384         if (cdnaSeq == acf.getDnaForAaSeq(aaSeq))
 385         {
 386           return true;
 387         }
 388       }
 389     }
 390     return false;
 391   }
 392
 393   /**
 394    * Builds a mapping (if possible) of a cDNA to a protein sequence.
 395    * <ul>
 396    * <li>first checks if the cdna translates exactly to the protein
 397    * sequence</li>
 398    * <li>else checks for translation after removing a STOP codon</li>
 399    * <li>else checks for translation after removing a START codon</li>
 400    * <li>if that fails, inspect CDS features on the cDNA sequence</li>
 401    * </ul>
 402    * Returns null if no mapping is determined.
 403    *
 404    * @param proteinSeq
 405    *          the aligned protein sequence
 406    * @param cdnaSeq
 407    *          the aligned cdna sequence
 408    * @return
 409    */
 410   public static MapList mapCdnaToProtein(SequenceI proteinSeq,
 411           SequenceI cdnaSeq)
 412   {
 413     /*
 414      * Here we handle either dataset sequence set (desktop) or absent (applet).
 415      * Use only the char[] form of the sequence to avoid creating possibly large
 416      * String objects.
 417      */
 418     final SequenceI proteinDataset = proteinSeq.getDatasetSequence();
 419     char[] aaSeqChars = proteinDataset != null
 420             ? proteinDataset.getSequence()
 421             : proteinSeq.getSequence();
 422     final SequenceI cdnaDataset = cdnaSeq.getDatasetSequence();
 423     char[] cdnaSeqChars = cdnaDataset != null ? cdnaDataset.getSequence()
 424             : cdnaSeq.getSequence();
 425     if (aaSeqChars == null || cdnaSeqChars == null)
 426     {
 427       return null;
 428     }
 429
 430     /*
 431      * cdnaStart/End, proteinStartEnd are base 1 (for dataset sequence mapping)
 432      */
 433     final int mappedLength = CODON_LENGTH * aaSeqChars.length;
 434     int cdnaLength = cdnaSeqChars.length;
 435     int cdnaStart = cdnaSeq.getStart();
 436     int cdnaEnd = cdnaSeq.getEnd();
 437     final int proteinStart = proteinSeq.getStart();
 438     final int proteinEnd = proteinSeq.getEnd();
 439
 440     /*
 441      * If lengths don't match, try ignoring stop codon (if present)
 442      */
 443     if (cdnaLength != mappedLength && cdnaLength > 2)
 444     {
 445       String lastCodon = String.valueOf(cdnaSeqChars,
 446               cdnaLength - CODON_LENGTH, CODON_LENGTH).toUpperCase();
 447       for (String stop : ResidueProperties.STOP_CODONS)
 448       {
 449         if (lastCodon.equals(stop))
 450         {
 451           cdnaEnd -= CODON_LENGTH;
 452           cdnaLength -= CODON_LENGTH;
 453           break;
 454         }
 455       }
 456     }
 457
 458     /*
 459      * If lengths still don't match, try ignoring start codon.
 460      */
 461     int startOffset = 0;
 462     if (cdnaLength != mappedLength && cdnaLength > 2
 463             && String.valueOf(cdnaSeqChars, 0, CODON_LENGTH).toUpperCase()
 464                     .equals(ResidueProperties.START))
 465     {
 466       startOffset += CODON_LENGTH;
 467       cdnaStart += CODON_LENGTH;
 468       cdnaLength -= CODON_LENGTH;
 469     }
 470
 471     if (translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
 472     {
 473       /*
 474        * protein is translation of dna (+/- start/stop codons)
 475        */
 476       MapList map = new MapList(new int[] { cdnaStart, cdnaEnd },
 477               new int[]
 478               { proteinStart, proteinEnd }, CODON_LENGTH, 1);
 479       return map;
 480     }
 481
 482     /*
 483      * translation failed - try mapping CDS annotated regions of dna
 484      */
 485     return mapCdsToProtein(cdnaSeq, proteinSeq);
 486   }
 487
 488   /**
 489    * Test whether the given cdna sequence, starting at the given offset,
 490    * translates to the given amino acid sequence, using the standard translation
 491    * table. Designed to fail fast i.e. as soon as a mismatch position is found.
 492    *
 493    * @param cdnaSeqChars
 494    * @param cdnaStart
 495    * @param aaSeqChars
 496    * @return
 497    */
 498   protected static boolean translatesAs(char[] cdnaSeqChars, int cdnaStart,
 499           char[] aaSeqChars)
 500   {
 501     if (cdnaSeqChars == null || aaSeqChars == null)
 502     {
 503       return false;
 504     }
 505
 506     int aaPos = 0;
 507     int dnaPos = cdnaStart;
 508     for (; dnaPos < cdnaSeqChars.length - 2
 509             && aaPos < aaSeqChars.length; dnaPos += CODON_LENGTH, aaPos++)
 510     {
 511       String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
 512       final String translated = ResidueProperties.codonTranslate(codon);
 513
 514       /*
 515        * allow * in protein to match untranslatable in dna
 516        */
 517       final char aaRes = aaSeqChars[aaPos];
 518       if ((translated == null || ResidueProperties.STOP.equals(translated))
 519               && aaRes == '*')
 520       {
 521         continue;
 522       }
 523       if (translated == null || !(aaRes == translated.charAt(0)))
 524       {
 525         // debug
 526         // System.out.println(("Mismatch at " + i + "/" + aaResidue + ": "
 527         // + codon + "(" + translated + ") != " + aaRes));
 528         return false;
 529       }
 530     }
 531
 532     /*
 533      * check we matched all of the protein sequence
 534      */
 535     if (aaPos != aaSeqChars.length)
 536     {
 537       return false;
 538     }
 539
 540     /*
 541      * check we matched all of the dna except
 542      * for optional trailing STOP codon
 543      */
 544     if (dnaPos == cdnaSeqChars.length)
 545     {
 546       return true;
 547     }
 548     if (dnaPos == cdnaSeqChars.length - CODON_LENGTH)
 549     {
 550       String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
 551       if (ResidueProperties.STOP
 552               .equals(ResidueProperties.codonTranslate(codon)))
 553       {
 554         return true;
 555       }
 556     }
 557     return false;
 558   }
 559
 560   /**
 561    * Align sequence 'seq' to match the alignment of a mapped sequence. Note this
 562    * currently assumes that we are aligning cDNA to match protein.
 563    *
 564    * @param seq
 565    *          the sequence to be realigned
 566    * @param al
 567    *          the alignment whose sequence alignment is to be 'copied'
 568    * @param gap
 569    *          character string represent a gap in the realigned sequence
 570    * @param preserveUnmappedGaps
 571    * @param preserveMappedGaps
 572    * @return true if the sequence was realigned, false if it could not be
 573    */
 574   public static boolean alignSequenceAs(SequenceI seq, AlignmentI al,
 575           String gap, boolean preserveMappedGaps,
 576           boolean preserveUnmappedGaps)
 577   {
 578     /*
 579      * Get any mappings from the source alignment to the target (dataset)
 580      * sequence.
 581      */
 582     // TODO there may be one AlignedCodonFrame per dataset sequence, or one with
 583     // all mappings. Would it help to constrain this?
 584     List<AlignedCodonFrame> mappings = al.getCodonFrame(seq);
 585     if (mappings == null || mappings.isEmpty())
 586     {
 587       return false;
 588     }
 589
 590     /*
 591      * Locate the aligned source sequence whose dataset sequence is mapped. We
 592      * just take the first match here (as we can't align like more than one
 593      * sequence).
 594      */
 595     SequenceI alignFrom = null;
 596     AlignedCodonFrame mapping = null;
 597     for (AlignedCodonFrame mp : mappings)
 598     {
 599       alignFrom = mp.findAlignedSequence(seq, al);
 600       if (alignFrom != null)
 601       {
 602         mapping = mp;
 603         break;
 604       }
 605     }
 606
 607     if (alignFrom == null)
 608     {
 609       return false;
 610     }
 611     alignSequenceAs(seq, alignFrom, mapping, gap, al.getGapCharacter(),
 612             preserveMappedGaps, preserveUnmappedGaps);
 613     return true;
 614   }
 615
 616   /**
 617    * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to
 618    * match residues and codons. Flags control whether existing gaps in unmapped
 619    * (intron) and mapped (exon) regions are preserved or not. Gaps between
 620    * intron and exon are only retained if both flags are set.
 621    *
 622    * @param alignTo
 623    * @param alignFrom
 624    * @param mapping
 625    * @param myGap
 626    * @param sourceGap
 627    * @param preserveUnmappedGaps
 628    * @param preserveMappedGaps
 629    */
 630   public static void alignSequenceAs(SequenceI alignTo, SequenceI alignFrom,
 631           AlignedCodonFrame mapping, String myGap, char sourceGap,
 632           boolean preserveMappedGaps, boolean preserveUnmappedGaps)
 633   {
 634     // TODO generalise to work for Protein-Protein, dna-dna, dna-protein
 635
 636     // aligned and dataset sequence positions, all base zero
 637     int thisSeqPos = 0;
 638     int sourceDsPos = 0;
 639
 640     int basesWritten = 0;
 641     char myGapChar = myGap.charAt(0);
 642     int ratio = myGap.length();
 643
 644     int fromOffset = alignFrom.getStart() - 1;
 645     int toOffset = alignTo.getStart() - 1;
 646     int sourceGapMappedLength = 0;
 647     boolean inExon = false;
 648     final int toLength = alignTo.getLength();
 649     final int fromLength = alignFrom.getLength();
 650     StringBuilder thisAligned = new StringBuilder(2 * toLength);
 651
 652     /*
 653      * Traverse the 'model' aligned sequence
 654      */
 655     for (int i = 0; i < fromLength; i++)
 656     {
 657       char sourceChar = alignFrom.getCharAt(i);
 658       if (sourceChar == sourceGap)
 659       {
 660         sourceGapMappedLength += ratio;
 661         continue;
 662       }
 663
 664       /*
 665        * Found a non-gap character. Locate its mapped region if any.
 666        */
 667       sourceDsPos++;
 668       // Note mapping positions are base 1, our sequence positions base 0
 669       int[] mappedPos = mapping.getMappedRegion(alignTo, alignFrom,
 670               sourceDsPos + fromOffset);
 671       if (mappedPos == null)
 672       {
 673         /*
 674          * unmapped position; treat like a gap
 675          */
 676         sourceGapMappedLength += ratio;
 677         // System.err.println("Can't align: no codon mapping to residue "
 678         // + sourceDsPos + "(" + sourceChar + ")");
 679         // return;
 680         continue;
 681       }
 682
 683       int mappedCodonStart = mappedPos[0]; // position (1...) of codon start
 684       int mappedCodonEnd = mappedPos[mappedPos.length - 1]; // codon end pos
 685       StringBuilder trailingCopiedGap = new StringBuilder();
 686
 687       /*
 688        * Copy dna sequence up to and including this codon. Optionally, include
 689        * gaps before the codon starts (in introns) and/or after the codon starts
 690        * (in exons).
 691        *
 692        * Note this only works for 'linear' splicing, not reverse or interleaved.
 693        * But then 'align dna as protein' doesn't make much sense otherwise.
 694        */
 695       int intronLength = 0;
 696       while (basesWritten + toOffset < mappedCodonEnd
 697               && thisSeqPos < toLength)
 698       {
 699         final char c = alignTo.getCharAt(thisSeqPos++);
 700         if (c != myGapChar)
 701         {
 702           basesWritten++;
 703           int sourcePosition = basesWritten + toOffset;
 704           if (sourcePosition < mappedCodonStart)
 705           {
 706             /*
 707              * Found an unmapped (intron) base. First add in any preceding gaps
 708              * (if wanted).
 709              */
 710             if (preserveUnmappedGaps && trailingCopiedGap.length() > 0)
 711             {
 712               thisAligned.append(trailingCopiedGap.toString());
 713               intronLength += trailingCopiedGap.length();
 714               trailingCopiedGap = new StringBuilder();
 715             }
 716             intronLength++;
 717             inExon = false;
 718           }
 719           else
 720           {
 721             final boolean startOfCodon = sourcePosition == mappedCodonStart;
 722             int gapsToAdd = calculateGapsToInsert(preserveMappedGaps,
 723                     preserveUnmappedGaps, sourceGapMappedLength, inExon,
 724                     trailingCopiedGap.length(), intronLength, startOfCodon);
 725             for (int k = 0; k < gapsToAdd; k++)
 726             {
 727               thisAligned.append(myGapChar);
 728             }
 729             sourceGapMappedLength = 0;
 730             inExon = true;
 731           }
 732           thisAligned.append(c);
 733           trailingCopiedGap = new StringBuilder();
 734         }
 735         else
 736         {
 737           if (inExon && preserveMappedGaps)
 738           {
 739             trailingCopiedGap.append(myGapChar);
 740           }
 741           else if (!inExon && preserveUnmappedGaps)
 742           {
 743             trailingCopiedGap.append(myGapChar);
 744           }
 745         }
 746       }
 747     }
 748
 749     /*
 750      * At end of model aligned sequence. Copy any remaining target sequence, optionally
 751      * including (intron) gaps.
 752      */
 753     while (thisSeqPos < toLength)
 754     {
 755       final char c = alignTo.getCharAt(thisSeqPos++);
 756       if (c != myGapChar || preserveUnmappedGaps)
 757       {
 758         thisAligned.append(c);
 759       }
 760       sourceGapMappedLength--;
 761     }
 762
 763     /*
 764      * finally add gaps to pad for any trailing source gaps or
 765      * unmapped characters
 766      */
 767     if (preserveUnmappedGaps)
 768     {
 769       while (sourceGapMappedLength > 0)
 770       {
 771         thisAligned.append(myGapChar);
 772         sourceGapMappedLength--;
 773       }
 774     }
 775
 776     /*
 777      * All done aligning, set the aligned sequence.
 778      */
 779     alignTo.setSequence(new String(thisAligned));
 780   }
 781
 782   /**
 783    * Helper method to work out how many gaps to insert when realigning.
 784    *
 785    * @param preserveMappedGaps
 786    * @param preserveUnmappedGaps
 787    * @param sourceGapMappedLength
 788    * @param inExon
 789    * @param trailingCopiedGap
 790    * @param intronLength
 791    * @param startOfCodon
 792    * @return
 793    */
 794   protected static int calculateGapsToInsert(boolean preserveMappedGaps,
 795           boolean preserveUnmappedGaps, int sourceGapMappedLength,
 796           boolean inExon, int trailingGapLength, int intronLength,
 797           final boolean startOfCodon)
 798   {
 799     int gapsToAdd = 0;
 800     if (startOfCodon)
 801     {
 802       /*
 803        * Reached start of codon. Ignore trailing gaps in intron unless we are
 804        * preserving gaps in both exon and intron. Ignore them anyway if the
 805        * protein alignment introduces a gap at least as large as the intronic
 806        * region.
 807        */
 808       if (inExon && !preserveMappedGaps)
 809       {
 810         trailingGapLength = 0;
 811       }
 812       if (!inExon && !(preserveMappedGaps && preserveUnmappedGaps))
 813       {
 814         trailingGapLength = 0;
 815       }
 816       if (inExon)
 817       {
 818         gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
 819       }
 820       else
 821       {
 822         if (intronLength + trailingGapLength <= sourceGapMappedLength)
 823         {
 824           gapsToAdd = sourceGapMappedLength - intronLength;
 825         }
 826         else
 827         {
 828           gapsToAdd = Math.min(
 829                   intronLength + trailingGapLength - sourceGapMappedLength,
 830                   trailingGapLength);
 831         }
 832       }
 833     }
 834     else
 835     {
 836       /*
 837        * second or third base of codon; check for any gaps in dna
 838        */
 839       if (!preserveMappedGaps)
 840       {
 841         trailingGapLength = 0;
 842       }
 843       gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
 844     }
 845     return gapsToAdd;
 846   }
 847
 848   /**
 849    * Realigns the given protein to match the alignment of the dna, using codon
 850    * mappings to translate aligned codon positions to protein residues.
 851    *
 852    * @param protein
 853    *          the alignment whose sequences are realigned by this method
 854    * @param dna
 855    *          the dna alignment whose alignment we are 'copying'
 856    * @return the number of sequences that were realigned
 857    */
 858   public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna)
 859   {
 860     if (protein.isNucleotide() || !dna.isNucleotide())
 861     {
 862       System.err.println("Wrong alignment type in alignProteinAsDna");
 863       return 0;
 864     }
 865     List<SequenceI> unmappedProtein = new ArrayList<>();
 866     Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = buildCodonColumnsMap(
 867             protein, dna, unmappedProtein);
 868     return alignProteinAs(protein, alignedCodons, unmappedProtein);
 869   }
 870
 871   /**
 872    * Realigns the given dna to match the alignment of the protein, using codon
 873    * mappings to translate aligned peptide positions to codons.
 874    *
 875    * Always produces a padded CDS alignment.
 876    *
 877    * @param dna
 878    *          the alignment whose sequences are realigned by this method
 879    * @param protein
 880    *          the protein alignment whose alignment we are 'copying'
 881    * @return the number of sequences that were realigned
 882    */
 883   public static int alignCdsAsProtein(AlignmentI dna, AlignmentI protein)
 884   {
 885     if (protein.isNucleotide() || !dna.isNucleotide())
 886     {
 887       System.err.println("Wrong alignment type in alignProteinAsDna");
 888       return 0;
 889     }
 890     // todo: implement this
 891     List<AlignedCodonFrame> mappings = protein.getCodonFrames();
 892     int alignedCount = 0;
 893     int width = 0; // alignment width for padding CDS
 894     for (SequenceI dnaSeq : dna.getSequences())
 895     {
 896       if (alignCdsSequenceAsProtein(dnaSeq, protein, mappings,
 897               dna.getGapCharacter()))
 898       {
 899         alignedCount++;
 900       }
 901       width = Math.max(dnaSeq.getLength(), width);
 902     }
 903     int oldwidth;
 904     int diff;
 905     for (SequenceI dnaSeq : dna.getSequences())
 906     {
 907       oldwidth = dnaSeq.getLength();
 908       diff = width - oldwidth;
 909       if (diff > 0)
 910       {
 911         dnaSeq.insertCharAt(oldwidth, diff, dna.getGapCharacter());
 912       }
 913     }
 914     return alignedCount;
 915   }
 916
 917   /**
 918    * Helper method to align (if possible) the dna sequence to match the
 919    * alignment of a mapped protein sequence. This is currently limited to
 920    * handling coding sequence only.
 921    *
 922    * @param cdsSeq
 923    * @param protein
 924    * @param mappings
 925    * @param gapChar
 926    * @return
 927    */
 928   static boolean alignCdsSequenceAsProtein(SequenceI cdsSeq,
 929           AlignmentI protein, List<AlignedCodonFrame> mappings,
 930           char gapChar)
 931   {
 932     SequenceI cdsDss = cdsSeq.getDatasetSequence();
 933     if (cdsDss == null)
 934     {
 935       System.err
 936               .println("alignCdsSequenceAsProtein needs aligned sequence!");
 937       return false;
 938     }
 939
 940     List<AlignedCodonFrame> dnaMappings = MappingUtils
 941             .findMappingsForSequence(cdsSeq, mappings);
 942     for (AlignedCodonFrame mapping : dnaMappings)
 943     {
 944       SequenceI peptide = mapping.findAlignedSequence(cdsSeq, protein);
 945       if (peptide != null)
 946       {
 947         final int peptideLength = peptide.getLength();
 948         Mapping map = mapping.getMappingBetween(cdsSeq, peptide);
 949         if (map != null)
 950         {
 951           MapList mapList = map.getMap();
 952           if (map.getTo() == peptide.getDatasetSequence())
 953           {
 954             mapList = mapList.getInverse();
 955           }
 956           final int cdsLength = cdsDss.getLength();
 957           int mappedFromLength = MappingUtils.getLength(mapList
 958                   .getFromRanges());
 959           int mappedToLength = MappingUtils
 960                   .getLength(mapList.getToRanges());
 961           boolean addStopCodon = (cdsLength == mappedFromLength
 962                   * CODON_LENGTH + CODON_LENGTH)
 963                   || (peptide.getDatasetSequence()
 964                           .getLength() == mappedFromLength - 1);
 965           if (cdsLength != mappedToLength && !addStopCodon)
 966           {
 967             System.err.println(String.format(
 968                     "Can't align cds as protein (length mismatch %d/%d): %s",
 969                     cdsLength, mappedToLength, cdsSeq.getName()));
 970           }
 971
 972           /*
 973            * pre-fill the aligned cds sequence with gaps
 974            */
 975           char[] alignedCds = new char[peptideLength * CODON_LENGTH
 976                   + (addStopCodon ? CODON_LENGTH : 0)];
 977           Arrays.fill(alignedCds, gapChar);
 978
 979           /*
 980            * walk over the aligned peptide sequence and insert mapped
 981            * codons for residues in the aligned cds sequence
 982            */
 983           int copiedBases = 0;
 984           int cdsStart = cdsDss.getStart();
 985           int proteinPos = peptide.getStart() - 1;
 986           int cdsCol = 0;
 987
 988           for (int col = 0; col < peptideLength; col++)
 989           {
 990             char residue = peptide.getCharAt(col);
 991
 992             if (Comparison.isGap(residue))
 993             {
 994               cdsCol += CODON_LENGTH;
 995             }
 996             else
 997             {
 998               proteinPos++;
 999               int[] codon = mapList.locateInTo(proteinPos, proteinPos);
1000               if (codon == null)
1001               {
1002                 // e.g. incomplete start codon, X in peptide
1003                 cdsCol += CODON_LENGTH;
1004               }
1005               else
1006               {
1007                 for (int j = codon[0]; j <= codon[1]; j++)
1008                 {
1009                   char mappedBase = cdsDss.getCharAt(j - cdsStart);
1010                   alignedCds[cdsCol++] = mappedBase;
1011                   copiedBases++;
1012                 }
1013               }
1014             }
1015           }
1016
1017           /*
1018            * append stop codon if not mapped from protein,
1019            * closing it up to the end of the mapped sequence
1020            */
1021           if (copiedBases == cdsLength - CODON_LENGTH)
1022           {
1023             for (int i = alignedCds.length - 1; i >= 0; i--)
1024             {
1025               if (!Comparison.isGap(alignedCds[i]))
1026               {
1027                 cdsCol = i + 1; // gap just after end of sequence
1028                 break;
1029               }
1030             }
1031             for (int i = cdsLength - CODON_LENGTH; i < cdsLength; i++)
1032             {
1033               alignedCds[cdsCol++] = cdsDss.getCharAt(i);
1034             }
1035           }
1036           cdsSeq.setSequence(new String(alignedCds));
1037           return true;
1038         }
1039       }
1040     }
1041     return false;
1042   }
1043
1044   /**
1045    * Builds a map whose key is an aligned codon position (3 alignment column
1046    * numbers base 0), and whose value is a map from protein sequence to each
1047    * protein's peptide residue for that codon. The map generates an ordering of
1048    * the codons, and allows us to read off the peptides at each position in
1049    * order to assemble 'aligned' protein sequences.
1050    *
1051    * @param protein
1052    *          the protein alignment
1053    * @param dna
1054    *          the coding dna alignment
1055    * @param unmappedProtein
1056    *          any unmapped proteins are added to this list
1057    * @return
1058    */
1059   protected static Map<AlignedCodon, Map<SequenceI, AlignedCodon>> buildCodonColumnsMap(
1060           AlignmentI protein, AlignmentI dna,
1061           List<SequenceI> unmappedProtein)
1062   {
1063     /*
1064      * maintain a list of any proteins with no mappings - these will be
1065      * rendered 'as is' in the protein alignment as we can't align them
1066      */
1067     unmappedProtein.addAll(protein.getSequences());
1068
1069     List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1070
1071     /*
1072      * Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of
1073      * {dnaSequence, {proteinSequence, codonProduct}} at that position. The
1074      * comparator keeps the codon positions ordered.
1075      */
1076     Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = new TreeMap<>(
1077             new CodonComparator());
1078
1079     for (SequenceI dnaSeq : dna.getSequences())
1080     {
1081       for (AlignedCodonFrame mapping : mappings)
1082       {
1083         SequenceI prot = mapping.findAlignedSequence(dnaSeq, protein);
1084         if (prot != null)
1085         {
1086           Mapping seqMap = mapping.getMappingForSequence(dnaSeq);
1087           addCodonPositions(dnaSeq, prot, protein.getGapCharacter(), seqMap,
1088                   alignedCodons);
1089           unmappedProtein.remove(prot);
1090         }
1091       }
1092     }
1093
1094     /*
1095      * Finally add any unmapped peptide start residues (e.g. for incomplete
1096      * codons) as if at the codon position before the second residue
1097      */
1098     // TODO resolve JAL-2022 so this fudge can be removed
1099     int mappedSequenceCount = protein.getHeight() - unmappedProtein.size();
1100     addUnmappedPeptideStarts(alignedCodons, mappedSequenceCount);
1101
1102     return alignedCodons;
1103   }
1104
1105   /**
1106    * Scans for any protein mapped from position 2 (meaning unmapped start
1107    * position e.g. an incomplete codon), and synthesizes a 'codon' for it at the
1108    * preceding position in the alignment
1109    *
1110    * @param alignedCodons
1111    *          the codon-to-peptide map
1112    * @param mappedSequenceCount
1113    *          the number of distinct sequences in the map
1114    */
1115   protected static void addUnmappedPeptideStarts(
1116           Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1117           int mappedSequenceCount)
1118   {
1119     // TODO delete this ugly hack once JAL-2022 is resolved
1120     // i.e. we can model startPhase > 0 (incomplete start codon)
1121
1122     List<SequenceI> sequencesChecked = new ArrayList<>();
1123     AlignedCodon lastCodon = null;
1124     Map<SequenceI, AlignedCodon> toAdd = new HashMap<>();
1125
1126     for (Entry<AlignedCodon, Map<SequenceI, AlignedCodon>> entry : alignedCodons
1127             .entrySet())
1128     {
1129       for (Entry<SequenceI, AlignedCodon> sequenceCodon : entry.getValue()
1130               .entrySet())
1131       {
1132         SequenceI seq = sequenceCodon.getKey();
1133         if (sequencesChecked.contains(seq))
1134         {
1135           continue;
1136         }
1137         sequencesChecked.add(seq);
1138         AlignedCodon codon = sequenceCodon.getValue();
1139         if (codon.peptideCol > 1)
1140         {
1141           System.err.println(
1142                   "Problem mapping protein with >1 unmapped start positions: "
1143                           + seq.getName());
1144         }
1145         else if (codon.peptideCol == 1)
1146         {
1147           /*
1148            * first position (peptideCol == 0) was unmapped - add it
1149            */
1150           if (lastCodon != null)
1151           {
1152             AlignedCodon firstPeptide = new AlignedCodon(lastCodon.pos1,
1153                     lastCodon.pos2, lastCodon.pos3,
1154                     String.valueOf(seq.getCharAt(0)), 0);
1155             toAdd.put(seq, firstPeptide);
1156           }
1157           else
1158           {
1159             /*
1160              * unmapped residue at start of alignment (no prior column) -
1161              * 'insert' at nominal codon [0, 0, 0]
1162              */
1163             AlignedCodon firstPeptide = new AlignedCodon(0, 0, 0,
1164                     String.valueOf(seq.getCharAt(0)), 0);
1165             toAdd.put(seq, firstPeptide);
1166           }
1167         }
1168         if (sequencesChecked.size() == mappedSequenceCount)
1169         {
1170           // no need to check past first mapped position in all sequences
1171           break;
1172         }
1173       }
1174       lastCodon = entry.getKey();
1175     }
1176
1177     /*
1178      * add any new codons safely after iterating over the map
1179      */
1180     for (Entry<SequenceI, AlignedCodon> startCodon : toAdd.entrySet())
1181     {
1182       addCodonToMap(alignedCodons, startCodon.getValue(),
1183               startCodon.getKey());
1184     }
1185   }
1186
1187   /**
1188    * Update the aligned protein sequences to match the codon alignments given in
1189    * the map.
1190    *
1191    * @param protein
1192    * @param alignedCodons
1193    *          an ordered map of codon positions (columns), with sequence/peptide
1194    *          values present in each column
1195    * @param unmappedProtein
1196    * @return
1197    */
1198   protected static int alignProteinAs(AlignmentI protein,
1199           Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1200           List<SequenceI> unmappedProtein)
1201   {
1202     /*
1203      * prefill peptide sequences with gaps
1204      */
1205     int alignedWidth = alignedCodons.size();
1206     char[] gaps = new char[alignedWidth];
1207     Arrays.fill(gaps, protein.getGapCharacter());
1208     Map<SequenceI, char[]> peptides = new HashMap<>();
1209     for (SequenceI seq : protein.getSequences())
1210     {
1211       if (!unmappedProtein.contains(seq))
1212       {
1213         peptides.put(seq, Arrays.copyOf(gaps, gaps.length));
1214       }
1215     }
1216
1217     /*
1218      * Traverse the codons left to right (as defined by CodonComparator)
1219      * and insert peptides in each column where the sequence is mapped.
1220      * This gives a peptide 'alignment' where residues are aligned if their
1221      * corresponding codons occupy the same columns in the cdna alignment.
1222      */
1223     int column = 0;
1224     for (AlignedCodon codon : alignedCodons.keySet())
1225     {
1226       final Map<SequenceI, AlignedCodon> columnResidues = alignedCodons
1227               .get(codon);
1228       for (Entry<SequenceI, AlignedCodon> entry : columnResidues.entrySet())
1229       {
1230         char residue = entry.getValue().product.charAt(0);
1231         peptides.get(entry.getKey())[column] = residue;
1232       }
1233       column++;
1234     }
1235
1236     /*
1237      * and finally set the constructed sequences
1238      */
1239     for (Entry<SequenceI, char[]> entry : peptides.entrySet())
1240     {
1241       entry.getKey().setSequence(new String(entry.getValue()));
1242     }
1243
1244     return 0;
1245   }
1246
1247   /**
1248    * Populate the map of aligned codons by traversing the given sequence
1249    * mapping, locating the aligned positions of mapped codons, and adding those
1250    * positions and their translation products to the map.
1251    *
1252    * @param dna
1253    *          the aligned sequence we are mapping from
1254    * @param protein
1255    *          the sequence to be aligned to the codons
1256    * @param gapChar
1257    *          the gap character in the dna sequence
1258    * @param seqMap
1259    *          a mapping to a sequence translation
1260    * @param alignedCodons
1261    *          the map we are building up
1262    */
1263   static void addCodonPositions(SequenceI dna, SequenceI protein,
1264           char gapChar, Mapping seqMap,
1265           Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons)
1266   {
1267     Iterator<AlignedCodon> codons = seqMap.getCodonIterator(dna, gapChar);
1268
1269     /*
1270      * add codon positions, and their peptide translations, to the alignment
1271      * map, while remembering the first codon mapped
1272      */
1273     while (codons.hasNext())
1274     {
1275       try
1276       {
1277         AlignedCodon codon = codons.next();
1278         addCodonToMap(alignedCodons, codon, protein);
1279       } catch (IncompleteCodonException e)
1280       {
1281         // possible incomplete trailing codon - ignore
1282       } catch (NoSuchElementException e)
1283       {
1284         // possibly peptide lacking STOP
1285       }
1286     }
1287   }
1288
1289   /**
1290    * Helper method to add a codon-to-peptide entry to the aligned codons map
1291    *
1292    * @param alignedCodons
1293    * @param codon
1294    * @param protein
1295    */
1296   protected static void addCodonToMap(
1297           Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1298           AlignedCodon codon, SequenceI protein)
1299   {
1300     Map<SequenceI, AlignedCodon> seqProduct = alignedCodons.get(codon);
1301     if (seqProduct == null)
1302     {
1303       seqProduct = new HashMap<>();
1304       alignedCodons.put(codon, seqProduct);
1305     }
1306     seqProduct.put(protein, codon);
1307   }
1308
1309   /**
1310    * Returns true if a cDNA/Protein mapping either exists, or could be made,
1311    * between at least one pair of sequences in the two alignments. Currently,
1312    * the logic is:
1313    * <ul>
1314    * <li>One alignment must be nucleotide, and the other protein</li>
1315    * <li>At least one pair of sequences must be already mapped, or mappable</li>
1316    * <li>Mappable means the nucleotide translation matches the protein
1317    * sequence</li>
1318    * <li>The translation may ignore start and stop codons if present in the
1319    * nucleotide</li>
1320    * </ul>
1321    *
1322    * @param al1
1323    * @param al2
1324    * @return
1325    */
1326   public static boolean isMappable(AlignmentI al1, AlignmentI al2)
1327   {
1328     if (al1 == null || al2 == null)
1329     {
1330       return false;
1331     }
1332
1333     /*
1334      * Require one nucleotide and one protein
1335      */
1336     if (al1.isNucleotide() == al2.isNucleotide())
1337     {
1338       return false;
1339     }
1340     AlignmentI dna = al1.isNucleotide() ? al1 : al2;
1341     AlignmentI protein = dna == al1 ? al2 : al1;
1342     List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1343     for (SequenceI dnaSeq : dna.getSequences())
1344     {
1345       for (SequenceI proteinSeq : protein.getSequences())
1346       {
1347         if (isMappable(dnaSeq, proteinSeq, mappings))
1348         {
1349           return true;
1350         }
1351       }
1352     }
1353     return false;
1354   }
1355
1356   /**
1357    * Returns true if the dna sequence is mapped, or could be mapped, to the
1358    * protein sequence.
1359    *
1360    * @param dnaSeq
1361    * @param proteinSeq
1362    * @param mappings
1363    * @return
1364    */
1365   protected static boolean isMappable(SequenceI dnaSeq,
1366           SequenceI proteinSeq, List<AlignedCodonFrame> mappings)
1367   {
1368     if (dnaSeq == null || proteinSeq == null)
1369     {
1370       return false;
1371     }
1372
1373     SequenceI dnaDs = dnaSeq.getDatasetSequence() == null ? dnaSeq
1374             : dnaSeq.getDatasetSequence();
1375     SequenceI proteinDs = proteinSeq.getDatasetSequence() == null
1376             ? proteinSeq
1377             : proteinSeq.getDatasetSequence();
1378
1379     for (AlignedCodonFrame mapping : mappings)
1380     {
1381       if (proteinDs == mapping.getAaForDnaSeq(dnaDs))
1382       {
1383         /*
1384          * already mapped
1385          */
1386         return true;
1387       }
1388     }
1389
1390     /*
1391      * Just try to make a mapping (it is not yet stored), test whether
1392      * successful.
1393      */
1394     return mapCdnaToProtein(proteinDs, dnaDs) != null;
1395   }
1396
1397   /**
1398    * Finds any reference annotations associated with the sequences in
1399    * sequenceScope, that are not already added to the alignment, and adds them
1400    * to the 'candidates' map. Also populates a lookup table of annotation
1401    * labels, keyed by calcId, for use in constructing tooltips or the like.
1402    *
1403    * @param sequenceScope
1404    *          the sequences to scan for reference annotations
1405    * @param labelForCalcId
1406    *          (optional) map to populate with label for calcId
1407    * @param candidates
1408    *          map to populate with annotations for sequence
1409    * @param al
1410    *          the alignment to check for presence of annotations
1411    */
1412   public static void findAddableReferenceAnnotations(
1413           List<SequenceI> sequenceScope, Map<String, String> labelForCalcId,
1414           final Map<SequenceI, List<AlignmentAnnotation>> candidates,
1415           AlignmentI al)
1416   {
1417     if (sequenceScope == null)
1418     {
1419       return;
1420     }
1421
1422     /*
1423      * For each sequence in scope, make a list of any annotations on the
1424      * underlying dataset sequence which are not already on the alignment.
1425      *
1426      * Add to a map of { alignmentSequence, <List of annotations to add> }
1427      */
1428     for (SequenceI seq : sequenceScope)
1429     {
1430       SequenceI dataset = seq.getDatasetSequence();
1431       if (dataset == null)
1432       {
1433         continue;
1434       }
1435       AlignmentAnnotation[] datasetAnnotations = dataset.getAnnotation();
1436       if (datasetAnnotations == null)
1437       {
1438         continue;
1439       }
1440       final List<AlignmentAnnotation> result = new ArrayList<>();
1441       for (AlignmentAnnotation dsann : datasetAnnotations)
1442       {
1443         /*
1444          * Find matching annotations on the alignment. If none is found, then
1445          * add this annotation to the list of 'addable' annotations for this
1446          * sequence.
1447          */
1448         final Iterable<AlignmentAnnotation> matchedAlignmentAnnotations = al
1449                 .findAnnotations(seq, dsann.getCalcId(), dsann.label);
1450         if (!matchedAlignmentAnnotations.iterator().hasNext())
1451         {
1452           result.add(dsann);
1453           if (labelForCalcId != null)
1454           {
1455             labelForCalcId.put(dsann.getCalcId(), dsann.label);
1456           }
1457         }
1458       }
1459       /*
1460        * Save any addable annotations for this sequence
1461        */
1462       if (!result.isEmpty())
1463       {
1464         candidates.put(seq, result);
1465       }
1466     }
1467   }
1468
1469   /**
1470    * Adds annotations to the top of the alignment annotations, in the same order
1471    * as their related sequences.
1472    *
1473    * @param annotations
1474    *          the annotations to add
1475    * @param alignment
1476    *          the alignment to add them to
1477    * @param selectionGroup
1478    *          current selection group (or null if none)
1479    */
1480   public static void addReferenceAnnotations(
1481           Map<SequenceI, List<AlignmentAnnotation>> annotations,
1482           final AlignmentI alignment, final SequenceGroup selectionGroup)
1483   {
1484     for (SequenceI seq : annotations.keySet())
1485     {
1486       for (AlignmentAnnotation ann : annotations.get(seq))
1487       {
1488         AlignmentAnnotation copyAnn = new AlignmentAnnotation(ann);
1489         int startRes = 0;
1490         int endRes = ann.annotations.length;
1491         if (selectionGroup != null)
1492         {
1493           startRes = selectionGroup.getStartRes();
1494           endRes = selectionGroup.getEndRes();
1495         }
1496         copyAnn.restrict(startRes, endRes);
1497
1498         /*
1499          * Add to the sequence (sets copyAnn.datasetSequence), unless the
1500          * original annotation is already on the sequence.
1501          */
1502         if (!seq.hasAnnotation(ann))
1503         {
1504           seq.addAlignmentAnnotation(copyAnn);
1505         }
1506         // adjust for gaps
1507         copyAnn.adjustForAlignment();
1508         // add to the alignment and set visible
1509         alignment.addAnnotation(copyAnn);
1510         copyAnn.visible = true;
1511       }
1512     }
1513   }
1514
1515   /**
1516    * Set visibility of alignment annotations of specified types (labels), for
1517    * specified sequences. This supports controls like "Show all secondary
1518    * structure", "Hide all Temp factor", etc.
1519    *
1520    * @al the alignment to scan for annotations
1521    * @param types
1522    *          the types (labels) of annotations to be updated
1523    * @param forSequences
1524    *          if not null, only annotations linked to one of these sequences are
1525    *          in scope for update; if null, acts on all sequence annotations
1526    * @param anyType
1527    *          if this flag is true, 'types' is ignored (label not checked)
1528    * @param doShow
1529    *          if true, set visibility on, else set off
1530    */
1531   public static void showOrHideSequenceAnnotations(AlignmentI al,
1532           Collection<String> types, List<SequenceI> forSequences,
1533           boolean anyType, boolean doShow)
1534   {
1535     AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1536     if (anns != null)
1537     {
1538       for (AlignmentAnnotation aa : anns)
1539       {
1540         if (anyType || types.contains(aa.label))
1541         {
1542           if ((aa.sequenceRef != null) && (forSequences == null
1543                   || forSequences.contains(aa.sequenceRef)))
1544           {
1545             aa.visible = doShow;
1546           }
1547         }
1548       }
1549     }
1550   }
1551
1552   /**
1553    * Returns true if either sequence has a cross-reference to the other
1554    *
1555    * @param seq1
1556    * @param seq2
1557    * @return
1558    */
1559   public static boolean haveCrossRef(SequenceI seq1, SequenceI seq2)
1560   {
1561     // Note: moved here from class CrossRef as the latter class has dependencies
1562     // not availability to the applet's classpath
1563     return hasCrossRef(seq1, seq2) || hasCrossRef(seq2, seq1);
1564   }
1565
1566   /**
1567    * Returns true if seq1 has a cross-reference to seq2. Currently this assumes
1568    * that sequence name is structured as Source|AccessionId.
1569    *
1570    * @param seq1
1571    * @param seq2
1572    * @return
1573    */
1574   public static boolean hasCrossRef(SequenceI seq1, SequenceI seq2)
1575   {
1576     if (seq1 == null || seq2 == null)
1577     {
1578       return false;
1579     }
1580     String name = seq2.getName();
1581     final DBRefEntry[] xrefs = seq1.getDBRefs();
1582     if (xrefs != null)
1583     {
1584       for (DBRefEntry xref : xrefs)
1585       {
1586         String xrefName = xref.getSource() + "|" + xref.getAccessionId();
1587         // case-insensitive test, consistent with DBRefEntry.equalRef()
1588         if (xrefName.equalsIgnoreCase(name))
1589         {
1590           return true;
1591         }
1592       }
1593     }
1594     return false;
1595   }
1596
1597   /**
1598    * Constructs an alignment consisting of the mapped (CDS) regions in the given
1599    * nucleotide sequences, and updates mappings to match. The CDS sequences are
1600    * added to the original alignment's dataset, which is shared by the new
1601    * alignment. Mappings from nucleotide to CDS, and from CDS to protein, are
1602    * added to the alignment dataset.
1603    *
1604    * @param dna
1605    *          aligned nucleotide (dna or cds) sequences
1606    * @param dataset
1607    *          the alignment dataset the sequences belong to
1608    * @param products
1609    *          (optional) to restrict results to CDS that map to specified
1610    *          protein products
1611    * @return an alignment whose sequences are the cds-only parts of the dna
1612    *         sequences (or null if no mappings are found)
1613    */
1614   public static AlignmentI makeCdsAlignment(SequenceI[] dna,
1615           AlignmentI dataset, SequenceI[] products)
1616   {
1617     if (dataset == null || dataset.getDataset() != null)
1618     {
1619       throw new IllegalArgumentException(
1620               "IMPLEMENTATION ERROR: dataset.getDataset() must be null!");
1621     }
1622     List<SequenceI> foundSeqs = new ArrayList<>();
1623     List<SequenceI> cdsSeqs = new ArrayList<>();
1624     List<AlignedCodonFrame> mappings = dataset.getCodonFrames();
1625     HashSet<SequenceI> productSeqs = null;
1626     if (products != null)
1627     {
1628       productSeqs = new HashSet<>();
1629       for (SequenceI seq : products)
1630       {
1631         productSeqs.add(seq.getDatasetSequence() == null ? seq : seq
1632                 .getDatasetSequence());
1633       }
1634     }
1635
1636     /*
1637      * Construct CDS sequences from mappings on the alignment dataset.
1638      * The logic is:
1639      * - find the protein product(s) mapped to from each dna sequence
1640      * - if the mapping covers the whole dna sequence (give or take start/stop
1641      *   codon), take the dna as the CDS sequence
1642      * - else search dataset mappings for a suitable dna sequence, i.e. one
1643      *   whose whole sequence is mapped to the protein
1644      * - if no sequence found, construct one from the dna sequence and mapping
1645      *   (and add it to dataset so it is found if this is repeated)
1646      */
1647     for (SequenceI dnaSeq : dna)
1648     {
1649       SequenceI dnaDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
1650               : dnaSeq.getDatasetSequence();
1651
1652       List<AlignedCodonFrame> seqMappings = MappingUtils
1653               .findMappingsForSequence(dnaSeq, mappings);
1654       for (AlignedCodonFrame mapping : seqMappings)
1655       {
1656         List<Mapping> mappingsFromSequence = mapping
1657                 .getMappingsFromSequence(dnaSeq);
1658
1659         for (Mapping aMapping : mappingsFromSequence)
1660         {
1661           MapList mapList = aMapping.getMap();
1662           if (mapList.getFromRatio() == 1)
1663           {
1664             /*
1665              * not a dna-to-protein mapping (likely dna-to-cds)
1666              */
1667             continue;
1668           }
1669
1670           /*
1671            * skip if mapping is not to one of the target set of proteins
1672            */
1673           SequenceI proteinProduct = aMapping.getTo();
1674           if (productSeqs != null && !productSeqs.contains(proteinProduct))
1675           {
1676             continue;
1677           }
1678
1679           /*
1680            * try to locate the CDS from the dataset mappings;
1681            * guard against duplicate results (for the case that protein has
1682            * dbrefs to both dna and cds sequences)
1683            */
1684           SequenceI cdsSeq = findCdsForProtein(mappings, dnaSeq,
1685                   seqMappings, aMapping);
1686           if (cdsSeq != null)
1687           {
1688             if (!foundSeqs.contains(cdsSeq))
1689             {
1690               foundSeqs.add(cdsSeq);
1691               SequenceI derivedSequence = cdsSeq.deriveSequence();
1692               cdsSeqs.add(derivedSequence);
1693               if (!dataset.getSequences().contains(cdsSeq))
1694               {
1695                 dataset.addSequence(cdsSeq);
1696               }
1697             }
1698             continue;
1699           }
1700
1701           /*
1702            * didn't find mapped CDS sequence - construct it and add
1703            * its dataset sequence to the dataset
1704            */
1705           cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping,
1706                   dataset).deriveSequence();
1707           // cdsSeq has a name constructed as CDS|<dbref>
1708           // <dbref> will be either the accession for the coding sequence,
1709           // marked in the /via/ dbref to the protein product accession
1710           // or it will be the original nucleotide accession.
1711           SequenceI cdsSeqDss = cdsSeq.getDatasetSequence();
1712
1713           cdsSeqs.add(cdsSeq);
1714
1715           if (!dataset.getSequences().contains(cdsSeqDss))
1716           {
1717             // check if this sequence is a newly created one
1718             // so needs adding to the dataset
1719             dataset.addSequence(cdsSeqDss);
1720           }
1721
1722           /*
1723            * add a mapping from CDS to the (unchanged) mapped to range
1724            */
1725           List<int[]> cdsRange = Collections.singletonList(new int[] { 1,
1726               cdsSeq.getLength() });
1727           MapList cdsToProteinMap = new MapList(cdsRange,
1728                   mapList.getToRanges(), mapList.getFromRatio(),
1729                   mapList.getToRatio());
1730           AlignedCodonFrame cdsToProteinMapping = new AlignedCodonFrame();
1731           cdsToProteinMapping.addMap(cdsSeqDss, proteinProduct,
1732                   cdsToProteinMap);
1733
1734           /*
1735            * guard against duplicating the mapping if repeating this action
1736            */
1737           if (!mappings.contains(cdsToProteinMapping))
1738           {
1739             mappings.add(cdsToProteinMapping);
1740           }
1741
1742           propagateDBRefsToCDS(cdsSeqDss, dnaSeq.getDatasetSequence(),
1743                   proteinProduct, aMapping);
1744           /*
1745            * add another mapping from original 'from' range to CDS
1746            */
1747           AlignedCodonFrame dnaToCdsMapping = new AlignedCodonFrame();
1748           final MapList dnaToCdsMap = new MapList(mapList.getFromRanges(),
1749                   cdsRange, 1, 1);
1750           dnaToCdsMapping.addMap(dnaSeq.getDatasetSequence(), cdsSeqDss,
1751                   dnaToCdsMap);
1752           if (!mappings.contains(dnaToCdsMapping))
1753           {
1754             mappings.add(dnaToCdsMapping);
1755           }
1756
1757           /*
1758            * transfer dna chromosomal loci (if known) to the CDS
1759            * sequence (via the mapping)
1760            */
1761           final MapList cdsToDnaMap = dnaToCdsMap.getInverse();
1762           transferGeneLoci(dnaSeq, cdsToDnaMap, cdsSeq);
1763
1764           /*
1765            * add DBRef with mapping from protein to CDS
1766            * (this enables Get Cross-References from protein alignment)
1767            * This is tricky because we can't have two DBRefs with the
1768            * same source and accession, so need a different accession for
1769            * the CDS from the dna sequence
1770            */
1771
1772           // specific use case:
1773           // Genomic contig ENSCHR:1, contains coding regions for ENSG01,
1774           // ENSG02, ENSG03, with transcripts and products similarly named.
1775           // cannot add distinct dbrefs mapping location on ENSCHR:1 to ENSG01
1776
1777           // JBPNote: ?? can't actually create an example that demonstrates we
1778           // need to
1779           // synthesize an xref.
1780
1781           for (DBRefEntry primRef : dnaDss.getPrimaryDBRefs())
1782           {
1783             /*
1784              * create a cross-reference from CDS to the source sequence's
1785              * primary reference and vice versa
1786              */
1787             String source = primRef.getSource();
1788             String version = primRef.getVersion();
1789             DBRefEntry cdsCrossRef = new DBRefEntry(source, source + ":"
1790                     + version, primRef.getAccessionId());
1791             cdsCrossRef.setMap(new Mapping(dnaDss, new MapList(cdsToDnaMap)));
1792             cdsSeqDss.addDBRef(cdsCrossRef);
1793
1794             dnaSeq.addDBRef(new DBRefEntry(source, version, cdsSeq
1795                     .getName(), new Mapping(cdsSeqDss, dnaToCdsMap)));
1796
1797             // problem here is that the cross-reference is synthesized -
1798             // cdsSeq.getName() may be like 'CDS|dnaaccession' or
1799             // 'CDS|emblcdsacc'
1800             // assuming cds version same as dna ?!?
1801
1802             DBRefEntry proteinToCdsRef = new DBRefEntry(source, version,
1803                     cdsSeq.getName());
1804             //
1805             proteinToCdsRef.setMap(new Mapping(cdsSeqDss, cdsToProteinMap
1806                     .getInverse()));
1807             proteinProduct.addDBRef(proteinToCdsRef);
1808           }
1809
1810           /*
1811            * transfer any features on dna that overlap the CDS
1812            */
1813           transferFeatures(dnaSeq, cdsSeq, dnaToCdsMap, null,
1814                   SequenceOntologyI.CDS);
1815         }
1816       }
1817     }
1818
1819     AlignmentI cds = new Alignment(cdsSeqs.toArray(new SequenceI[cdsSeqs
1820             .size()]));
1821     cds.setDataset(dataset);
1822
1823     return cds;
1824   }
1825
1826   /**
1827    * Tries to transfer gene loci (dbref to chromosome positions) from fromSeq to
1828    * toSeq, mediated by the given mapping between the sequences
1829    *
1830    * @param fromSeq
1831    * @param targetToFrom
1832    *          Map
1833    * @param targetSeq
1834    */
1835   protected static void transferGeneLoci(SequenceI fromSeq,
1836           MapList targetToFrom, SequenceI targetSeq)
1837   {
1838     if (targetSeq.getGeneLoci() != null)
1839     {
1840       // already have - don't override
1841       return;
1842     }
1843     GeneLociI fromLoci = fromSeq.getGeneLoci();
1844     if (fromLoci == null)
1845     {
1846       return;
1847     }
1848
1849     MapList newMap = targetToFrom.traverse(fromLoci.getMap());
1850
1851     if (newMap != null)
1852     {
1853       targetSeq.setGeneLoci(fromLoci.getSpeciesId(),
1854               fromLoci.getAssemblyId(), fromLoci.getChromosomeId(), newMap);
1855     }
1856   }
1857
1858   /**
1859    * A helper method that finds a CDS sequence in the alignment dataset that is
1860    * mapped to the given protein sequence, and either is, or has a mapping from,
1861    * the given dna sequence.
1862    *
1863    * @param mappings
1864    *          set of all mappings on the dataset
1865    * @param dnaSeq
1866    *          a dna (or cds) sequence we are searching from
1867    * @param seqMappings
1868    *          the set of mappings involving dnaSeq
1869    * @param aMapping
1870    *          a transcript-to-peptide mapping
1871    * @return
1872    */
1873   static SequenceI findCdsForProtein(List<AlignedCodonFrame> mappings,
1874           SequenceI dnaSeq, List<AlignedCodonFrame> seqMappings,
1875           Mapping aMapping)
1876   {
1877     /*
1878      * TODO a better dna-cds-protein mapping data representation to allow easy
1879      * navigation; until then this clunky looping around lists of mappings
1880      */
1881     SequenceI seqDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
1882             : dnaSeq.getDatasetSequence();
1883     SequenceI proteinProduct = aMapping.getTo();
1884
1885     /*
1886      * is this mapping from the whole dna sequence (i.e. CDS)?
1887      * allowing for possible stop codon on dna but not peptide
1888      */
1889     int mappedFromLength = MappingUtils
1890             .getLength(aMapping.getMap().getFromRanges());
1891     int dnaLength = seqDss.getLength();
1892     if (mappedFromLength == dnaLength
1893             || mappedFromLength == dnaLength - CODON_LENGTH)
1894     {
1895       /*
1896        * if sequence has CDS features, this is a transcript with no UTR
1897        * - do not take this as the CDS sequence! (JAL-2789)
1898        */
1899       if (seqDss.getFeatures().getFeaturesByOntology(SequenceOntologyI.CDS)
1900               .isEmpty())
1901       {
1902         return seqDss;
1903       }
1904     }
1905
1906     /*
1907      * looks like we found the dna-to-protein mapping; search for the
1908      * corresponding cds-to-protein mapping
1909      */
1910     List<AlignedCodonFrame> mappingsToPeptide = MappingUtils
1911             .findMappingsForSequence(proteinProduct, mappings);
1912     for (AlignedCodonFrame acf : mappingsToPeptide)
1913     {
1914       for (SequenceToSequenceMapping map : acf.getMappings())
1915       {
1916         Mapping mapping = map.getMapping();
1917         if (mapping != aMapping
1918                 && mapping.getMap().getFromRatio() == CODON_LENGTH
1919                 && proteinProduct == mapping.getTo()
1920                 && seqDss != map.getFromSeq())
1921         {
1922           mappedFromLength = MappingUtils
1923                   .getLength(mapping.getMap().getFromRanges());
1924           if (mappedFromLength == map.getFromSeq().getLength())
1925           {
1926             /*
1927             * found a 3:1 mapping to the protein product which covers
1928             * the whole dna sequence i.e. is from CDS; finally check the CDS
1929             * is mapped from the given dna start sequence
1930             */
1931             SequenceI cdsSeq = map.getFromSeq();
1932             // todo this test is weak if seqMappings contains multiple mappings;
1933             // we get away with it if transcript:cds relationship is 1:1
1934             List<AlignedCodonFrame> dnaToCdsMaps = MappingUtils
1935                     .findMappingsForSequence(cdsSeq, seqMappings);
1936             if (!dnaToCdsMaps.isEmpty())
1937             {
1938               return cdsSeq;
1939             }
1940           }
1941         }
1942       }
1943     }
1944     return null;
1945   }
1946
1947   /**
1948    * Helper method that makes a CDS sequence as defined by the mappings from the
1949    * given sequence i.e. extracts the 'mapped from' ranges (which may be on
1950    * forward or reverse strand).
1951    *
1952    * @param seq
1953    * @param mapping
1954    * @param dataset
1955    *          - existing dataset. We check for sequences that look like the CDS
1956    *          we are about to construct, if one exists already, then we will
1957    *          just return that one.
1958    * @return CDS sequence (as a dataset sequence)
1959    */
1960   static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping,
1961           AlignmentI dataset)
1962   {
1963     char[] seqChars = seq.getSequence();
1964     List<int[]> fromRanges = mapping.getMap().getFromRanges();
1965     int cdsWidth = MappingUtils.getLength(fromRanges);
1966     char[] newSeqChars = new char[cdsWidth];
1967
1968     int newPos = 0;
1969     for (int[] range : fromRanges)
1970     {
1971       if (range[0] <= range[1])
1972       {
1973         // forward strand mapping - just copy the range
1974         int length = range[1] - range[0] + 1;
1975         System.arraycopy(seqChars, range[0] - 1, newSeqChars, newPos,
1976                 length);
1977         newPos += length;
1978       }
1979       else
1980       {
1981         // reverse strand mapping - copy and complement one by one
1982         for (int i = range[0]; i >= range[1]; i--)
1983         {
1984           newSeqChars[newPos++] = Dna.getComplement(seqChars[i - 1]);
1985         }
1986       }
1987     }
1988
1989     /*
1990      * assign 'from id' held in the mapping if set (e.g. EMBL protein_id),
1991      * else generate a sequence name
1992      */
1993     String mapFromId = mapping.getMappedFromId();
1994     String seqId = "CDS|" + (mapFromId != null ? mapFromId : seq.getName());
1995     SequenceI newSeq = new Sequence(seqId, newSeqChars, 1, newPos);
1996     if (dataset != null)
1997     {
1998       SequenceI[] matches = dataset.findSequenceMatch(newSeq.getName());
1999       if (matches != null)
2000       {
2001         boolean matched = false;
2002         for (SequenceI mtch : matches)
2003         {
2004           if (mtch.getStart() != newSeq.getStart())
2005           {
2006             continue;
2007           }
2008           if (mtch.getEnd() != newSeq.getEnd())
2009           {
2010             continue;
2011           }
2012           if (!Arrays.equals(mtch.getSequence(), newSeq.getSequence()))
2013           {
2014             continue;
2015           }
2016           if (!matched)
2017           {
2018             matched = true;
2019             newSeq = mtch;
2020           }
2021           else
2022           {
2023             System.err.println(
2024                     "JAL-2154 regression: warning - found (and ignnored a duplicate CDS sequence):"
2025                             + mtch.toString());
2026           }
2027         }
2028       }
2029     }
2030     // newSeq.setDescription(mapFromId);
2031
2032     return newSeq;
2033   }
2034
2035   /**
2036    * Adds any DBRefEntrys to cdsSeq from contig that have a Mapping congruent to
2037    * the given mapping.
2038    *
2039    * @param cdsSeq
2040    * @param contig
2041    * @param proteinProduct
2042    * @param mapping
2043    * @return list of DBRefEntrys added
2044    */
2045   protected static List<DBRefEntry> propagateDBRefsToCDS(SequenceI cdsSeq,
2046           SequenceI contig, SequenceI proteinProduct, Mapping mapping)
2047   {
2048
2049     // gather direct refs from contig congruent with mapping
2050     List<DBRefEntry> direct = new ArrayList<>();
2051     HashSet<String> directSources = new HashSet<>();
2052
2053     if (contig.getDBRefs() != null)
2054     {
2055       for (DBRefEntry dbr : contig.getDBRefs())
2056       {
2057         if (dbr.hasMap() && dbr.getMap().getMap().isTripletMap())
2058         {
2059           MapList map = dbr.getMap().getMap();
2060           // check if map is the CDS mapping
2061           if (mapping.getMap().equals(map))
2062           {
2063             direct.add(dbr);
2064             directSources.add(dbr.getSource());
2065           }
2066         }
2067       }
2068     }
2069     DBRefEntry[] onSource = DBRefUtils.selectRefs(
2070             proteinProduct.getDBRefs(),
2071             directSources.toArray(new String[0]));
2072     List<DBRefEntry> propagated = new ArrayList<>();
2073
2074     // and generate appropriate mappings
2075     for (DBRefEntry cdsref : direct)
2076     {
2077       // clone maplist and mapping
2078       MapList cdsposmap = new MapList(
2079               Arrays.asList(new int[][]
2080               { new int[] { cdsSeq.getStart(), cdsSeq.getEnd() } }),
2081               cdsref.getMap().getMap().getToRanges(), 3, 1);
2082       Mapping cdsmap = new Mapping(cdsref.getMap().getTo(),
2083               cdsref.getMap().getMap());
2084
2085       // create dbref
2086       DBRefEntry newref = new DBRefEntry(cdsref.getSource(),
2087               cdsref.getVersion(), cdsref.getAccessionId(),
2088               new Mapping(cdsmap.getTo(), cdsposmap));
2089
2090       // and see if we can map to the protein product for this mapping.
2091       // onSource is the filtered set of accessions on protein that we are
2092       // tranferring, so we assume accession is the same.
2093       if (cdsmap.getTo() == null && onSource != null)
2094       {
2095         List<DBRefEntry> sourceRefs = DBRefUtils.searchRefs(onSource,
2096                 cdsref.getAccessionId());
2097         if (sourceRefs != null)
2098         {
2099           for (DBRefEntry srcref : sourceRefs)
2100           {
2101             if (srcref.getSource().equalsIgnoreCase(cdsref.getSource()))
2102             {
2103               // we have found a complementary dbref on the protein product, so
2104               // update mapping's getTo
2105               newref.getMap().setTo(proteinProduct);
2106             }
2107           }
2108         }
2109       }
2110       cdsSeq.addDBRef(newref);
2111       propagated.add(newref);
2112     }
2113     return propagated;
2114   }
2115
2116   /**
2117    * Transfers co-located features on 'fromSeq' to 'toSeq', adjusting the
2118    * feature start/end ranges, optionally omitting specified feature types.
2119    * Returns the number of features copied.
2120    *
2121    * @param fromSeq
2122    * @param toSeq
2123    * @param mapping
2124    *          the mapping from 'fromSeq' to 'toSeq'
2125    * @param select
2126    *          if not null, only features of this type are copied (including
2127    *          subtypes in the Sequence Ontology)
2128    * @param omitting
2129    */
2130   protected static int transferFeatures(SequenceI fromSeq, SequenceI toSeq,
2131           MapList mapping, String select, String... omitting)
2132   {
2133     SequenceI copyTo = toSeq;
2134     while (copyTo.getDatasetSequence() != null)
2135     {
2136       copyTo = copyTo.getDatasetSequence();
2137     }
2138
2139     /*
2140      * get features, optionally restricted by an ontology term
2141      */
2142     List<SequenceFeature> sfs = select == null ? fromSeq.getFeatures()
2143             .getPositionalFeatures() : fromSeq.getFeatures()
2144             .getFeaturesByOntology(select);
2145
2146     int count = 0;
2147     for (SequenceFeature sf : sfs)
2148     {
2149       String type = sf.getType();
2150       boolean omit = false;
2151       for (String toOmit : omitting)
2152       {
2153         if (type.equals(toOmit))
2154         {
2155           omit = true;
2156         }
2157       }
2158       if (omit)
2159       {
2160         continue;
2161       }
2162
2163       /*
2164        * locate the mapped range - null if either start or end is
2165        * not mapped (no partial overlaps are calculated)
2166        */
2167       int start = sf.getBegin();
2168       int end = sf.getEnd();
2169       int[] mappedTo = mapping.locateInTo(start, end);
2170       /*
2171        * if whole exon range doesn't map, try interpreting it
2172        * as 5' or 3' exon overlapping the CDS range
2173        */
2174       if (mappedTo == null)
2175       {
2176         mappedTo = mapping.locateInTo(end, end);
2177         if (mappedTo != null)
2178         {
2179           /*
2180            * end of exon is in CDS range - 5' overlap
2181            * to a range from the start of the peptide
2182            */
2183           mappedTo[0] = 1;
2184         }
2185       }
2186       if (mappedTo == null)
2187       {
2188         mappedTo = mapping.locateInTo(start, start);
2189         if (mappedTo != null)
2190         {
2191           /*
2192            * start of exon is in CDS range - 3' overlap
2193            * to a range up to the end of the peptide
2194            */
2195           mappedTo[1] = toSeq.getLength();
2196         }
2197       }
2198       if (mappedTo != null)
2199       {
2200         int newBegin = Math.min(mappedTo[0], mappedTo[1]);
2201         int newEnd = Math.max(mappedTo[0], mappedTo[1]);
2202         SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
2203                 sf.getFeatureGroup(), sf.getScore());
2204         copyTo.addSequenceFeature(copy);
2205         count++;
2206       }
2207     }
2208     return count;
2209   }
2210
2211   /**
2212    * Returns a mapping from dna to protein by inspecting sequence features of
2213    * type "CDS" on the dna. A mapping is constructed if the total CDS feature
2214    * length is 3 times the peptide length (optionally after dropping a trailing
2215    * stop codon). This method does not check whether the CDS nucleotide sequence
2216    * translates to the peptide sequence.
2217    *
2218    * @param dnaSeq
2219    * @param proteinSeq
2220    * @return
2221    */
2222   public static MapList mapCdsToProtein(SequenceI dnaSeq,
2223           SequenceI proteinSeq)
2224   {
2225     List<int[]> ranges = findCdsPositions(dnaSeq);
2226     int mappedDnaLength = MappingUtils.getLength(ranges);
2227
2228     /*
2229      * if not a whole number of codons, truncate mapping
2230      */
2231     int codonRemainder = mappedDnaLength % CODON_LENGTH;
2232     if (codonRemainder > 0)
2233     {
2234       mappedDnaLength -= codonRemainder;
2235       MappingUtils.removeEndPositions(codonRemainder, ranges);
2236     }
2237
2238     int proteinLength = proteinSeq.getLength();
2239     int proteinStart = proteinSeq.getStart();
2240     int proteinEnd = proteinSeq.getEnd();
2241
2242     /*
2243      * incomplete start codon may mean X at start of peptide
2244      * we ignore both for mapping purposes
2245      */
2246     if (proteinSeq.getCharAt(0) == 'X')
2247     {
2248       // todo JAL-2022 support startPhase > 0
2249       proteinStart++;
2250       proteinLength--;
2251     }
2252     List<int[]> proteinRange = new ArrayList<>();
2253
2254     /*
2255      * dna length should map to protein (or protein plus stop codon)
2256      */
2257     int codesForResidues = mappedDnaLength / CODON_LENGTH;
2258     if (codesForResidues == (proteinLength + 1))
2259     {
2260       // assuming extra codon is for STOP and not in peptide
2261       // todo: check trailing codon is indeed a STOP codon
2262       codesForResidues--;
2263       mappedDnaLength -= CODON_LENGTH;
2264       MappingUtils.removeEndPositions(CODON_LENGTH, ranges);
2265     }
2266
2267     if (codesForResidues == proteinLength)
2268     {
2269       proteinRange.add(new int[] { proteinStart, proteinEnd });
2270       return new MapList(ranges, proteinRange, CODON_LENGTH, 1);
2271     }
2272     return null;
2273   }
2274
2275   /**
2276    * Returns a list of CDS ranges found (as sequence positions base 1), i.e. of
2277    * [start, end] positions of sequence features of type "CDS" (or a sub-type of
2278    * CDS in the Sequence Ontology). The ranges are sorted into ascending start
2279    * position order, so this method is only valid for linear CDS in the same
2280    * sense as the protein product.
2281    *
2282    * @param dnaSeq
2283    * @return
2284    */
2285   protected static List<int[]> findCdsPositions(SequenceI dnaSeq)
2286   {
2287     List<int[]> result = new ArrayList<>();
2288
2289     List<SequenceFeature> sfs = dnaSeq.getFeatures().getFeaturesByOntology(
2290             SequenceOntologyI.CDS);
2291     if (sfs.isEmpty())
2292     {
2293       return result;
2294     }
2295     SequenceFeatures.sortFeatures(sfs, true);
2296
2297     for (SequenceFeature sf : sfs)
2298     {
2299       int phase = 0;
2300       try
2301       {
2302         phase = Integer.parseInt(sf.getPhase());
2303       } catch (NumberFormatException e)
2304       {
2305         // ignore
2306       }
2307       /*
2308        * phase > 0 on first codon means 5' incomplete - skip to the start
2309        * of the next codon; example ENST00000496384
2310        */
2311       int begin = sf.getBegin();
2312       int end = sf.getEnd();
2313       if (result.isEmpty() && phase > 0)
2314       {
2315         begin += phase;
2316         if (begin > end)
2317         {
2318           // shouldn't happen!
2319           System.err
2320                   .println("Error: start phase extends beyond start CDS in "
2321                           + dnaSeq.getName());
2322         }
2323       }
2324       result.add(new int[] { begin, end });
2325     }
2326
2327     /*
2328      * Finally sort ranges by start position. This avoids a dependency on
2329      * keeping features in order on the sequence (if they are in order anyway,
2330      * the sort will have almost no work to do). The implicit assumption is CDS
2331      * ranges are assembled in order. Other cases should not use this method,
2332      * but instead construct an explicit mapping for CDS (e.g. EMBL parsing).
2333      */
2334     Collections.sort(result, IntRangeComparator.ASCENDING);
2335     return result;
2336   }
2337
2338   /**
2339    * Maps exon features from dna to protein, and computes variants in peptide
2340    * product generated by variants in dna, and adds them as sequence_variant
2341    * features on the protein sequence. Returns the number of variant features
2342    * added.
2343    *
2344    * @param dnaSeq
2345    * @param peptide
2346    * @param dnaToProtein
2347    */
2348   public static int computeProteinFeatures(SequenceI dnaSeq,
2349           SequenceI peptide, MapList dnaToProtein)
2350   {
2351     while (dnaSeq.getDatasetSequence() != null)
2352     {
2353       dnaSeq = dnaSeq.getDatasetSequence();
2354     }
2355     while (peptide.getDatasetSequence() != null)
2356     {
2357       peptide = peptide.getDatasetSequence();
2358     }
2359
2360     transferFeatures(dnaSeq, peptide, dnaToProtein, SequenceOntologyI.EXON);
2361
2362     /*
2363      * compute protein variants from dna variants and codon mappings;
2364      * NB - alternatively we could retrieve this using the REST service e.g.
2365      * http://rest.ensembl.org/overlap/translation
2366      * /ENSP00000288602?feature=transcript_variation;content-type=text/xml
2367      * which would be a bit slower but possibly more reliable
2368      */
2369
2370     /*
2371      * build a map with codon variations for each potentially varying peptide
2372      */
2373     LinkedHashMap<Integer, List<DnaVariant>[]> variants = buildDnaVariantsMap(
2374             dnaSeq, dnaToProtein);
2375
2376     /*
2377      * scan codon variations, compute peptide variants and add to peptide sequence
2378      */
2379     int count = 0;
2380     for (Entry<Integer, List<DnaVariant>[]> variant : variants.entrySet())
2381     {
2382       int peptidePos = variant.getKey();
2383       List<DnaVariant>[] codonVariants = variant.getValue();
2384       count += computePeptideVariants(peptide, peptidePos, codonVariants);
2385     }
2386
2387     return count;
2388   }
2389
2390   /**
2391    * Computes non-synonymous peptide variants from codon variants and adds them
2392    * as sequence_variant features on the protein sequence (one feature per
2393    * allele variant). Selected attributes (variant id, clinical significance)
2394    * are copied over to the new features.
2395    *
2396    * @param peptide
2397    *          the protein sequence
2398    * @param peptidePos
2399    *          the position to compute peptide variants for
2400    * @param codonVariants
2401    *          a list of dna variants per codon position
2402    * @return the number of features added
2403    */
2404   static int computePeptideVariants(SequenceI peptide, int peptidePos,
2405           List<DnaVariant>[] codonVariants)
2406   {
2407     String residue = String.valueOf(peptide.getCharAt(peptidePos - 1));
2408     int count = 0;
2409     String base1 = codonVariants[0].get(0).base;
2410     String base2 = codonVariants[1].get(0).base;
2411     String base3 = codonVariants[2].get(0).base;
2412
2413     /*
2414      * variants in first codon base
2415      */
2416     for (DnaVariant var : codonVariants[0])
2417     {
2418       if (var.variant != null)
2419       {
2420         String alleles = (String) var.variant.getValue(Gff3Helper.ALLELES);
2421         if (alleles != null)
2422         {
2423           for (String base : alleles.split(","))
2424           {
2425             if (!base1.equalsIgnoreCase(base))
2426             {
2427               String codon = base.toUpperCase() + base2.toLowerCase()
2428                       + base3.toLowerCase();
2429               String canonical = base1.toUpperCase() + base2.toLowerCase()
2430                       + base3.toLowerCase();
2431               if (addPeptideVariant(peptide, peptidePos, residue, var,
2432                       codon, canonical))
2433               {
2434                 count++;
2435               }
2436             }
2437           }
2438         }
2439       }
2440     }
2441
2442     /*
2443      * variants in second codon base
2444      */
2445     for (DnaVariant var : codonVariants[1])
2446     {
2447       if (var.variant != null)
2448       {
2449         String alleles = (String) var.variant.getValue(Gff3Helper.ALLELES);
2450         if (alleles != null)
2451         {
2452           for (String base : alleles.split(","))
2453           {
2454             if (!base2.equalsIgnoreCase(base))
2455             {
2456               String codon = base1.toLowerCase() + base.toUpperCase()
2457                       + base3.toLowerCase();
2458               String canonical = base1.toLowerCase() + base2.toUpperCase()
2459                       + base3.toLowerCase();
2460               if (addPeptideVariant(peptide, peptidePos, residue, var,
2461                       codon, canonical))
2462               {
2463                 count++;
2464               }
2465             }
2466           }
2467         }
2468       }
2469     }
2470
2471     /*
2472      * variants in third codon base
2473      */
2474     for (DnaVariant var : codonVariants[2])
2475     {
2476       if (var.variant != null)
2477       {
2478         String alleles = (String) var.variant.getValue(Gff3Helper.ALLELES);
2479         if (alleles != null)
2480         {
2481           for (String base : alleles.split(","))
2482           {
2483             if (!base3.equalsIgnoreCase(base))
2484             {
2485               String codon = base1.toLowerCase() + base2.toLowerCase()
2486                       + base.toUpperCase();
2487               String canonical = base1.toLowerCase() + base2.toLowerCase()
2488                       + base3.toUpperCase();
2489               if (addPeptideVariant(peptide, peptidePos, residue, var,
2490                       codon, canonical))
2491               {
2492                 count++;
2493               }
2494             }
2495           }
2496         }
2497       }
2498     }
2499
2500     return count;
2501   }
2502
2503   /**
2504    * Helper method that adds a peptide variant feature. ID and
2505    * clinical_significance attributes of the dna variant (if present) are copied
2506    * to the new feature.
2507    *
2508    * @param peptide
2509    * @param peptidePos
2510    * @param residue
2511    * @param var
2512    * @param codon
2513    *          the variant codon e.g. aCg
2514    * @param canonical
2515    *          the 'normal' codon e.g. aTg
2516    * @return true if a feature was added, else false
2517    */
2518   static boolean addPeptideVariant(SequenceI peptide, int peptidePos,
2519           String residue, DnaVariant var, String codon, String canonical)
2520   {
2521     /*
2522      * get peptide translation of codon e.g. GAT -> D
2523      * note that variants which are not single alleles,
2524      * e.g. multibase variants or HGMD_MUTATION etc
2525      * are currently ignored here
2526      */
2527     String trans = codon.contains("-") ? null
2528             : (codon.length() > CODON_LENGTH ? null
2529                     : ResidueProperties.codonTranslate(codon));
2530     if (trans == null)
2531     {
2532       return false;
2533     }
2534     String desc = canonical + "/" + codon;
2535     String featureType = "";
2536     if (trans.equals(residue))
2537     {
2538       featureType = SequenceOntologyI.SYNONYMOUS_VARIANT;
2539     }
2540     else if (ResidueProperties.STOP.equals(trans))
2541     {
2542       featureType = SequenceOntologyI.STOP_GAINED;
2543     }
2544     else
2545     {
2546       String residue3Char = StringUtils
2547               .toSentenceCase(ResidueProperties.aa2Triplet.get(residue));
2548       String trans3Char = StringUtils
2549               .toSentenceCase(ResidueProperties.aa2Triplet.get(trans));
2550       desc = "p." + residue3Char + peptidePos + trans3Char;
2551       featureType = SequenceOntologyI.NONSYNONYMOUS_VARIANT;
2552     }
2553     SequenceFeature sf = new SequenceFeature(featureType, desc, peptidePos,
2554             peptidePos, var.getSource());
2555
2556     StringBuilder attributes = new StringBuilder(32);
2557     String id = (String) var.variant.getValue(VARIANT_ID);
2558     if (id != null)
2559     {
2560       if (id.startsWith(SEQUENCE_VARIANT))
2561       {
2562         id = id.substring(SEQUENCE_VARIANT.length());
2563       }
2564       sf.setValue(VARIANT_ID, id);
2565       attributes.append(VARIANT_ID).append("=").append(id);
2566       // TODO handle other species variants JAL-2064
2567       StringBuilder link = new StringBuilder(32);
2568       try
2569       {
2570         link.append(desc).append(" ").append(id).append(
2571                 "|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=")
2572                 .append(URLEncoder.encode(id, "UTF-8"));
2573         sf.addLink(link.toString());
2574       } catch (UnsupportedEncodingException e)
2575       {
2576         // as if
2577       }
2578     }
2579     String clinSig = (String) var.variant.getValue(CLINICAL_SIGNIFICANCE);
2580     if (clinSig != null)
2581     {
2582       sf.setValue(CLINICAL_SIGNIFICANCE, clinSig);
2583       attributes.append(";").append(CLINICAL_SIGNIFICANCE).append("=")
2584               .append(clinSig);
2585     }
2586     peptide.addSequenceFeature(sf);
2587     if (attributes.length() > 0)
2588     {
2589       sf.setAttributes(attributes.toString());
2590     }
2591     return true;
2592   }
2593
2594   /**
2595    * Builds a map whose key is position in the protein sequence, and value is a
2596    * list of the base and all variants for each corresponding codon position.
2597    * <p>
2598    * This depends on dna variants being held as a comma-separated list as
2599    * property "alleles" on variant features.
2600    *
2601    * @param dnaSeq
2602    * @param dnaToProtein
2603    * @return
2604    */
2605   @SuppressWarnings("unchecked")
2606   static LinkedHashMap<Integer, List<DnaVariant>[]> buildDnaVariantsMap(
2607           SequenceI dnaSeq, MapList dnaToProtein)
2608   {
2609     /*
2610      * map from peptide position to all variants of the codon which codes for it
2611      * LinkedHashMap ensures we keep the peptide features in sequence order
2612      */
2613     LinkedHashMap<Integer, List<DnaVariant>[]> variants = new LinkedHashMap<>();
2614
2615     List<SequenceFeature> dnaFeatures = dnaSeq.getFeatures()
2616             .getFeaturesByOntology(SequenceOntologyI.SEQUENCE_VARIANT);
2617     if (dnaFeatures.isEmpty())
2618     {
2619       return variants;
2620     }
2621
2622     int dnaStart = dnaSeq.getStart();
2623     int[] lastCodon = null;
2624     int lastPeptidePostion = 0;
2625
2626     /*
2627      * build a map of codon variations for peptides
2628      */
2629     for (SequenceFeature sf : dnaFeatures)
2630     {
2631       int dnaCol = sf.getBegin();
2632       if (dnaCol != sf.getEnd())
2633       {
2634         // not handling multi-locus variant features
2635         continue;
2636       }
2637
2638       /*
2639        * ignore variant if not a SNP
2640        */
2641       String alls = (String) sf.getValue(Gff3Helper.ALLELES);
2642       if (alls == null)
2643       {
2644         continue; // non-SNP VCF variant perhaps - can't process this
2645       }
2646
2647       String[] alleles = alls.toUpperCase().split(",");
2648       boolean isSnp = true;
2649       for (String allele : alleles)
2650       {
2651         if (allele.trim().length() > 1)
2652         {
2653           isSnp = false;
2654         }
2655       }
2656       if (!isSnp)
2657       {
2658         continue;
2659       }
2660
2661       int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
2662       if (mapsTo == null)
2663       {
2664         // feature doesn't lie within coding region
2665         continue;
2666       }
2667       int peptidePosition = mapsTo[0];
2668       List<DnaVariant>[] codonVariants = variants.get(peptidePosition);
2669       if (codonVariants == null)
2670       {
2671         codonVariants = new ArrayList[CODON_LENGTH];
2672         codonVariants[0] = new ArrayList<>();
2673         codonVariants[1] = new ArrayList<>();
2674         codonVariants[2] = new ArrayList<>();
2675         variants.put(peptidePosition, codonVariants);
2676       }
2677
2678       /*
2679        * get this peptide's codon positions e.g. [3, 4, 5] or [4, 7, 10]
2680        */
2681       int[] codon = peptidePosition == lastPeptidePostion ? lastCodon
2682               : MappingUtils.flattenRanges(dnaToProtein.locateInFrom(
2683                       peptidePosition, peptidePosition));
2684       lastPeptidePostion = peptidePosition;
2685       lastCodon = codon;
2686
2687       /*
2688        * save nucleotide (and any variant) for each codon position
2689        */
2690       for (int codonPos = 0; codonPos < CODON_LENGTH; codonPos++)
2691       {
2692         String nucleotide = String.valueOf(
2693                 dnaSeq.getCharAt(codon[codonPos] - dnaStart)).toUpperCase();
2694         List<DnaVariant> codonVariant = codonVariants[codonPos];
2695         if (codon[codonPos] == dnaCol)
2696         {
2697           if (!codonVariant.isEmpty()
2698                   && codonVariant.get(0).variant == null)
2699           {
2700             /*
2701              * already recorded base value, add this variant
2702              */
2703             codonVariant.get(0).variant = sf;
2704           }
2705           else
2706           {
2707             /*
2708              * add variant with base value
2709              */
2710             codonVariant.add(new DnaVariant(nucleotide, sf));
2711           }
2712         }
2713         else if (codonVariant.isEmpty())
2714         {
2715           /*
2716            * record (possibly non-varying) base value
2717            */
2718           codonVariant.add(new DnaVariant(nucleotide));
2719         }
2720       }
2721     }
2722     return variants;
2723   }
2724
2725   /**
2726    * Makes an alignment with a copy of the given sequences, adding in any
2727    * non-redundant sequences which are mapped to by the cross-referenced
2728    * sequences.
2729    *
2730    * @param seqs
2731    * @param xrefs
2732    * @param dataset
2733    *          the alignment dataset shared by the new copy
2734    * @return
2735    */
2736   public static AlignmentI makeCopyAlignment(SequenceI[] seqs,
2737           SequenceI[] xrefs, AlignmentI dataset)
2738   {
2739     AlignmentI copy = new Alignment(new Alignment(seqs));
2740     copy.setDataset(dataset);
2741     boolean isProtein = !copy.isNucleotide();
2742     SequenceIdMatcher matcher = new SequenceIdMatcher(seqs);
2743     if (xrefs != null)
2744     {
2745       for (SequenceI xref : xrefs)
2746       {
2747         DBRefEntry[] dbrefs = xref.getDBRefs();
2748         if (dbrefs != null)
2749         {
2750           for (DBRefEntry dbref : dbrefs)
2751           {
2752             if (dbref.getMap() == null || dbref.getMap().getTo() == null
2753                     || dbref.getMap().getTo().isProtein() != isProtein)
2754             {
2755               continue;
2756             }
2757             SequenceI mappedTo = dbref.getMap().getTo();
2758             SequenceI match = matcher.findIdMatch(mappedTo);
2759             if (match == null)
2760             {
2761               matcher.add(mappedTo);
2762               copy.addSequence(mappedTo);
2763             }
2764           }
2765         }
2766       }
2767     }
2768     return copy;
2769   }
2770
2771   /**
2772    * Try to align sequences in 'unaligned' to match the alignment of their
2773    * mapped regions in 'aligned'. For example, could use this to align CDS
2774    * sequences which are mapped to their parent cDNA sequences.
2775    *
2776    * This method handles 1:1 mappings (dna-to-dna or protein-to-protein). For
2777    * dna-to-protein or protein-to-dna use alternative methods.
2778    *
2779    * @param unaligned
2780    *          sequences to be aligned
2781    * @param aligned
2782    *          holds aligned sequences and their mappings
2783    * @return
2784    */
2785   public static int alignAs(AlignmentI unaligned, AlignmentI aligned)
2786   {
2787     /*
2788      * easy case - aligning a copy of aligned sequences
2789      */
2790     if (alignAsSameSequences(unaligned, aligned))
2791     {
2792       return unaligned.getHeight();
2793     }
2794
2795     /*
2796      * fancy case - aligning via mappings between sequences
2797      */
2798     List<SequenceI> unmapped = new ArrayList<>();
2799     Map<Integer, Map<SequenceI, Character>> columnMap = buildMappedColumnsMap(
2800             unaligned, aligned, unmapped);
2801     int width = columnMap.size();
2802     char gap = unaligned.getGapCharacter();
2803     int realignedCount = 0;
2804     // TODO: verify this loop scales sensibly for very wide/high alignments
2805
2806     for (SequenceI seq : unaligned.getSequences())
2807     {
2808       if (!unmapped.contains(seq))
2809       {
2810         char[] newSeq = new char[width];
2811         Arrays.fill(newSeq, gap); // JBPComment - doubt this is faster than the
2812                                   // Integer iteration below
2813         int newCol = 0;
2814         int lastCol = 0;
2815
2816         /*
2817          * traverse the map to find columns populated
2818          * by our sequence
2819          */
2820         for (Integer column : columnMap.keySet())
2821         {
2822           Character c = columnMap.get(column).get(seq);
2823           if (c != null)
2824           {
2825             /*
2826              * sequence has a character at this position
2827              *
2828              */
2829             newSeq[newCol] = c;
2830             lastCol = newCol;
2831           }
2832           newCol++;
2833         }
2834
2835         /*
2836          * trim trailing gaps
2837          */
2838         if (lastCol < width)
2839         {
2840           char[] tmp = new char[lastCol + 1];
2841           System.arraycopy(newSeq, 0, tmp, 0, lastCol + 1);
2842           newSeq = tmp;
2843         }
2844         // TODO: optimise SequenceI to avoid char[]->String->char[]
2845         seq.setSequence(String.valueOf(newSeq));
2846         realignedCount++;
2847       }
2848     }
2849     return realignedCount;
2850   }
2851
2852   /**
2853    * If unaligned and aligned sequences share the same dataset sequences, then
2854    * simply copies the aligned sequences to the unaligned sequences and returns
2855    * true; else returns false
2856    *
2857    * @param unaligned
2858    *          - sequences to be aligned based on aligned
2859    * @param aligned
2860    *          - 'guide' alignment containing sequences derived from same dataset
2861    *          as unaligned
2862    * @return
2863    */
2864   static boolean alignAsSameSequences(AlignmentI unaligned,
2865           AlignmentI aligned)
2866   {
2867     if (aligned.getDataset() == null || unaligned.getDataset() == null)
2868     {
2869       return false; // should only pass alignments with datasets here
2870     }
2871
2872     // map from dataset sequence to alignment sequence(s)
2873     Map<SequenceI, List<SequenceI>> alignedDatasets = new HashMap<>();
2874     for (SequenceI seq : aligned.getSequences())
2875     {
2876       SequenceI ds = seq.getDatasetSequence();
2877       if (alignedDatasets.get(ds) == null)
2878       {
2879         alignedDatasets.put(ds, new ArrayList<SequenceI>());
2880       }
2881       alignedDatasets.get(ds).add(seq);
2882     }
2883
2884     /*
2885      * first pass - check whether all sequences to be aligned share a dataset
2886      * sequence with an aligned sequence
2887      */
2888     for (SequenceI seq : unaligned.getSequences())
2889     {
2890       if (!alignedDatasets.containsKey(seq.getDatasetSequence()))
2891       {
2892         return false;
2893       }
2894     }
2895
2896     /*
2897      * second pass - copy aligned sequences;
2898      * heuristic rule: pair off sequences in order for the case where
2899      * more than one shares the same dataset sequence
2900      */
2901     for (SequenceI seq : unaligned.getSequences())
2902     {
2903       List<SequenceI> alignedSequences = alignedDatasets
2904               .get(seq.getDatasetSequence());
2905       // TODO: getSequenceAsString() will be deprecated in the future
2906       // TODO: need to leave to SequenceI implementor to update gaps
2907       seq.setSequence(alignedSequences.get(0).getSequenceAsString());
2908       if (alignedSequences.size() > 0)
2909       {
2910         // pop off aligned sequences (except the last one)
2911         alignedSequences.remove(0);
2912       }
2913     }
2914
2915     return true;
2916   }
2917
2918   /**
2919    * Returns a map whose key is alignment column number (base 1), and whose
2920    * values are a map of sequence characters in that column.
2921    *
2922    * @param unaligned
2923    * @param aligned
2924    * @param unmapped
2925    * @return
2926    */
2927   static SortedMap<Integer, Map<SequenceI, Character>> buildMappedColumnsMap(
2928           AlignmentI unaligned, AlignmentI aligned,
2929           List<SequenceI> unmapped)
2930   {
2931     /*
2932      * Map will hold, for each aligned column position, a map of
2933      * {unalignedSequence, characterPerSequence} at that position.
2934      * TreeMap keeps the entries in ascending column order.
2935      */
2936     SortedMap<Integer, Map<SequenceI, Character>> map = new TreeMap<>();
2937
2938     /*
2939      * record any sequences that have no mapping so can't be realigned
2940      */
2941     unmapped.addAll(unaligned.getSequences());
2942
2943     List<AlignedCodonFrame> mappings = aligned.getCodonFrames();
2944
2945     for (SequenceI seq : unaligned.getSequences())
2946     {
2947       for (AlignedCodonFrame mapping : mappings)
2948       {
2949         SequenceI fromSeq = mapping.findAlignedSequence(seq, aligned);
2950         if (fromSeq != null)
2951         {
2952           Mapping seqMap = mapping.getMappingBetween(fromSeq, seq);
2953           if (addMappedPositions(seq, fromSeq, seqMap, map))
2954           {
2955             unmapped.remove(seq);
2956           }
2957         }
2958       }
2959     }
2960     return map;
2961   }
2962
2963   /**
2964    * Helper method that adds to a map the mapped column positions of a sequence.
2965    * <br>
2966    * For example if aaTT-Tg-gAAA is mapped to TTTAAA then the map should record
2967    * that columns 3,4,6,10,11,12 map to characters T,T,T,A,A,A of the mapped to
2968    * sequence.
2969    *
2970    * @param seq
2971    *          the sequence whose column positions we are recording
2972    * @param fromSeq
2973    *          a sequence that is mapped to the first sequence
2974    * @param seqMap
2975    *          the mapping from 'fromSeq' to 'seq'
2976    * @param map
2977    *          a map to add the column positions (in fromSeq) of the mapped
2978    *          positions of seq
2979    * @return
2980    */
2981   static boolean addMappedPositions(SequenceI seq, SequenceI fromSeq,
2982           Mapping seqMap, Map<Integer, Map<SequenceI, Character>> map)
2983   {
2984     if (seqMap == null)
2985     {
2986       return false;
2987     }
2988
2989     /*
2990      * invert mapping if it is from unaligned to aligned sequence
2991      */
2992     if (seqMap.getTo() == fromSeq.getDatasetSequence())
2993     {
2994       seqMap = new Mapping(seq.getDatasetSequence(),
2995               seqMap.getMap().getInverse());
2996     }
2997
2998     int toStart = seq.getStart();
2999
3000     /*
3001      * traverse [start, end, start, end...] ranges in fromSeq
3002      */
3003     for (int[] fromRange : seqMap.getMap().getFromRanges())
3004     {
3005       for (int i = 0; i < fromRange.length - 1; i += 2)
3006       {
3007         boolean forward = fromRange[i + 1] >= fromRange[i];
3008
3009         /*
3010          * find the range mapped to (sequence positions base 1)
3011          */
3012         int[] range = seqMap.locateMappedRange(fromRange[i],
3013                 fromRange[i + 1]);
3014         if (range == null)
3015         {
3016           System.err.println("Error in mapping " + seqMap + " from "
3017                   + fromSeq.getName());
3018           return false;
3019         }
3020         int fromCol = fromSeq.findIndex(fromRange[i]);
3021         int mappedCharPos = range[0];
3022
3023         /*
3024          * walk over the 'from' aligned sequence in forward or reverse
3025          * direction; when a non-gap is found, record the column position
3026          * of the next character of the mapped-to sequence; stop when all
3027          * the characters of the range have been counted
3028          */
3029         while (mappedCharPos <= range[1] && fromCol <= fromSeq.getLength()
3030                 && fromCol >= 0)
3031         {
3032           if (!Comparison.isGap(fromSeq.getCharAt(fromCol - 1)))
3033           {
3034             /*
3035              * mapped from sequence has a character in this column
3036              * record the column position for the mapped to character
3037              */
3038             Map<SequenceI, Character> seqsMap = map.get(fromCol);
3039             if (seqsMap == null)
3040             {
3041               seqsMap = new HashMap<>();
3042               map.put(fromCol, seqsMap);
3043             }
3044             seqsMap.put(seq, seq.getCharAt(mappedCharPos - toStart));
3045             mappedCharPos++;
3046           }
3047           fromCol += (forward ? 1 : -1);
3048         }
3049       }
3050     }
3051     return true;
3052   }
3053
3054   // strictly temporary hack until proper criteria for aligning protein to cds
3055   // are in place; this is so Ensembl -> fetch xrefs Uniprot aligns the Uniprot
3056   public static boolean looksLikeEnsembl(AlignmentI alignment)
3057   {
3058     for (SequenceI seq : alignment.getSequences())
3059     {
3060       String name = seq.getName();
3061       if (!name.startsWith("ENSG") && !name.startsWith("ENST"))
3062       {
3063         return false;
3064       }
3065     }
3066     return true;
3067   }
3068 }