JAL-2598 Sequence.getSequence return a copy of the char array

[jalview.git] / src / jalview / analysis / AlignmentUtils.java
diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java

index ba9821c..b65096c 100644 (file)
--- a/src/jalview/analysis/AlignmentUtils.java
+++ b/src/jalview/analysis/AlignmentUtils.java
@@ -22,7 +22,6 @@ package jalview.analysis;
  
  import static jalview.io.gff.GffConstants.CLINICAL_SIGNIFICANCE;
  
-import jalview.api.DBRefEntryI;
  import jalview.datamodel.AlignedCodon;
  import jalview.datamodel.AlignedCodonFrame;
  import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
@@ -36,11 +35,12 @@ import jalview.datamodel.Sequence;
  import jalview.datamodel.SequenceFeature;
  import jalview.datamodel.SequenceGroup;
  import jalview.datamodel.SequenceI;
-import jalview.io.gff.SequenceOntologyFactory;
+import jalview.datamodel.features.SequenceFeatures;
  import jalview.io.gff.SequenceOntologyI;
  import jalview.schemes.ResidueProperties;
  import jalview.util.Comparison;
  import jalview.util.DBRefUtils;
+import jalview.util.IntRangeComparator;
  import jalview.util.MapList;
  import jalview.util.MappingUtils;
  import jalview.util.StringUtils;
@@ -51,7 +51,6 @@ import java.util.ArrayList;
  import java.util.Arrays;
  import java.util.Collection;
  import java.util.Collections;
-import java.util.Comparator;
  import java.util.HashMap;
  import java.util.HashSet;
  import java.util.Iterator;
@@ -61,6 +60,7 @@ import java.util.Map;
  import java.util.Map.Entry;
  import java.util.NoSuchElementException;
  import java.util.Set;
+import java.util.SortedMap;
  import java.util.TreeMap;
  
  /**
@@ -73,22 +73,26 @@ import java.util.TreeMap;
  public class AlignmentUtils
  {
  
+  private static final int CODON_LENGTH = 3;
+
    private static final String SEQUENCE_VARIANT = "sequence_variant:";
+
    private static final String ID = "ID";
  
    /**
     * A data model to hold the 'normal' base value at a position, and an optional
     * sequence variant feature
     */
-  static class DnaVariant
+  static final class DnaVariant
    {
-    String base;
+    final String base;
  
      SequenceFeature variant;
  
      DnaVariant(String nuc)
      {
        base = nuc;
+      variant = null;
      }
  
      DnaVariant(String nuc, SequenceFeature var)
@@ -96,6 +100,11 @@ public class AlignmentUtils
        base = nuc;
        variant = var;
      }
+
+    public String getSource()
+    {
+      return variant == null ? null : variant.getFeatureGroup();
+    }
    }
  
    /**
@@ -428,7 +437,7 @@ public class AlignmentUtils
      /*
       * cdnaStart/End, proteinStartEnd are base 1 (for dataset sequence mapping)
       */
-    final int mappedLength = 3 * aaSeqChars.length;
+    final int mappedLength = CODON_LENGTH * aaSeqChars.length;
      int cdnaLength = cdnaSeqChars.length;
      int cdnaStart = cdnaSeq.getStart();
      int cdnaEnd = cdnaSeq.getEnd();
@@ -440,14 +449,14 @@ public class AlignmentUtils
       */
      if (cdnaLength != mappedLength && cdnaLength > 2)
      {
-      String lastCodon = String.valueOf(cdnaSeqChars, cdnaLength - 3, 3)
-              .toUpperCase();
+      String lastCodon = String.valueOf(cdnaSeqChars,
+              cdnaLength - CODON_LENGTH, CODON_LENGTH).toUpperCase();
        for (String stop : ResidueProperties.STOP)
        {
          if (lastCodon.equals(stop))
          {
-          cdnaEnd -= 3;
-          cdnaLength -= 3;
+          cdnaEnd -= CODON_LENGTH;
+          cdnaLength -= CODON_LENGTH;
            break;
          }
        }
@@ -459,12 +468,12 @@ public class AlignmentUtils
      int startOffset = 0;
      if (cdnaLength != mappedLength
              && cdnaLength > 2
-            && String.valueOf(cdnaSeqChars, 0, 3).toUpperCase()
+            && String.valueOf(cdnaSeqChars, 0, CODON_LENGTH).toUpperCase()
                      .equals(ResidueProperties.START))
      {
-      startOffset += 3;
-      cdnaStart += 3;
-      cdnaLength -= 3;
+      startOffset += CODON_LENGTH;
+      cdnaStart += CODON_LENGTH;
+      cdnaLength -= CODON_LENGTH;
      }
  
      if (translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
@@ -473,7 +482,7 @@ public class AlignmentUtils
         * protein is translation of dna (+/- start/stop codons)
         */
        MapList map = new MapList(new int[] { cdnaStart, cdnaEnd }, new int[]
-      { proteinStart, proteinEnd }, 3, 1);
+      { proteinStart, proteinEnd }, CODON_LENGTH, 1);
        return map;
      }
  
@@ -503,10 +512,9 @@ public class AlignmentUtils
  
      int aaPos = 0;
      int dnaPos = cdnaStart;
-    for (; dnaPos < cdnaSeqChars.length - 2
-            && aaPos < aaSeqChars.length; dnaPos += 3, aaPos++)
+    for (; dnaPos < cdnaSeqChars.length - 2 && aaPos < aaSeqChars.length; dnaPos += CODON_LENGTH, aaPos++)
      {
-      String codon = String.valueOf(cdnaSeqChars, dnaPos, 3);
+      String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
        final String translated = ResidueProperties.codonTranslate(codon);
  
        /*
@@ -542,9 +550,9 @@ public class AlignmentUtils
      {
        return true;
      }
-    if (dnaPos == cdnaSeqChars.length - 3)
+    if (dnaPos == cdnaSeqChars.length - CODON_LENGTH)
      {
-      String codon = String.valueOf(cdnaSeqChars, dnaPos, 3);
+      String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
        if ("STOP".equals(ResidueProperties.codonTranslate(codon)))
        {
          return true;
@@ -642,15 +650,16 @@ public class AlignmentUtils
      int toOffset = alignTo.getStart() - 1;
      int sourceGapMappedLength = 0;
      boolean inExon = false;
-    final char[] thisSeq = alignTo.getSequence();
-    final char[] thatAligned = alignFrom.getSequence();
-    StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length);
+    final int toLength = alignTo.getLength();
+    final int fromLength = alignFrom.getLength();
+    StringBuilder thisAligned = new StringBuilder(2 * toLength);
  
      /*
       * Traverse the 'model' aligned sequence
       */
-    for (char sourceChar : thatAligned)
+    for (int i = 0; i < fromLength; i++)
      {
+      char sourceChar = alignFrom.getCharAt(i);
        if (sourceChar == sourceGap)
        {
          sourceGapMappedLength += ratio;
@@ -690,9 +699,9 @@ public class AlignmentUtils
         */
        int intronLength = 0;
        while (basesWritten + toOffset < mappedCodonEnd
-              && thisSeqPos < thisSeq.length)
+              && thisSeqPos < toLength)
        {
-        final char c = thisSeq[thisSeqPos++];
+        final char c = alignTo.getCharAt(thisSeqPos++);
          if (c != myGapChar)
          {
            basesWritten++;
@@ -718,7 +727,7 @@ public class AlignmentUtils
              int gapsToAdd = calculateGapsToInsert(preserveMappedGaps,
                      preserveUnmappedGaps, sourceGapMappedLength, inExon,
                      trailingCopiedGap.length(), intronLength, startOfCodon);
-            for (int i = 0; i < gapsToAdd; i++)
+            for (int k = 0; k < gapsToAdd; k++)
              {
                thisAligned.append(myGapChar);
              }
@@ -746,9 +755,9 @@ public class AlignmentUtils
       * At end of model aligned sequence. Copy any remaining target sequence, optionally
       * including (intron) gaps.
       */
-    while (thisSeqPos < thisSeq.length)
+    while (thisSeqPos < toLength)
      {
-      final char c = thisSeq[thisSeqPos++];
+      final char c = alignTo.getCharAt(thisSeqPos++);
        if (c != myGapChar || preserveUnmappedGaps)
        {
          thisAligned.append(c);
@@ -895,7 +904,8 @@ public class AlignmentUtils
        }
        width = Math.max(dnaSeq.getLength(), width);
      }
-    int oldwidth, diff;
+    int oldwidth;
+    int diff;
      for (SequenceI dnaSeq : dna.getSequences())
      {
        oldwidth = dnaSeq.getLength();
@@ -929,15 +939,15 @@ public class AlignmentUtils
                .println("alignCdsSequenceAsProtein needs aligned sequence!");
        return false;
      }
-    
+
      List<AlignedCodonFrame> dnaMappings = MappingUtils
              .findMappingsForSequence(cdsSeq, mappings);
      for (AlignedCodonFrame mapping : dnaMappings)
      {
        SequenceI peptide = mapping.findAlignedSequence(cdsSeq, protein);
-      int peptideLength = peptide.getLength();
        if (peptide != null)
        {
+        final int peptideLength = peptide.getLength();
          Mapping map = mapping.getMappingBetween(cdsSeq, peptide);
          if (map != null)
          {
@@ -946,12 +956,13 @@ public class AlignmentUtils
            {
              mapList = mapList.getInverse();
            }
-          int cdsLength = cdsDss.getLength();
+          final int cdsLength = cdsDss.getLength();
            int mappedFromLength = MappingUtils.getLength(mapList
                    .getFromRanges());
            int mappedToLength = MappingUtils
                    .getLength(mapList.getToRanges());
-          boolean addStopCodon = (cdsLength == mappedFromLength * 3 + 3)
+          boolean addStopCodon = (cdsLength == mappedFromLength
+                  * CODON_LENGTH + CODON_LENGTH)
                    || (peptide.getDatasetSequence().getLength() == mappedFromLength - 1);
            if (cdsLength != mappedToLength && !addStopCodon)
            {
@@ -965,25 +976,26 @@ public class AlignmentUtils
            /*
             * pre-fill the aligned cds sequence with gaps
             */
-          char[] alignedCds = new char[peptideLength * 3
-                  + (addStopCodon ? 3 : 0)];
+          char[] alignedCds = new char[peptideLength * CODON_LENGTH
+                  + (addStopCodon ? CODON_LENGTH : 0)];
            Arrays.fill(alignedCds, gapChar);
  
            /*
             * walk over the aligned peptide sequence and insert mapped 
             * codons for residues in the aligned cds sequence 
             */
-          char[] alignedPeptide = peptide.getSequence();
-          char[] nucleotides = cdsDss.getSequence();
            int copiedBases = 0;
            int cdsStart = cdsDss.getStart();
            int proteinPos = peptide.getStart() - 1;
            int cdsCol = 0;
-          for (char residue : alignedPeptide)
+
+          for (int col = 0; col < peptideLength; col++)
            {
+            char residue = peptide.getCharAt(col);
+
              if (Comparison.isGap(residue))
              {
-              cdsCol += 3;
+              cdsCol += CODON_LENGTH;
              }
              else
              {
@@ -992,13 +1004,13 @@ public class AlignmentUtils
                if (codon == null)
                {
                  // e.g. incomplete start codon, X in peptide
-                cdsCol += 3;
+                cdsCol += CODON_LENGTH;
                }
                else
                {
                  for (int j = codon[0]; j <= codon[1]; j++)
                  {
-                  char mappedBase = nucleotides[j - cdsStart];
+                  char mappedBase = cdsDss.getCharAt(j - cdsStart);
                    alignedCds[cdsCol++] = mappedBase;
                    copiedBases++;
                  }
@@ -1010,7 +1022,7 @@ public class AlignmentUtils
             * append stop codon if not mapped from protein,
             * closing it up to the end of the mapped sequence
             */
-          if (copiedBases == nucleotides.length - 3)
+          if (copiedBases == cdsLength - CODON_LENGTH)
            {
              for (int i = alignedCds.length - 1; i >= 0; i--)
              {
@@ -1020,9 +1032,9 @@ public class AlignmentUtils
                  break;
                }
              }
-            for (int i = nucleotides.length - 3; i < nucleotides.length; i++)
+            for (int i = cdsLength - CODON_LENGTH; i < cdsLength; i++)
              {
-              alignedCds[cdsCol++] = nucleotides[i];
+              alignedCds[cdsCol++] = cdsDss.getCharAt(i);
              }
            }
            cdsSeq.setSequence(new String(alignedCds));
@@ -1090,7 +1102,7 @@ public class AlignmentUtils
      // TODO resolve JAL-2022 so this fudge can be removed
      int mappedSequenceCount = protein.getHeight() - unmappedProtein.size();
      addUnmappedPeptideStarts(alignedCodons, mappedSequenceCount);
-    
+
      return alignedCodons;
    }
  
@@ -1192,21 +1204,26 @@ public class AlignmentUtils
            List<SequenceI> unmappedProtein)
    {
      /*
-     * Prefill aligned sequences with gaps before inserting aligned protein
-     * residues.
+     * prefill peptide sequences with gaps 
       */
      int alignedWidth = alignedCodons.size();
      char[] gaps = new char[alignedWidth];
      Arrays.fill(gaps, protein.getGapCharacter());
-    String allGaps = String.valueOf(gaps);
+    Map<SequenceI, char[]> peptides = new HashMap<>();
      for (SequenceI seq : protein.getSequences())
      {
        if (!unmappedProtein.contains(seq))
        {
-        seq.setSequence(allGaps);
+        peptides.put(seq, Arrays.copyOf(gaps, gaps.length));
        }
      }
  
+    /*
+     * Traverse the codons left to right (as defined by CodonComparator)
+     * and insert peptides in each column where the sequence is mapped.
+     * This gives a peptide 'alignment' where residues are aligned if their
+     * corresponding codons occupy the same columns in the cdna alignment.
+     */
      int column = 0;
      for (AlignedCodon codon : alignedCodons.keySet())
      {
@@ -1214,12 +1231,20 @@ public class AlignmentUtils
                .get(codon);
        for (Entry<SequenceI, AlignedCodon> entry : columnResidues.entrySet())
        {
-        // place translated codon at its column position in sequence
-        entry.getKey().getSequence()[column] = entry.getValue().product
-                .charAt(0);
+        char residue = entry.getValue().product.charAt(0);
+        peptides.get(entry.getKey())[column] = residue;
        }
        column++;
      }
+
+    /*
+     * and finally set the constructed sequences
+     */
+    for (Entry<SequenceI, char[]> entry : peptides.entrySet())
+    {
+      entry.getKey().setSequence(new String(entry.getValue()));
+    }
+
      return 0;
    }
  
@@ -1681,11 +1706,20 @@ public class AlignmentUtils
             * didn't find mapped CDS sequence - construct it and add
             * its dataset sequence to the dataset
             */
-          cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping);
-          SequenceI cdsSeqDss = cdsSeq.createDatasetSequence();
+          cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping,
+                  dataset).deriveSequence();
+          // cdsSeq has a name constructed as CDS|<dbref>
+          // <dbref> will be either the accession for the coding sequence,
+          // marked in the /via/ dbref to the protein product accession
+          // or it will be the original nucleotide accession.
+          SequenceI cdsSeqDss = cdsSeq.getDatasetSequence();
+
            cdsSeqs.add(cdsSeq);
+
            if (!dataset.getSequences().contains(cdsSeqDss))
            {
+            // check if this sequence is a newly created one
+            // so needs adding to the dataset
              dataset.addSequence(cdsSeqDss);
            }
  
@@ -1694,10 +1728,12 @@ public class AlignmentUtils
             */
            List<int[]> cdsRange = Collections.singletonList(new int[] { 1,
                cdsSeq.getLength() });
-          MapList cdsToProteinMap = new MapList(cdsRange, mapList.getToRanges(),
-                  mapList.getFromRatio(), mapList.getToRatio());
+          MapList cdsToProteinMap = new MapList(cdsRange,
+                  mapList.getToRanges(), mapList.getFromRatio(),
+                  mapList.getToRatio());
            AlignedCodonFrame cdsToProteinMapping = new AlignedCodonFrame();
-          cdsToProteinMapping.addMap(cdsSeq, proteinProduct, cdsToProteinMap);
+          cdsToProteinMapping.addMap(cdsSeqDss, proteinProduct,
+                  cdsToProteinMap);
  
            /*
             * guard against duplicating the mapping if repeating this action
@@ -1707,14 +1743,15 @@ public class AlignmentUtils
              mappings.add(cdsToProteinMapping);
            }
  
+          propagateDBRefsToCDS(cdsSeqDss, dnaSeq.getDatasetSequence(),
+                  proteinProduct, aMapping);
            /*
             * add another mapping from original 'from' range to CDS
             */
            AlignedCodonFrame dnaToCdsMapping = new AlignedCodonFrame();
            MapList dnaToCdsMap = new MapList(mapList.getFromRanges(),
-                  cdsRange, 1,
-                  1);
-          dnaToCdsMapping.addMap(dnaSeq.getDatasetSequence(), cdsSeq,
+                  cdsRange, 1, 1);
+          dnaToCdsMapping.addMap(dnaSeq.getDatasetSequence(), cdsSeqDss,
                    dnaToCdsMap);
            if (!mappings.contains(dnaToCdsMapping))
            {
@@ -1728,12 +1765,37 @@ public class AlignmentUtils
             * same source and accession, so need a different accession for
             * the CDS from the dna sequence
             */
-          DBRefEntryI dnaRef = dnaDss.getSourceDBRef();
-          if (dnaRef != null)
+
+          // specific use case:
+          // Genomic contig ENSCHR:1, contains coding regions for ENSG01,
+          // ENSG02, ENSG03, with transcripts and products similarly named.
+          // cannot add distinct dbrefs mapping location on ENSCHR:1 to ENSG01
+
+          // JBPNote: ?? can't actually create an example that demonstrates we
+          // need to
+          // synthesize an xref.
+
+          for (DBRefEntry primRef : dnaDss.getPrimaryDBRefs())
            {
+            // creates a complementary cross-reference to the source sequence's
+            // primary reference.
+
+            DBRefEntry cdsCrossRef = new DBRefEntry(primRef.getSource(),
+                    primRef.getSource() + ":" + primRef.getVersion(),
+                    primRef.getAccessionId());
+            cdsCrossRef
+                    .setMap(new Mapping(dnaDss, new MapList(dnaToCdsMap)));
+            cdsSeqDss.addDBRef(cdsCrossRef);
+
+            // problem here is that the cross-reference is synthesized -
+            // cdsSeq.getName() may be like 'CDS|dnaaccession' or
+            // 'CDS|emblcdsacc'
              // assuming cds version same as dna ?!?
-            DBRefEntry proteinToCdsRef = new DBRefEntry(dnaRef.getSource(),
-                    dnaRef.getVersion(), cdsSeq.getName());
+
+            DBRefEntry proteinToCdsRef = new DBRefEntry(
+                    primRef.getSource(), primRef.getVersion(),
+                    cdsSeq.getName());
+            //
              proteinToCdsRef.setMap(new Mapping(cdsSeqDss, cdsToProteinMap
                      .getInverse()));
              proteinProduct.addDBRef(proteinToCdsRef);
@@ -1742,7 +1804,7 @@ public class AlignmentUtils
            /*
             * transfer any features on dna that overlap the CDS
             */
-          transferFeatures(dnaSeq, cdsSeq, cdsToProteinMap, null,
+          transferFeatures(dnaSeq, cdsSeq, dnaToCdsMap, null,
                    SequenceOntologyI.CDS);
          }
        }
@@ -1789,7 +1851,8 @@ public class AlignmentUtils
      int mappedFromLength = MappingUtils.getLength(aMapping.getMap()
              .getFromRanges());
      int dnaLength = seqDss.getLength();
-    if (mappedFromLength == dnaLength || mappedFromLength == dnaLength - 3)
+    if (mappedFromLength == dnaLength
+            || mappedFromLength == dnaLength - CODON_LENGTH)
      {
        return seqDss;
      }
@@ -1805,7 +1868,8 @@ public class AlignmentUtils
        for (SequenceToSequenceMapping map : acf.getMappings())
        {
          Mapping mapping = map.getMapping();
-        if (mapping != aMapping && mapping.getMap().getFromRatio() == 3
+        if (mapping != aMapping
+                && mapping.getMap().getFromRatio() == CODON_LENGTH
                  && proteinProduct == mapping.getTo()
                  && seqDss != map.getFromSeq())
          {
@@ -1839,9 +1903,14 @@ public class AlignmentUtils
     * 
     * @param seq
     * @param mapping
+   * @param dataset
+   *          - existing dataset. We check for sequences that look like the CDS
+   *          we are about to construct, if one exists already, then we will
+   *          just return that one.
     * @return CDS sequence (as a dataset sequence)
     */
-  static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping)
+  static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping,
+          AlignmentI dataset)
    {
      char[] seqChars = seq.getSequence();
      List<int[]> fromRanges = mapping.getMap().getFromRanges();
@@ -1868,7 +1937,7 @@ public class AlignmentUtils
          }
        }
      }
-    
+
      /*
       * assign 'from id' held in the mapping if set (e.g. EMBL protein_id),
       * else generate a sequence name
@@ -1876,10 +1945,42 @@ public class AlignmentUtils
      String mapFromId = mapping.getMappedFromId();
      String seqId = "CDS|" + (mapFromId != null ? mapFromId : seq.getName());
      SequenceI newSeq = new Sequence(seqId, newSeqChars, 1, newPos);
+    if (dataset != null)
+    {
+      SequenceI[] matches = dataset.findSequenceMatch(newSeq.getName());
+      if (matches != null)
+      {
+        boolean matched = false;
+        for (SequenceI mtch : matches)
+        {
+          if (mtch.getStart() != newSeq.getStart())
+          {
+            continue;
+          }
+          if (mtch.getEnd() != newSeq.getEnd())
+          {
+            continue;
+          }
+          if (!Arrays.equals(mtch.getSequence(), newSeq.getSequence()))
+          {
+            continue;
+          }
+          if (!matched)
+          {
+            matched = true;
+            newSeq = mtch;
+          }
+          else
+          {
+            System.err
+                    .println("JAL-2154 regression: warning - found (and ignnored a duplicate CDS sequence):"
+                            + mtch.toString());
+          }
+        }
+      }
+    }
      // newSeq.setDescription(mapFromId);
  
-    propagateDBRefsToCDS(newSeq, seq, mapping);
-
      return newSeq;
    }
  
@@ -1893,11 +1994,12 @@ public class AlignmentUtils
     * @return list of DBRefEntrys added.
     */
    public static List<DBRefEntry> propagateDBRefsToCDS(SequenceI cdsSeq,
-          SequenceI contig, Mapping mapping)
+          SequenceI contig, SequenceI proteinProduct, Mapping mapping)
    {
  
      // gather direct refs from contig congrent with mapping
      List<DBRefEntry> direct = new ArrayList<DBRefEntry>();
+    HashSet<String> directSources = new HashSet<String>();
      if (contig.getDBRefs() != null)
      {
        for (DBRefEntry dbr : contig.getDBRefs())
@@ -1909,24 +2011,51 @@ public class AlignmentUtils
            if (mapping.getMap().equals(map))
            {
              direct.add(dbr);
+            directSources.add(dbr.getSource());
            }
          }
        }
      }
-
+    DBRefEntry[] onSource = DBRefUtils.selectRefs(
+            proteinProduct.getDBRefs(),
+            directSources.toArray(new String[0]));
      List<DBRefEntry> propagated = new ArrayList<DBRefEntry>();
  
      // and generate appropriate mappings
      for (DBRefEntry cdsref : direct)
      {
-      Mapping cdsmap = cdsref.getMap();
+      // clone maplist and mapping
        MapList cdsposmap = new MapList(Arrays.asList(new int[][] { new int[]
-      { cdsSeq.getStart(), cdsSeq.getEnd() } }), cdsmap.getMap()
+      { cdsSeq.getStart(), cdsSeq.getEnd() } }), cdsref.getMap().getMap()
                .getToRanges(), 3, 1);
+      Mapping cdsmap = new Mapping(cdsref.getMap().getTo(), cdsref.getMap()
+              .getMap());
  
+      // create dbref
        DBRefEntry newref = new DBRefEntry(cdsref.getSource(),
                cdsref.getVersion(), cdsref.getAccessionId(), new Mapping(
                        cdsmap.getTo(), cdsposmap));
+
+      // and see if we can map to the protein product for this mapping.
+      // onSource is the filtered set of accessions on protein that we are
+      // tranferring, so we assume accession is the same.
+      if (cdsmap.getTo() == null && onSource != null)
+      {
+        List<DBRefEntry> sourceRefs = DBRefUtils.searchRefs(onSource,
+                cdsref.getAccessionId());
+        if (sourceRefs != null)
+        {
+          for (DBRefEntry srcref : sourceRefs)
+          {
+            if (srcref.getSource().equalsIgnoreCase(cdsref.getSource()))
+            {
+              // we have found a complementary dbref on the protein product, so
+              // update mapping's getTo
+              newref.getMap().setTo(proteinProduct);
+            }
+          }
+        }
+      }
        cdsSeq.addDBRef(newref);
        propagated.add(newref);
      }
@@ -1940,11 +2069,11 @@ public class AlignmentUtils
     * 
     * @param fromSeq
     * @param toSeq
+   * @param mapping
+   *          the mapping from 'fromSeq' to 'toSeq'
     * @param select
     *          if not null, only features of this type are copied (including
     *          subtypes in the Sequence Ontology)
-   * @param mapping
-   *          the mapping from 'fromSeq' to 'toSeq'
     * @param omitting
     */
    public static int transferFeatures(SequenceI fromSeq, SequenceI toSeq,
@@ -1956,75 +2085,74 @@ public class AlignmentUtils
        copyTo = copyTo.getDatasetSequence();
      }
  
-    SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+    /*
+     * get features, optionally restricted by an ontology term
+     */
+    List<SequenceFeature> sfs = select == null ? fromSeq.getFeatures()
+            .getPositionalFeatures() : fromSeq.getFeatures()
+            .getFeaturesByOntology(select);
+
      int count = 0;
-    SequenceFeature[] sfs = fromSeq.getSequenceFeatures();
-    if (sfs != null)
+    for (SequenceFeature sf : sfs)
      {
-      for (SequenceFeature sf : sfs)
+      String type = sf.getType();
+      boolean omit = false;
+      for (String toOmit : omitting)
        {
-        String type = sf.getType();
-        if (select != null && !so.isA(type, select))
-        {
-          continue;
-        }
-        boolean omit = false;
-        for (String toOmit : omitting)
-        {
-          if (type.equals(toOmit))
-          {
-            omit = true;
-          }
-        }
-        if (omit)
+        if (type.equals(toOmit))
          {
-          continue;
+          omit = true;
          }
+      }
+      if (omit)
+      {
+        continue;
+      }
  
-        /*
-         * locate the mapped range - null if either start or end is
-         * not mapped (no partial overlaps are calculated)
-         */
-        int start = sf.getBegin();
-        int end = sf.getEnd();
-        int[] mappedTo = mapping.locateInTo(start, end);
-        /*
-         * if whole exon range doesn't map, try interpreting it
-         * as 5' or 3' exon overlapping the CDS range
-         */
-        if (mappedTo == null)
-        {
-          mappedTo = mapping.locateInTo(end, end);
-          if (mappedTo != null)
-          {
-            /*
-             * end of exon is in CDS range - 5' overlap
-             * to a range from the start of the peptide
-             */
-            mappedTo[0] = 1;
-          }
-        }
-        if (mappedTo == null)
+      /*
+       * locate the mapped range - null if either start or end is
+       * not mapped (no partial overlaps are calculated)
+       */
+      int start = sf.getBegin();
+      int end = sf.getEnd();
+      int[] mappedTo = mapping.locateInTo(start, end);
+      /*
+       * if whole exon range doesn't map, try interpreting it
+       * as 5' or 3' exon overlapping the CDS range
+       */
+      if (mappedTo == null)
+      {
+        mappedTo = mapping.locateInTo(end, end);
+        if (mappedTo != null)
          {
-          mappedTo = mapping.locateInTo(start, start);
-          if (mappedTo != null)
-          {
-            /*
-             * start of exon is in CDS range - 3' overlap
-             * to a range up to the end of the peptide
-             */
-            mappedTo[1] = toSeq.getLength();
-          }
+          /*
+           * end of exon is in CDS range - 5' overlap
+           * to a range from the start of the peptide
+           */
+          mappedTo[0] = 1;
          }
+      }
+      if (mappedTo == null)
+      {
+        mappedTo = mapping.locateInTo(start, start);
          if (mappedTo != null)
          {
-          SequenceFeature copy = new SequenceFeature(sf);
-          copy.setBegin(Math.min(mappedTo[0], mappedTo[1]));
-          copy.setEnd(Math.max(mappedTo[0], mappedTo[1]));
-          copyTo.addSequenceFeature(copy);
-          count++;
+          /*
+           * start of exon is in CDS range - 3' overlap
+           * to a range up to the end of the peptide
+           */
+          mappedTo[1] = toSeq.getLength();
          }
        }
+      if (mappedTo != null)
+      {
+        int newBegin = Math.min(mappedTo[0], mappedTo[1]);
+        int newEnd = Math.max(mappedTo[0], mappedTo[1]);
+        SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
+                sf.getFeatureGroup(), sf.getScore());
+        copyTo.addSequenceFeature(copy);
+        count++;
+      }
      }
      return count;
    }
@@ -2062,7 +2190,7 @@ public class AlignmentUtils
      /*
       * dna length should map to protein (or protein plus stop codon)
       */
-    int codesForResidues = mappedDnaLength / 3;
+    int codesForResidues = mappedDnaLength / CODON_LENGTH;
      if (codesForResidues == (proteinLength + 1))
      {
        // assuming extra codon is for STOP and not in peptide
@@ -2071,7 +2199,7 @@ public class AlignmentUtils
      if (codesForResidues == proteinLength)
      {
        proteinRange.add(new int[] { proteinStart, proteinEnd });
-      return new MapList(ranges, proteinRange, 3, 1);
+      return new MapList(ranges, proteinRange, CODON_LENGTH, 1);
      }
      return null;
    }
@@ -2089,49 +2217,44 @@ public class AlignmentUtils
    public static List<int[]> findCdsPositions(SequenceI dnaSeq)
    {
      List<int[]> result = new ArrayList<int[]>();
-    SequenceFeature[] sfs = dnaSeq.getSequenceFeatures();
-    if (sfs == null)
+
+    List<SequenceFeature> sfs = dnaSeq.getFeatures().getFeaturesByOntology(
+            SequenceOntologyI.CDS);
+    if (sfs.isEmpty())
      {
        return result;
      }
-
-    SequenceOntologyI so = SequenceOntologyFactory.getInstance();
+    SequenceFeatures.sortFeatures(sfs, true);
      int startPhase = 0;
  
      for (SequenceFeature sf : sfs)
      {
+      int phase = 0;
+      try
+      {
+        phase = Integer.parseInt(sf.getPhase());
+      } catch (NumberFormatException e)
+      {
+        // ignore
+      }
        /*
-       * process a CDS feature (or a sub-type of CDS)
+       * phase > 0 on first codon means 5' incomplete - skip to the start
+       * of the next codon; example ENST00000496384
         */
-      if (so.isA(sf.getType(), SequenceOntologyI.CDS))
+      int begin = sf.getBegin();
+      int end = sf.getEnd();
+      if (result.isEmpty())
        {
-        int phase = 0;
-        try
-        {
-          phase = Integer.parseInt(sf.getPhase());
-        } catch (NumberFormatException e)
-        {
-          // ignore
-        }
-        /*
-         * phase > 0 on first codon means 5' incomplete - skip to the start
-         * of the next codon; example ENST00000496384
-         */
-        int begin = sf.getBegin();
-        int end = sf.getEnd();
-        if (result.isEmpty())
+        begin += phase;
+        if (begin > end)
          {
-          begin += phase;
-          if (begin > end)
-          {
-            // shouldn't happen!
-            System.err
-                    .println("Error: start phase extends beyond start CDS in "
-                            + dnaSeq.getName());
-          }
+          // shouldn't happen!
+          System.err
+                  .println("Error: start phase extends beyond start CDS in "
+                          + dnaSeq.getName());
          }
-        result.add(new int[] { begin, end });
        }
+      result.add(new int[] { begin, end });
      }
  
      /*
@@ -2151,14 +2274,7 @@ public class AlignmentUtils
       * ranges are assembled in order. Other cases should not use this method,
       * but instead construct an explicit mapping for CDS (e.g. EMBL parsing).
       */
-    Collections.sort(result, new Comparator<int[]>()
-    {
-      @Override
-      public int compare(int[] o1, int[] o2)
-      {
-        return Integer.compare(o1[0], o2[0]);
-      }
-    });
+    Collections.sort(result, IntRangeComparator.ASCENDING);
      return result;
    }
  
@@ -2211,24 +2327,6 @@ public class AlignmentUtils
        count += computePeptideVariants(peptide, peptidePos, codonVariants);
      }
  
-    /*
-     * sort to get sequence features in start position order
-     * - would be better to store in Sequence as a TreeSet or NCList?
-     */
-    if (peptide.getSequenceFeatures() != null)
-    {
-      Arrays.sort(peptide.getSequenceFeatures(),
-              new Comparator<SequenceFeature>()
-              {
-                @Override
-                public int compare(SequenceFeature o1, SequenceFeature o2)
-                {
-                  int c = Integer.compare(o1.getBegin(), o2.getBegin());
-                  return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd())
-                          : c;
-                }
-              });
-    }
      return count;
    }
  
@@ -2347,7 +2445,7 @@ public class AlignmentUtils
       * are currently ignored here
       */
      String trans = codon.contains("-") ? "-"
-            : (codon.length() > 3 ? null : ResidueProperties
+            : (codon.length() > CODON_LENGTH ? null : ResidueProperties
                      .codonTranslate(codon));
      if (trans != null && !trans.equals(residue))
      {
@@ -2356,10 +2454,9 @@ public class AlignmentUtils
        String trans3Char = StringUtils
                .toSentenceCase(ResidueProperties.aa2Triplet.get(trans));
        String desc = "p." + residue3Char + peptidePos + trans3Char;
-      // set score to 0f so 'graduated colour' option is offered! JAL-2060
        SequenceFeature sf = new SequenceFeature(
                SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos,
-              peptidePos, 0f, "Jalview");
+              peptidePos, var.getSource());
        StringBuilder attributes = new StringBuilder(32);
        String id = (String) var.variant.getValue(ID);
        if (id != null)
@@ -2370,11 +2467,13 @@ public class AlignmentUtils
          }
          sf.setValue(ID, id);
          attributes.append(ID).append("=").append(id);
-        // TODO handle other species variants
+        // TODO handle other species variants JAL-2064
          StringBuilder link = new StringBuilder(32);
          try
          {
-          link.append(desc).append(" ").append(id)
+          link.append(desc)
+                  .append(" ")
+                  .append(id)
                    .append("|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=")
                    .append(URLEncoder.encode(id, "UTF-8"));
            sf.addLink(link.toString());
@@ -2383,8 +2482,7 @@ public class AlignmentUtils
            // as if
          }
        }
-      String clinSig = (String) var.variant
-              .getValue(CLINICAL_SIGNIFICANCE);
+      String clinSig = (String) var.variant.getValue(CLINICAL_SIGNIFICANCE);
        if (clinSig != null)
        {
          sf.setValue(CLINICAL_SIGNIFICANCE, clinSig);
@@ -2409,6 +2507,7 @@ public class AlignmentUtils
     * @param dnaToProtein
     * @return
     */
+  @SuppressWarnings("unchecked")
    static LinkedHashMap<Integer, List<DnaVariant>[]> buildDnaVariantsMap(
            SequenceI dnaSeq, MapList dnaToProtein)
    {
@@ -2417,10 +2516,10 @@ public class AlignmentUtils
       * LinkedHashMap ensures we keep the peptide features in sequence order
       */
      LinkedHashMap<Integer, List<DnaVariant>[]> variants = new LinkedHashMap<Integer, List<DnaVariant>[]>();
-    SequenceOntologyI so = SequenceOntologyFactory.getInstance();
  
-    SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures();
-    if (dnaFeatures == null)
+    List<SequenceFeature> dnaFeatures = dnaSeq.getFeatures()
+            .getFeaturesByOntology(SequenceOntologyI.SEQUENCE_VARIANT);
+    if (dnaFeatures.isEmpty())
      {
        return variants;
      }
@@ -2440,84 +2539,80 @@ public class AlignmentUtils
          // not handling multi-locus variant features
          continue;
        }
-      if (so.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT))
+      int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
+      if (mapsTo == null)
        {
-        int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
-        if (mapsTo == null)
-        {
-          // feature doesn't lie within coding region
-          continue;
-        }
-        int peptidePosition = mapsTo[0];
-        List<DnaVariant>[] codonVariants = variants.get(peptidePosition);
-        if (codonVariants == null)
-        {
-          codonVariants = new ArrayList[3];
-          codonVariants[0] = new ArrayList<DnaVariant>();
-          codonVariants[1] = new ArrayList<DnaVariant>();
-          codonVariants[2] = new ArrayList<DnaVariant>();
-          variants.put(peptidePosition, codonVariants);
-        }
+        // feature doesn't lie within coding region
+        continue;
+      }
+      int peptidePosition = mapsTo[0];
+      List<DnaVariant>[] codonVariants = variants.get(peptidePosition);
+      if (codonVariants == null)
+      {
+        codonVariants = new ArrayList[CODON_LENGTH];
+        codonVariants[0] = new ArrayList<DnaVariant>();
+        codonVariants[1] = new ArrayList<DnaVariant>();
+        codonVariants[2] = new ArrayList<DnaVariant>();
+        variants.put(peptidePosition, codonVariants);
+      }
  
-        /*
-         * extract dna variants to a string array
-         */
-        String alls = (String) sf.getValue("alleles");
-        if (alls == null)
-        {
-          continue;
-        }
-        String[] alleles = alls.toUpperCase().split(",");
-        int i = 0;
-        for (String allele : alleles)
-        {
-          alleles[i++] = allele.trim(); // lose any space characters "A, G"
-        }
+      /*
+       * extract dna variants to a string array
+       */
+      String alls = (String) sf.getValue("alleles");
+      if (alls == null)
+      {
+        continue;
+      }
+      String[] alleles = alls.toUpperCase().split(",");
+      int i = 0;
+      for (String allele : alleles)
+      {
+        alleles[i++] = allele.trim(); // lose any space characters "A, G"
+      }
  
-        /*
-         * get this peptide's codon positions e.g. [3, 4, 5] or [4, 7, 10]
-         */
-        int[] codon = peptidePosition == lastPeptidePostion ? lastCodon
-                : MappingUtils.flattenRanges(dnaToProtein.locateInFrom(
-                        peptidePosition, peptidePosition));
-        lastPeptidePostion = peptidePosition;
-        lastCodon = codon;
+      /*
+       * get this peptide's codon positions e.g. [3, 4, 5] or [4, 7, 10]
+       */
+      int[] codon = peptidePosition == lastPeptidePostion ? lastCodon
+              : MappingUtils.flattenRanges(dnaToProtein.locateInFrom(
+                      peptidePosition, peptidePosition));
+      lastPeptidePostion = peptidePosition;
+      lastCodon = codon;
  
-        /*
-         * save nucleotide (and any variant) for each codon position
-         */
-        for (int codonPos = 0; codonPos < 3; codonPos++)
+      /*
+       * save nucleotide (and any variant) for each codon position
+       */
+      for (int codonPos = 0; codonPos < CODON_LENGTH; codonPos++)
+      {
+        String nucleotide = String.valueOf(
+                dnaSeq.getCharAt(codon[codonPos] - dnaStart)).toUpperCase();
+        List<DnaVariant> codonVariant = codonVariants[codonPos];
+        if (codon[codonPos] == dnaCol)
          {
-          String nucleotide = String.valueOf(
-                  dnaSeq.getCharAt(codon[codonPos] - dnaStart))
-                  .toUpperCase();
-          List<DnaVariant> codonVariant = codonVariants[codonPos];
-          if (codon[codonPos] == dnaCol)
+          if (!codonVariant.isEmpty()
+                  && codonVariant.get(0).variant == null)
            {
-            if (!codonVariant.isEmpty()
-                    && codonVariant.get(0).variant == null)
-            {
-              /*
-               * already recorded base value, add this variant
-               */
-              codonVariant.get(0).variant = sf;
-            }
-            else
-            {
-              /*
-               * add variant with base value
-               */
-              codonVariant.add(new DnaVariant(nucleotide, sf));
-            }
+            /*
+             * already recorded base value, add this variant
+             */
+            codonVariant.get(0).variant = sf;
            }
-          else if (codonVariant.isEmpty())
+          else
            {
              /*
-             * record (possibly non-varying) base value
+             * add variant with base value
               */
-            codonVariant.add(new DnaVariant(nucleotide));
+            codonVariant.add(new DnaVariant(nucleotide, sf));
            }
          }
+        else if (codonVariant.isEmpty())
+        {
+          /*
+           * record (possibly non-varying) base value
+           */
+          codonVariant.add(new DnaVariant(nucleotide));
+        }
        }
      }
      return variants;
@@ -2539,7 +2634,7 @@ public class AlignmentUtils
    {
      AlignmentI copy = new Alignment(new Alignment(seqs));
      copy.setDataset(dataset);
-
+    boolean isProtein = !copy.isNucleotide();
      SequenceIdMatcher matcher = new SequenceIdMatcher(seqs);
      if (xrefs != null)
      {
@@ -2550,7 +2645,8 @@ public class AlignmentUtils
          {
            for (DBRefEntry dbref : dbrefs)
            {
-            if (dbref.getMap() == null || dbref.getMap().getTo() == null)
+            if (dbref.getMap() == null || dbref.getMap().getTo() == null
+                    || dbref.getMap().getTo().isProtein() != isProtein)
              {
                continue;
              }
@@ -2631,7 +2727,7 @@ public class AlignmentUtils
            }
            newCol++;
          }
-        
+
          /*
           * trim trailing gaps
           */
@@ -2724,7 +2820,7 @@ public class AlignmentUtils
     * @param unmapped
     * @return
     */
-  static Map<Integer, Map<SequenceI, Character>> buildMappedColumnsMap(
+  static SortedMap<Integer, Map<SequenceI, Character>> buildMappedColumnsMap(
            AlignmentI unaligned, AlignmentI aligned, List<SequenceI> unmapped)
    {
      /*
@@ -2732,7 +2828,7 @@ public class AlignmentUtils
       * {unalignedSequence, characterPerSequence} at that position.
       * TreeMap keeps the entries in ascending column order. 
       */
-    Map<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
+    SortedMap<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
  
      /*
       * record any sequences that have no mapping so can't be realigned
@@ -2793,9 +2889,7 @@ public class AlignmentUtils
                .getInverse());
      }
  
-    char[] fromChars = fromSeq.getSequence();
      int toStart = seq.getStart();
-    char[] toChars = seq.getSequence();
  
      /*
       * traverse [start, end, start, end...] ranges in fromSeq
@@ -2826,10 +2920,10 @@ public class AlignmentUtils
           * of the next character of the mapped-to sequence; stop when all
           * the characters of the range have been counted
           */
-        while (mappedCharPos <= range[1] && fromCol <= fromChars.length
+        while (mappedCharPos <= range[1] && fromCol <= fromSeq.getLength()
                  && fromCol >= 0)
          {
-          if (!Comparison.isGap(fromChars[fromCol - 1]))
+          if (!Comparison.isGap(fromSeq.getCharAt(fromCol - 1)))
            {
              /*
               * mapped from sequence has a character in this column
@@ -2841,7 +2935,7 @@ public class AlignmentUtils
                seqsMap = new HashMap<SequenceI, Character>();
                map.put(fromCol, seqsMap);
              }
-            seqsMap.put(seq, toChars[mappedCharPos - toStart]);
+            seqsMap.put(seq, seq.getCharAt(mappedCharPos - toStart));
              mappedCharPos++;
            }
            fromCol += (forward ? 1 : -1);