Merge branch 'develop' into features/JAL-2360colourSchemeApplicability

[jalview.git] / src / jalview / analysis / AlignmentUtils.java
diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java

index cc80384..ea96b3b 100644 (file)
--- a/src/jalview/analysis/AlignmentUtils.java
+++ b/src/jalview/analysis/AlignmentUtils.java
@@ -60,6 +60,7 @@ import java.util.Map;
  import java.util.Map.Entry;
  import java.util.NoSuchElementException;
  import java.util.Set;
+import java.util.SortedMap;
  import java.util.TreeMap;
  
  /**
@@ -72,22 +73,26 @@ import java.util.TreeMap;
  public class AlignmentUtils
  {
  
+  private static final int CODON_LENGTH = 3;
+
    private static final String SEQUENCE_VARIANT = "sequence_variant:";
+
    private static final String ID = "ID";
  
    /**
     * A data model to hold the 'normal' base value at a position, and an optional
     * sequence variant feature
     */
-  static class DnaVariant
+  static final class DnaVariant
    {
-    String base;
+    final String base;
  
      SequenceFeature variant;
  
      DnaVariant(String nuc)
      {
        base = nuc;
+      variant = null;
      }
  
      DnaVariant(String nuc, SequenceFeature var)
@@ -95,6 +100,11 @@ public class AlignmentUtils
        base = nuc;
        variant = var;
      }
+
+    public String getSource()
+    {
+      return variant == null ? null : variant.getFeatureGroup();
+    }
    }
  
    /**
@@ -427,7 +437,7 @@ public class AlignmentUtils
      /*
       * cdnaStart/End, proteinStartEnd are base 1 (for dataset sequence mapping)
       */
-    final int mappedLength = 3 * aaSeqChars.length;
+    final int mappedLength = CODON_LENGTH * aaSeqChars.length;
      int cdnaLength = cdnaSeqChars.length;
      int cdnaStart = cdnaSeq.getStart();
      int cdnaEnd = cdnaSeq.getEnd();
@@ -439,14 +449,14 @@ public class AlignmentUtils
       */
      if (cdnaLength != mappedLength && cdnaLength > 2)
      {
-      String lastCodon = String.valueOf(cdnaSeqChars, cdnaLength - 3, 3)
-              .toUpperCase();
+      String lastCodon = String.valueOf(cdnaSeqChars,
+              cdnaLength - CODON_LENGTH, CODON_LENGTH).toUpperCase();
        for (String stop : ResidueProperties.STOP)
        {
          if (lastCodon.equals(stop))
          {
-          cdnaEnd -= 3;
-          cdnaLength -= 3;
+          cdnaEnd -= CODON_LENGTH;
+          cdnaLength -= CODON_LENGTH;
            break;
          }
        }
@@ -458,12 +468,12 @@ public class AlignmentUtils
      int startOffset = 0;
      if (cdnaLength != mappedLength
              && cdnaLength > 2
-            && String.valueOf(cdnaSeqChars, 0, 3).toUpperCase()
+            && String.valueOf(cdnaSeqChars, 0, CODON_LENGTH).toUpperCase()
                      .equals(ResidueProperties.START))
      {
-      startOffset += 3;
-      cdnaStart += 3;
-      cdnaLength -= 3;
+      startOffset += CODON_LENGTH;
+      cdnaStart += CODON_LENGTH;
+      cdnaLength -= CODON_LENGTH;
      }
  
      if (translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
@@ -472,7 +482,7 @@ public class AlignmentUtils
         * protein is translation of dna (+/- start/stop codons)
         */
        MapList map = new MapList(new int[] { cdnaStart, cdnaEnd }, new int[]
-      { proteinStart, proteinEnd }, 3, 1);
+      { proteinStart, proteinEnd }, CODON_LENGTH, 1);
        return map;
      }
  
@@ -502,10 +512,9 @@ public class AlignmentUtils
  
      int aaPos = 0;
      int dnaPos = cdnaStart;
-    for (; dnaPos < cdnaSeqChars.length - 2
-            && aaPos < aaSeqChars.length; dnaPos += 3, aaPos++)
+    for (; dnaPos < cdnaSeqChars.length - 2 && aaPos < aaSeqChars.length; dnaPos += CODON_LENGTH, aaPos++)
      {
-      String codon = String.valueOf(cdnaSeqChars, dnaPos, 3);
+      String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
        final String translated = ResidueProperties.codonTranslate(codon);
  
        /*
@@ -541,9 +550,9 @@ public class AlignmentUtils
      {
        return true;
      }
-    if (dnaPos == cdnaSeqChars.length - 3)
+    if (dnaPos == cdnaSeqChars.length - CODON_LENGTH)
      {
-      String codon = String.valueOf(cdnaSeqChars, dnaPos, 3);
+      String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
        if ("STOP".equals(ResidueProperties.codonTranslate(codon)))
        {
          return true;
@@ -894,7 +903,8 @@ public class AlignmentUtils
        }
        width = Math.max(dnaSeq.getLength(), width);
      }
-    int oldwidth, diff;
+    int oldwidth;
+    int diff;
      for (SequenceI dnaSeq : dna.getSequences())
      {
        oldwidth = dnaSeq.getLength();
@@ -928,15 +938,15 @@ public class AlignmentUtils
                .println("alignCdsSequenceAsProtein needs aligned sequence!");
        return false;
      }
-    
+
      List<AlignedCodonFrame> dnaMappings = MappingUtils
              .findMappingsForSequence(cdsSeq, mappings);
      for (AlignedCodonFrame mapping : dnaMappings)
      {
        SequenceI peptide = mapping.findAlignedSequence(cdsSeq, protein);
-      int peptideLength = peptide.getLength();
        if (peptide != null)
        {
+        int peptideLength = peptide.getLength();
          Mapping map = mapping.getMappingBetween(cdsSeq, peptide);
          if (map != null)
          {
@@ -950,7 +960,8 @@ public class AlignmentUtils
                    .getFromRanges());
            int mappedToLength = MappingUtils
                    .getLength(mapList.getToRanges());
-          boolean addStopCodon = (cdsLength == mappedFromLength * 3 + 3)
+          boolean addStopCodon = (cdsLength == mappedFromLength
+                  * CODON_LENGTH + CODON_LENGTH)
                    || (peptide.getDatasetSequence().getLength() == mappedFromLength - 1);
            if (cdsLength != mappedToLength && !addStopCodon)
            {
@@ -964,8 +975,8 @@ public class AlignmentUtils
            /*
             * pre-fill the aligned cds sequence with gaps
             */
-          char[] alignedCds = new char[peptideLength * 3
-                  + (addStopCodon ? 3 : 0)];
+          char[] alignedCds = new char[peptideLength * CODON_LENGTH
+                  + (addStopCodon ? CODON_LENGTH : 0)];
            Arrays.fill(alignedCds, gapChar);
  
            /*
@@ -982,7 +993,7 @@ public class AlignmentUtils
            {
              if (Comparison.isGap(residue))
              {
-              cdsCol += 3;
+              cdsCol += CODON_LENGTH;
              }
              else
              {
@@ -991,7 +1002,7 @@ public class AlignmentUtils
                if (codon == null)
                {
                  // e.g. incomplete start codon, X in peptide
-                cdsCol += 3;
+                cdsCol += CODON_LENGTH;
                }
                else
                {
@@ -1009,7 +1020,7 @@ public class AlignmentUtils
             * append stop codon if not mapped from protein,
             * closing it up to the end of the mapped sequence
             */
-          if (copiedBases == nucleotides.length - 3)
+          if (copiedBases == nucleotides.length - CODON_LENGTH)
            {
              for (int i = alignedCds.length - 1; i >= 0; i--)
              {
@@ -1019,7 +1030,7 @@ public class AlignmentUtils
                  break;
                }
              }
-            for (int i = nucleotides.length - 3; i < nucleotides.length; i++)
+            for (int i = nucleotides.length - CODON_LENGTH; i < nucleotides.length; i++)
              {
                alignedCds[cdsCol++] = nucleotides[i];
              }
@@ -1089,7 +1100,7 @@ public class AlignmentUtils
      // TODO resolve JAL-2022 so this fudge can be removed
      int mappedSequenceCount = protein.getHeight() - unmappedProtein.size();
      addUnmappedPeptideStarts(alignedCodons, mappedSequenceCount);
-    
+
      return alignedCodons;
    }
  
@@ -1680,15 +1691,20 @@ public class AlignmentUtils
             * didn't find mapped CDS sequence - construct it and add
             * its dataset sequence to the dataset
             */
-          cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping);
+          cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping,
+                  dataset).deriveSequence();
            // cdsSeq has a name constructed as CDS|<dbref>
            // <dbref> will be either the accession for the coding sequence,
            // marked in the /via/ dbref to the protein product accession
            // or it will be the original nucleotide accession.
-          SequenceI cdsSeqDss = cdsSeq.createDatasetSequence();
+          SequenceI cdsSeqDss = cdsSeq.getDatasetSequence();
+
            cdsSeqs.add(cdsSeq);
+
            if (!dataset.getSequences().contains(cdsSeqDss))
            {
+            // the dataset does not yet hold this sequence (it was newly
+            // created), so it needs adding to the dataset
              dataset.addSequence(cdsSeqDss);
            }
  
@@ -1697,8 +1713,9 @@ public class AlignmentUtils
             */
            List<int[]> cdsRange = Collections.singletonList(new int[] { 1,
                cdsSeq.getLength() });
-          MapList cdsToProteinMap = new MapList(cdsRange, mapList.getToRanges(),
-                  mapList.getFromRatio(), mapList.getToRatio());
+          MapList cdsToProteinMap = new MapList(cdsRange,
+                  mapList.getToRanges(), mapList.getFromRatio(),
+                  mapList.getToRatio());
            AlignedCodonFrame cdsToProteinMapping = new AlignedCodonFrame();
            cdsToProteinMapping.addMap(cdsSeqDss, proteinProduct,
                    cdsToProteinMap);
@@ -1718,8 +1735,7 @@ public class AlignmentUtils
             */
            AlignedCodonFrame dnaToCdsMapping = new AlignedCodonFrame();
            MapList dnaToCdsMap = new MapList(mapList.getFromRanges(),
-                  cdsRange, 1,
-                  1);
+                  cdsRange, 1, 1);
            dnaToCdsMapping.addMap(dnaSeq.getDatasetSequence(), cdsSeqDss,
                    dnaToCdsMap);
            if (!mappings.contains(dnaToCdsMapping))
@@ -1734,16 +1750,16 @@ public class AlignmentUtils
             * same source and accession, so need a different accession for
             * the CDS from the dna sequence
             */
-          
+
            // specific use case:
            // Genomic contig ENSCHR:1, contains coding regions for ENSG01,
            // ENSG02, ENSG03, with transcripts and products similarly named.
            // cannot add distinct dbrefs mapping location on ENSCHR:1 to ENSG01
-          
+
            // JBPNote: ?? can't actually create an example that demonstrates we
            // need to
            // synthesize an xref.
-          
+
            for (DBRefEntry primRef : dnaDss.getPrimaryDBRefs())
            {
              // creates a complementary cross-reference to the source sequence's
@@ -1773,7 +1789,7 @@ public class AlignmentUtils
            /*
             * transfer any features on dna that overlap the CDS
             */
-          transferFeatures(dnaSeq, cdsSeq, cdsToProteinMap, null,
+          transferFeatures(dnaSeq, cdsSeq, dnaToCdsMap, null,
                    SequenceOntologyI.CDS);
          }
        }
@@ -1820,7 +1836,8 @@ public class AlignmentUtils
      int mappedFromLength = MappingUtils.getLength(aMapping.getMap()
              .getFromRanges());
      int dnaLength = seqDss.getLength();
-    if (mappedFromLength == dnaLength || mappedFromLength == dnaLength - 3)
+    if (mappedFromLength == dnaLength
+            || mappedFromLength == dnaLength - CODON_LENGTH)
      {
        return seqDss;
      }
@@ -1836,7 +1853,8 @@ public class AlignmentUtils
        for (SequenceToSequenceMapping map : acf.getMappings())
        {
          Mapping mapping = map.getMapping();
-        if (mapping != aMapping && mapping.getMap().getFromRatio() == 3
+        if (mapping != aMapping
+                && mapping.getMap().getFromRatio() == CODON_LENGTH
                  && proteinProduct == mapping.getTo()
                  && seqDss != map.getFromSeq())
          {
@@ -1870,9 +1888,14 @@ public class AlignmentUtils
     * 
     * @param seq
     * @param mapping
+   * @param dataset
+   *          - existing dataset. We check it for a sequence that looks like
+   *          the CDS we are about to construct; if one exists already, that
+   *          sequence is returned instead of the newly constructed one.
     * @return CDS sequence (as a dataset sequence)
     */
-  static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping)
+  static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping,
+          AlignmentI dataset)
    {
      char[] seqChars = seq.getSequence();
      List<int[]> fromRanges = mapping.getMap().getFromRanges();
@@ -1899,7 +1922,7 @@ public class AlignmentUtils
          }
        }
      }
-    
+
      /*
       * assign 'from id' held in the mapping if set (e.g. EMBL protein_id),
       * else generate a sequence name
@@ -1907,6 +1930,40 @@ public class AlignmentUtils
      String mapFromId = mapping.getMappedFromId();
      String seqId = "CDS|" + (mapFromId != null ? mapFromId : seq.getName());
      SequenceI newSeq = new Sequence(seqId, newSeqChars, 1, newPos);
+    if (dataset != null)
+    {
+      SequenceI[] matches = dataset.findSequenceMatch(newSeq.getName());
+      if (matches != null)
+      {
+        boolean matched = false;
+        for (SequenceI mtch : matches)
+        {
+          if (mtch.getStart() != newSeq.getStart())
+          {
+            continue;
+          }
+          if (mtch.getEnd() != newSeq.getEnd())
+          {
+            continue;
+          }
+          if (!Arrays.equals(mtch.getSequence(), newSeq.getSequence()))
+          {
+            continue;
+          }
+          if (!matched)
+          {
+            matched = true;
+            newSeq = mtch;
+          }
+          else
+          {
+            System.err
+                    .println("JAL-2154 regression: warning - found (and ignnored a duplicate CDS sequence):"
+                            + mtch.toString());
+          }
+        }
+      }
+    }
      // newSeq.setDescription(mapFromId);
  
      return newSeq;
@@ -2119,7 +2176,7 @@ public class AlignmentUtils
      /*
       * dna length should map to protein (or protein plus stop codon)
       */
-    int codesForResidues = mappedDnaLength / 3;
+    int codesForResidues = mappedDnaLength / CODON_LENGTH;
      if (codesForResidues == (proteinLength + 1))
      {
        // assuming extra codon is for STOP and not in peptide
@@ -2128,7 +2185,7 @@ public class AlignmentUtils
      if (codesForResidues == proteinLength)
      {
        proteinRange.add(new int[] { proteinStart, proteinEnd });
-      return new MapList(ranges, proteinRange, 3, 1);
+      return new MapList(ranges, proteinRange, CODON_LENGTH, 1);
      }
      return null;
    }
@@ -2404,7 +2461,7 @@ public class AlignmentUtils
       * are currently ignored here
       */
      String trans = codon.contains("-") ? "-"
-            : (codon.length() > 3 ? null : ResidueProperties
+            : (codon.length() > CODON_LENGTH ? null : ResidueProperties
                      .codonTranslate(codon));
      if (trans != null && !trans.equals(residue))
      {
@@ -2416,7 +2473,7 @@ public class AlignmentUtils
        // set score to 0f so 'graduated colour' option is offered! JAL-2060
        SequenceFeature sf = new SequenceFeature(
                SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos,
-              peptidePos, 0f, "Jalview");
+              peptidePos, 0f, var.getSource());
        StringBuilder attributes = new StringBuilder(32);
        String id = (String) var.variant.getValue(ID);
        if (id != null)
@@ -2427,11 +2484,13 @@ public class AlignmentUtils
          }
          sf.setValue(ID, id);
          attributes.append(ID).append("=").append(id);
-        // TODO handle other species variants
+        // TODO handle other species variants JAL-2064
          StringBuilder link = new StringBuilder(32);
          try
          {
-          link.append(desc).append(" ").append(id)
+          link.append(desc)
+                  .append(" ")
+                  .append(id)
                    .append("|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=")
                    .append(URLEncoder.encode(id, "UTF-8"));
            sf.addLink(link.toString());
@@ -2440,8 +2499,7 @@ public class AlignmentUtils
            // as if
          }
        }
-      String clinSig = (String) var.variant
-              .getValue(CLINICAL_SIGNIFICANCE);
+      String clinSig = (String) var.variant.getValue(CLINICAL_SIGNIFICANCE);
        if (clinSig != null)
        {
          sf.setValue(CLINICAL_SIGNIFICANCE, clinSig);
@@ -2466,6 +2524,7 @@ public class AlignmentUtils
     * @param dnaToProtein
     * @return
     */
+  @SuppressWarnings("unchecked")
    static LinkedHashMap<Integer, List<DnaVariant>[]> buildDnaVariantsMap(
            SequenceI dnaSeq, MapList dnaToProtein)
    {
@@ -2509,7 +2568,7 @@ public class AlignmentUtils
          List<DnaVariant>[] codonVariants = variants.get(peptidePosition);
          if (codonVariants == null)
          {
-          codonVariants = new ArrayList[3];
+          codonVariants = new ArrayList[CODON_LENGTH];
            codonVariants[0] = new ArrayList<DnaVariant>();
            codonVariants[1] = new ArrayList<DnaVariant>();
            codonVariants[2] = new ArrayList<DnaVariant>();
@@ -2543,7 +2602,7 @@ public class AlignmentUtils
          /*
           * save nucleotide (and any variant) for each codon position
           */
-        for (int codonPos = 0; codonPos < 3; codonPos++)
+        for (int codonPos = 0; codonPos < CODON_LENGTH; codonPos++)
          {
            String nucleotide = String.valueOf(
                    dnaSeq.getCharAt(codon[codonPos] - dnaStart))
@@ -2596,7 +2655,7 @@ public class AlignmentUtils
    {
      AlignmentI copy = new Alignment(new Alignment(seqs));
      copy.setDataset(dataset);
-
+    boolean isProtein = !copy.isNucleotide();
      SequenceIdMatcher matcher = new SequenceIdMatcher(seqs);
      if (xrefs != null)
      {
@@ -2607,7 +2666,8 @@ public class AlignmentUtils
          {
            for (DBRefEntry dbref : dbrefs)
            {
-            if (dbref.getMap() == null || dbref.getMap().getTo() == null)
+            if (dbref.getMap() == null || dbref.getMap().getTo() == null
+                    || dbref.getMap().getTo().isProtein() != isProtein)
              {
                continue;
              }
@@ -2688,7 +2748,7 @@ public class AlignmentUtils
            }
            newCol++;
          }
-        
+
          /*
           * trim trailing gaps
           */
@@ -2781,7 +2841,7 @@ public class AlignmentUtils
     * @param unmapped
     * @return
     */
-  static Map<Integer, Map<SequenceI, Character>> buildMappedColumnsMap(
+  static SortedMap<Integer, Map<SequenceI, Character>> buildMappedColumnsMap(
            AlignmentI unaligned, AlignmentI aligned, List<SequenceI> unmapped)
    {
      /*
@@ -2789,7 +2849,7 @@ public class AlignmentUtils
       * {unalignedSequence, characterPerSequence} at that position.
       * TreeMap keeps the entries in ascending column order. 
       */
-    Map<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
+    SortedMap<Integer, Map<SequenceI, Character>> map = new TreeMap<Integer, Map<SequenceI, Character>>();
  
      /*
       * record any sequences that have no mapping so can't be realigned