JAL-3763 always make a new dataset sequence for CDS sequence
[jalview.git] / src / jalview / analysis / AlignmentUtils.java
index 55efaa5..d8c1cdf 100644 (file)
  */
 package jalview.analysis;
 
-import static jalview.io.gff.GffConstants.CLINICAL_SIGNIFICANCE;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.NoSuchElementException;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
 
+import jalview.bin.Cache;
 import jalview.commands.RemoveGapColCommand;
 import jalview.datamodel.AlignedCodon;
 import jalview.datamodel.AlignedCodonFrame;
@@ -38,7 +53,6 @@ import jalview.datamodel.SequenceFeature;
 import jalview.datamodel.SequenceGroup;
 import jalview.datamodel.SequenceI;
 import jalview.datamodel.features.SequenceFeatures;
-import jalview.io.gff.Gff3Helper;
 import jalview.io.gff.SequenceOntologyI;
 import jalview.schemes.ResidueProperties;
 import jalview.util.Comparison;
@@ -46,25 +60,6 @@ import jalview.util.DBRefUtils;
 import jalview.util.IntRangeComparator;
 import jalview.util.MapList;
 import jalview.util.MappingUtils;
-import jalview.util.StringUtils;
-
-import java.io.UnsupportedEncodingException;
-import java.net.URLEncoder;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.NoSuchElementException;
-import java.util.Set;
-import java.util.SortedMap;
-import java.util.TreeMap;
 
 /**
  * grab bag of useful alignment manipulation operations Expect these to be
@@ -966,11 +961,12 @@ public class AlignmentUtils
             .findMappingsForSequence(cdsSeq, mappings);
     for (AlignedCodonFrame mapping : dnaMappings)
     {
-      SequenceI peptide = mapping.findAlignedSequence(cdsSeq, protein);
+      List<SequenceToSequenceMapping> foundMap=new ArrayList<>();
+      SequenceI peptide = mapping.findAlignedSequence(cdsSeq, protein,foundMap);
       if (peptide != null)
       {
         final int peptideLength = peptide.getLength();
-        Mapping map = mapping.getMappingBetween(cdsSeq, peptide);
+        Mapping map = foundMap.get(0).getMapping();
         if (map != null)
         {
           MapList mapList = map.getMap();
@@ -1737,15 +1733,8 @@ public class AlignmentUtils
 
           cdsSeqs.add(cdsSeq);
 
-          if (!dataset.getSequences().contains(cdsSeqDss))
-          {
-            // check if this sequence is a newly created one
-            // so needs adding to the dataset
-            dataset.addSequence(cdsSeqDss);
-          }
-
           /*
-           * add a mapping from CDS to the (unchanged) mapped to range
+           * build the mapping from CDS to protein
            */
           List<int[]> cdsRange = Collections
                   .singletonList(new int[]
@@ -1754,16 +1743,26 @@ public class AlignmentUtils
           MapList cdsToProteinMap = new MapList(cdsRange,
                   mapList.getToRanges(), mapList.getFromRatio(),
                   mapList.getToRatio());
-          AlignedCodonFrame cdsToProteinMapping = new AlignedCodonFrame();
-          cdsToProteinMapping.addMap(cdsSeqDss, proteinProduct,
-                  cdsToProteinMap);
 
-          /*
-           * guard against duplicating the mapping if repeating this action
-           */
-          if (!mappings.contains(cdsToProteinMapping))
+          if (!dataset.getSequences().contains(cdsSeqDss))
           {
-            mappings.add(cdsToProteinMapping);
+            /*
+             * if this sequence is a newly created one, add it to the dataset
+             * and made a CDS to protein mapping (if sequence already exists,
+             * CDS-to-protein mapping _is_ the transcript-to-protein mapping)
+             */
+            dataset.addSequence(cdsSeqDss);
+            AlignedCodonFrame cdsToProteinMapping = new AlignedCodonFrame();
+            cdsToProteinMapping.addMap(cdsSeqDss, proteinProduct,
+                  cdsToProteinMap);
+
+            /*
+             * guard against duplicating the mapping if repeating this action
+             */
+            if (!mappings.contains(cdsToProteinMapping))
+            {
+              mappings.add(cdsToProteinMapping);
+            }
           }
 
           propagateDBRefsToCDS(cdsSeqDss, dnaSeq.getDatasetSequence(),
@@ -1997,45 +1996,31 @@ public class AlignmentUtils
 
     SequenceI newSeq = null;
 
-    final MapList maplist = mapping.getMap();
-    if (maplist.isContiguous() && maplist.isFromForwardStrand())
-    {
-      /*
-       * just a subsequence, keep same dataset sequence
-       */
-      int start = maplist.getFromLowest();
-      int end = maplist.getFromHighest();
-      newSeq = seq.getSubSequence(start - 1, end);
-      newSeq.setName(seqId);
-    }
-    else
-    {
-      /*
-       * construct by splicing mapped from ranges
-       */
-      char[] seqChars = seq.getSequence();
-      List<int[]> fromRanges = maplist.getFromRanges();
-      int cdsWidth = MappingUtils.getLength(fromRanges);
-      char[] newSeqChars = new char[cdsWidth];
+    /*
+     * construct CDS sequence by splicing mapped from ranges
+     */
+    char[] seqChars = seq.getSequence();
+    List<int[]> fromRanges = mapping.getMap().getFromRanges();
+    int cdsWidth = MappingUtils.getLength(fromRanges);
+    char[] newSeqChars = new char[cdsWidth];
 
-      int newPos = 0;
-      for (int[] range : fromRanges)
+    int newPos = 0;
+    for (int[] range : fromRanges)
+    {
+      if (range[0] <= range[1])
       {
-        if (range[0] <= range[1])
+        // forward strand mapping - just copy the range
+        int length = range[1] - range[0] + 1;
+        System.arraycopy(seqChars, range[0] - 1, newSeqChars, newPos,
+                length);
+        newPos += length;
+      }
+      else
+      {
+        // reverse strand mapping - copy and complement one by one
+        for (int i = range[0]; i >= range[1]; i--)
         {
-          // forward strand mapping - just copy the range
-          int length = range[1] - range[0] + 1;
-          System.arraycopy(seqChars, range[0] - 1, newSeqChars, newPos,
-                  length);
-          newPos += length;
-        }
-        else
-        {
-          // reverse strand mapping - copy and complement one by one
-          for (int i = range[0]; i >= range[1]; i--)
-          {
-            newSeqChars[newPos++] = Dna.getComplement(seqChars[i - 1]);
-          }
+          newSeqChars[newPos++] = Dna.getComplement(seqChars[i - 1]);
         }
       }
 
@@ -2069,9 +2054,8 @@ public class AlignmentUtils
           }
           else
           {
-            System.err.println(
-                    "JAL-2154 regression: warning - found (and ignnored a duplicate CDS sequence):"
-                            + mtch.toString());
+            Cache.log.error(
+                    "JAL-2154 regression: warning - found (and ignored) a duplicate CDS sequence:" + mtch.toString());
           }
         }
       }
@@ -2389,97 +2373,6 @@ public class AlignmentUtils
   }
 
   /**
-   * Helper method that adds a peptide variant feature. ID and
-   * clinical_significance attributes of the dna variant (if present) are copied
-   * to the new feature.
-   * 
-   * @param peptide
-   * @param peptidePos
-   * @param residue
-   * @param var
-   * @param codon
-   *          the variant codon e.g. aCg
-   * @param canonical
-   *          the 'normal' codon e.g. aTg
-   * @return true if a feature was added, else false
-   */
-  static boolean addPeptideVariant(SequenceI peptide, int peptidePos,
-          String residue, DnaVariant var, String codon, String canonical)
-  {
-    /*
-     * get peptide translation of codon e.g. GAT -> D
-     * note that variants which are not single alleles,
-     * e.g. multibase variants or HGMD_MUTATION etc
-     * are currently ignored here
-     */
-    String trans = codon.contains("-") ? null
-            : (codon.length() > CODON_LENGTH ? null
-                    : ResidueProperties.codonTranslate(codon));
-    if (trans == null)
-    {
-      return false;
-    }
-    String desc = canonical + "/" + codon;
-    String featureType = "";
-    if (trans.equals(residue))
-    {
-      featureType = SequenceOntologyI.SYNONYMOUS_VARIANT;
-    }
-    else if (ResidueProperties.STOP.equals(trans))
-    {
-      featureType = SequenceOntologyI.STOP_GAINED;
-    }
-    else
-    {
-      String residue3Char = StringUtils
-              .toSentenceCase(ResidueProperties.aa2Triplet.get(residue));
-      String trans3Char = StringUtils
-              .toSentenceCase(ResidueProperties.aa2Triplet.get(trans));
-      desc = "p." + residue3Char + peptidePos + trans3Char;
-      featureType = SequenceOntologyI.NONSYNONYMOUS_VARIANT;
-    }
-    SequenceFeature sf = new SequenceFeature(featureType, desc, peptidePos,
-            peptidePos, var.getSource());
-
-    StringBuilder attributes = new StringBuilder(32);
-    String id = (String) var.variant.getValue(VARIANT_ID);
-    if (id != null)
-    {
-      if (id.startsWith(SEQUENCE_VARIANT))
-      {
-        id = id.substring(SEQUENCE_VARIANT.length());
-      }
-      sf.setValue(VARIANT_ID, id);
-      attributes.append(VARIANT_ID).append("=").append(id);
-      // TODO handle other species variants JAL-2064
-      StringBuilder link = new StringBuilder(32);
-      try
-      {
-        link.append(desc).append(" ").append(id).append(
-                "|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=")
-                .append(URLEncoder.encode(id, "UTF-8"));
-        sf.addLink(link.toString());
-      } catch (UnsupportedEncodingException e)
-      {
-        // as if
-      }
-    }
-    String clinSig = (String) var.variant.getValue(CLINICAL_SIGNIFICANCE);
-    if (clinSig != null)
-    {
-      sf.setValue(CLINICAL_SIGNIFICANCE, clinSig);
-      attributes.append(";").append(CLINICAL_SIGNIFICANCE).append("=")
-              .append(clinSig);
-    }
-    peptide.addSequenceFeature(sf);
-    if (attributes.length() > 0)
-    {
-      sf.setAttributes(attributes.toString());
-    }
-    return true;
-  }
-
-  /**
    * Makes an alignment with a copy of the given sequences, adding in any
    * non-redundant sequences which are mapped to by the cross-referenced
    * sequences.
@@ -2667,6 +2560,13 @@ public class AlignmentUtils
     {
       List<SequenceI> alignedSequences = alignedDatasets
               .get(seq.getDatasetSequence());
+      if (alignedSequences.isEmpty())
+      {
+        /*
+         * defensive check - shouldn't happen! (JAL-3536)
+         */
+        continue;
+      }
       SequenceI alignedSeq = alignedSequences.get(0);
 
       /*