X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fanalysis%2FAlignmentUtils.java;h=8e0335fee4a161b0414377a99fe60992228f6f12;hb=5c6564f903f75960af960720a8635ab8709afc37;hp=aa1a98b98dd4a91d7f2811e89da76b9997b29a09;hpb=241bd0223b016b5ad5ec78520310a8de32842722;p=jalview.git

diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java
index aa1a98b..8e0335f 100644
--- a/src/jalview/analysis/AlignmentUtils.java
+++ b/src/jalview/analysis/AlignmentUtils.java
@@ -22,6 +22,7 @@ package jalview.analysis;
 
 import static jalview.io.gff.GffConstants.CLINICAL_SIGNIFICANCE;
 
+import jalview.commands.RemoveGapColCommand;
 import jalview.datamodel.AlignedCodon;
 import jalview.datamodel.AlignedCodonFrame;
 import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
@@ -74,12 +75,15 @@ import java.util.TreeMap;
  */
 public class AlignmentUtils
 {
-
   private static final int CODON_LENGTH = 3;
 
   private static final String SEQUENCE_VARIANT = "sequence_variant:";
 
-  private static final String ID = "ID";
+  /*
+   * the 'id' attribute is provided for variant features fetched from
+   * Ensembl using its REST service with JSON format 
+   */
+  public static final String VARIANT_ID = "id";
 
   /**
    * A data model to hold the 'normal' base value at a position, and an optional
@@ -1743,8 +1747,10 @@ public class AlignmentUtils
           /*
            * add a mapping from CDS to the (unchanged) mapped to range
            */
-          List<int[]> cdsRange = Collections.singletonList(new int[] { 1,
-              cdsSeq.getLength() });
+          List<int[]> cdsRange = Collections
+                  .singletonList(new int[]
+                  { cdsSeq.getStart(),
+                      cdsSeq.getLength() + cdsSeq.getStart() - 1 });
           MapList cdsToProteinMap = new MapList(cdsRange,
                   mapList.getToRanges(), mapList.getFromRatio(),
                   mapList.getToRatio());
@@ -1867,7 +1873,7 @@ public class AlignmentUtils
       return;
     }
 
-    MapList newMap = targetToFrom.traverse(fromLoci.getMap());
+    MapList newMap = targetToFrom.traverse(fromLoci.getMapping());
 
     if (newMap != null)
     {
@@ -1888,7 +1894,7 @@ public class AlignmentUtils
    * @param seqMappings
    *          the set of mappings involving dnaSeq
    * @param aMapping
-   *          an initial candidate from seqMappings
+   *          a transcript-to-peptide mapping
    * @return
    */
   static SequenceI findCdsForProtein(List<AlignedCodonFrame> mappings,
@@ -1913,7 +1919,15 @@ public class AlignmentUtils
     if (mappedFromLength == dnaLength
             || mappedFromLength == dnaLength - CODON_LENGTH)
     {
-      return seqDss;
+      /*
+       * if sequence has CDS features, this is a transcript with no UTR
+       * - do not take this as the CDS sequence! (JAL-2789)
+       */
+      if (seqDss.getFeatures().getFeaturesByOntology(SequenceOntologyI.CDS)
+              .isEmpty())
+      {
+        return seqDss;
+      }
     }
 
     /*
@@ -1938,10 +1952,12 @@ public class AlignmentUtils
           {
             /*
             * found a 3:1 mapping to the protein product which covers
-            * the whole dna sequence i.e. is from CDS; finally check it
-            * is from the dna start sequence
+            * the whole dna sequence i.e. is from CDS; finally check the CDS
+            * is mapped from the given dna start sequence
             */
             SequenceI cdsSeq = map.getFromSeq();
+            // todo this test is weak if seqMappings contains multiple mappings;
+            // we get away with it if transcript:cds relationship is 1:1
             List<AlignedCodonFrame> dnaToCdsMaps = MappingUtils
                     .findMappingsForSequence(cdsSeq, seqMappings);
             if (!dnaToCdsMaps.isEmpty())
@@ -1971,39 +1987,61 @@ public class AlignmentUtils
   static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping,
           AlignmentI dataset)
   {
-    char[] seqChars = seq.getSequence();
-    List<int[]> fromRanges = mapping.getMap().getFromRanges();
-    int cdsWidth = MappingUtils.getLength(fromRanges);
-    char[] newSeqChars = new char[cdsWidth];
+    /*
+     * construct CDS sequence name as "CDS|" with 'from id' held in the mapping
+     * if set (e.g. EMBL protein_id), else sequence name appended
+     */
+    String mapFromId = mapping.getMappedFromId();
+    final String seqId = "CDS|"
+            + (mapFromId != null ? mapFromId : seq.getName());
 
-    int newPos = 0;
-    for (int[] range : fromRanges)
+    SequenceI newSeq = null;
+
+    final MapList maplist = mapping.getMap();
+    if (maplist.isContiguous() && maplist.isFromForwardStrand())
     {
-      if (range[0] <= range[1])
-      {
-        // forward strand mapping - just copy the range
-        int length = range[1] - range[0] + 1;
-        System.arraycopy(seqChars, range[0] - 1, newSeqChars, newPos,
-                length);
-        newPos += length;
-      }
-      else
+      /*
+       * just a subsequence, keep same dataset sequence
+       */
+      int start = maplist.getFromLowest();
+      int end = maplist.getFromHighest();
+      newSeq = seq.getSubSequence(start - 1, end);
+      newSeq.setName(seqId);
+    }
+    else
+    {
+      /*
+       * construct by splicing mapped from ranges
+       */
+      char[] seqChars = seq.getSequence();
+      List<int[]> fromRanges = maplist.getFromRanges();
+      int cdsWidth = MappingUtils.getLength(fromRanges);
+      char[] newSeqChars = new char[cdsWidth];
+
+      int newPos = 0;
+      for (int[] range : fromRanges)
       {
-        // reverse strand mapping - copy and complement one by one
-        for (int i = range[0]; i >= range[1]; i--)
+        if (range[0] <= range[1])
         {
-          newSeqChars[newPos++] = Dna.getComplement(seqChars[i - 1]);
+          // forward strand mapping - just copy the range
+          int length = range[1] - range[0] + 1;
+          System.arraycopy(seqChars, range[0] - 1, newSeqChars, newPos,
+                  length);
+          newPos += length;
+        }
+        else
+        {
+          // reverse strand mapping - copy and complement one by one
+          for (int i = range[0]; i >= range[1]; i--)
+          {
+            newSeqChars[newPos++] = Dna.getComplement(seqChars[i - 1]);
+          }
         }
       }
+
+      newSeq = new Sequence(seqId, newSeqChars, 1, newPos);
     }
 
-    /*
-     * assign 'from id' held in the mapping if set (e.g. EMBL protein_id),
-     * else generate a sequence name
-     */
-    String mapFromId = mapping.getMappedFromId();
-    String seqId = "CDS|" + (mapFromId != null ? mapFromId : seq.getName());
-    SequenceI newSeq = new Sequence(seqId, newSeqChars, 1, newPos);
     if (dataset != null)
     {
       SequenceI[] matches = dataset.findSequenceMatch(newSeq.getName());
@@ -2056,9 +2094,11 @@ public class AlignmentUtils
   protected static List<DBRefEntry> propagateDBRefsToCDS(SequenceI cdsSeq,
           SequenceI contig, SequenceI proteinProduct, Mapping mapping)
   {
+
     // gather direct refs from contig congruent with mapping
     List<DBRefEntry> direct = new ArrayList<>();
     HashSet<String> directSources = new HashSet<>();
+
     if (contig.getDBRefs() != null)
     {
       for (DBRefEntry dbr : contig.getDBRefs())
@@ -2144,6 +2184,10 @@ public class AlignmentUtils
     {
       copyTo = copyTo.getDatasetSequence();
     }
+    if (fromSeq == copyTo || fromSeq.getDatasetSequence() == copyTo)
+    {
+      return 0; // shared dataset sequence
+    }
 
     /*
      * get features, optionally restricted by an ontology term
@@ -2235,12 +2279,13 @@ public class AlignmentUtils
     int mappedDnaLength = MappingUtils.getLength(ranges);
 
     /*
-     * if not a whole number of codons, something is wrong,
-     * abort mapping
+     * if not a whole number of codons, truncate mapping
      */
-    if (mappedDnaLength % CODON_LENGTH > 0)
+    int codonRemainder = mappedDnaLength % CODON_LENGTH;
+    if (codonRemainder > 0)
     {
-      return null;
+      mappedDnaLength -= codonRemainder;
+      MappingUtils.removeEndPositions(codonRemainder, ranges);
     }
 
     int proteinLength = proteinSeq.getLength();
@@ -2396,23 +2441,24 @@ public class AlignmentUtils
   }
 
   /**
-   * Computes non-synonymous peptide variants from codon variants and adds them
-   * as sequence_variant features on the protein sequence (one feature per
-   * allele variant). Selected attributes (variant id, clinical significance)
-   * are copied over to the new features.
+   * Computes non-synonymous peptide variants from codon variants and adds them as
+   * sequence_variant features on the protein sequence (one feature per allele
+   * variant). Selected attributes (variant id, clinical significance) are copied
+   * over to the new features.
    * 
    * @param peptide
-   *          the protein sequence
+   *                        the protein dataset (ungapped) sequence
    * @param peptidePos
-   *          the position to compute peptide variants for
+   *                        the position to compute peptide variants for
    * @param codonVariants
-   *          a list of dna variants per codon position
+   *                        a list of dna variants per codon position
    * @return the number of features added
    */
   static int computePeptideVariants(SequenceI peptide, int peptidePos,
           List<DnaVariant>[] codonVariants)
   {
-    String residue = String.valueOf(peptide.getCharAt(peptidePos - 1));
+    String residue = String
+            .valueOf(peptide.getCharAt(peptidePos - peptide.getStart()));
     int count = 0;
     String base1 = codonVariants[0].get(0).base;
     String base2 = codonVariants[1].get(0).base;
@@ -2430,11 +2476,14 @@ public class AlignmentUtils
         {
           for (String base : alleles.split(","))
           {
-            if (!base1.equals(base))
+            if (!base1.equalsIgnoreCase(base))
             {
-              String codon = base + base2 + base3;
+              String codon = base.toUpperCase() + base2.toLowerCase()
+                      + base3.toLowerCase();
+              String canonical = base1.toUpperCase() + base2.toLowerCase()
+                      + base3.toLowerCase();
               if (addPeptideVariant(peptide, peptidePos, residue, var,
-                      codon))
+                      codon, canonical))
               {
                 count++;
               }
@@ -2456,11 +2505,14 @@ public class AlignmentUtils
         {
           for (String base : alleles.split(","))
           {
-            if (!base2.equals(base))
+            if (!base2.equalsIgnoreCase(base))
             {
-              String codon = base1 + base + base3;
+              String codon = base1.toLowerCase() + base.toUpperCase()
+                      + base3.toLowerCase();
+              String canonical = base1.toLowerCase() + base2.toUpperCase()
+                      + base3.toLowerCase();
               if (addPeptideVariant(peptide, peptidePos, residue, var,
-                      codon))
+                      codon, canonical))
               {
                 count++;
               }
@@ -2482,11 +2534,14 @@ public class AlignmentUtils
         {
           for (String base : alleles.split(","))
           {
-            if (!base3.equals(base))
+            if (!base3.equalsIgnoreCase(base))
             {
-              String codon = base1 + base2 + base;
+              String codon = base1.toLowerCase() + base2.toLowerCase()
+                      + base.toUpperCase();
+              String canonical = base1.toLowerCase() + base2.toLowerCase()
+                      + base3.toUpperCase();
               if (addPeptideVariant(peptide, peptidePos, residue, var,
-                      codon))
+                      codon, canonical))
               {
                 count++;
               }
@@ -2509,10 +2564,13 @@ public class AlignmentUtils
    * @param residue
    * @param var
    * @param codon
+   *          the variant codon e.g. aCg
+   * @param canonical
+   *          the 'normal' codon e.g. aTg
    * @return true if a feature was added, else false
    */
   static boolean addPeptideVariant(SequenceI peptide, int peptidePos,
-          String residue, DnaVariant var, String codon)
+          String residue, DnaVariant var, String codon, String canonical)
   {
     /*
      * get peptide translation of codon e.g. GAT -> D
@@ -2527,7 +2585,7 @@ public class AlignmentUtils
     {
       return false;
     }
-    String desc = codon;
+    String desc = canonical + "/" + codon;
     String featureType = "";
     if (trans.equals(residue))
     {
@@ -2550,15 +2608,15 @@ public class AlignmentUtils
             peptidePos, var.getSource());
 
     StringBuilder attributes = new StringBuilder(32);
-    String id = (String) var.variant.getValue(ID);
+    String id = (String) var.variant.getValue(VARIANT_ID);
     if (id != null)
     {
       if (id.startsWith(SEQUENCE_VARIANT))
       {
         id = id.substring(SEQUENCE_VARIANT.length());
       }
-      sf.setValue(ID, id);
-      attributes.append(ID).append("=").append(id);
+      sf.setValue(VARIANT_ID, id);
+      attributes.append(VARIANT_ID).append("=").append(id);
       // TODO handle other species variants JAL-2064
       StringBuilder link = new StringBuilder(32);
       try
@@ -2851,10 +2909,10 @@ public class AlignmentUtils
    * true; else returns false
    * 
    * @param unaligned
-   *          - sequences to be aligned based on aligned
+   *                    - sequences to be aligned based on aligned
    * @param aligned
-   *          - 'guide' alignment containing sequences derived from same dataset
-   *          as unaligned
+   *                    - 'guide' alignment containing sequences derived from same
+   *                    dataset as unaligned
    * @return
    */
   static boolean alignAsSameSequences(AlignmentI unaligned,
@@ -2878,15 +2936,22 @@ public class AlignmentUtils
     }
 
     /*
-     * first pass - check whether all sequences to be aligned share a dataset
-     * sequence with an aligned sequence
+     * first pass - check whether all sequences to be aligned share a 
+     * dataset sequence with an aligned sequence; also note the leftmost
+     * ungapped column from which to copy
      */
+    int leftmost = Integer.MAX_VALUE;
     for (SequenceI seq : unaligned.getSequences())
     {
-      if (!alignedDatasets.containsKey(seq.getDatasetSequence()))
+      final SequenceI ds = seq.getDatasetSequence();
+      if (!alignedDatasets.containsKey(ds))
       {
         return false;
       }
+      SequenceI alignedSeq = alignedDatasets.get(ds)
+              .get(0);
+      int startCol = alignedSeq.findIndex(seq.getStart()); // 1..
+      leftmost = Math.min(leftmost, startCol);
     }
 
     /*
@@ -2894,13 +2959,25 @@ public class AlignmentUtils
      * heuristic rule: pair off sequences in order for the case where 
      * more than one shares the same dataset sequence 
      */
+    final char gapCharacter = aligned.getGapCharacter();
     for (SequenceI seq : unaligned.getSequences())
     {
       List<SequenceI> alignedSequences = alignedDatasets
               .get(seq.getDatasetSequence());
-      // TODO: getSequenceAsString() will be deprecated in the future
-      // TODO: need to leave to SequenceI implementor to update gaps
-      seq.setSequence(alignedSequences.get(0).getSequenceAsString());
+      SequenceI alignedSeq = alignedSequences.get(0);
+
+      /*
+       * gap fill for leading (5') UTR if any
+       */
+      // TODO this copies intron columns - wrong!
+      int startCol = alignedSeq.findIndex(seq.getStart()); // 1..
+      int endCol = alignedSeq.findIndex(seq.getEnd());
+      char[] seqchars = new char[endCol - leftmost + 1];
+      Arrays.fill(seqchars, gapCharacter);
+      char[] toCopy = alignedSeq.getSequence(startCol - 1, endCol);
+      System.arraycopy(toCopy, 0, seqchars, startCol - leftmost,
+              toCopy.length);
+      seq.setSequence(String.valueOf(seqchars));
       if (alignedSequences.size() > 0)
       {
         // pop off aligned sequences (except the last one)
@@ -2908,6 +2985,12 @@ public class AlignmentUtils
       }
     }
 
+    /*
+     * finally remove gapped columns (e.g. introns)
+     */
+    new RemoveGapColCommand("", unaligned.getSequencesArray(), 0,
+            unaligned.getWidth() - 1, unaligned);
+
     return true;
   }