JAL-1793 spike branch updated to latest

[jalview.git] / src / jalview / analysis / AlignmentUtils.java
diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java

index c88a462..bef667d 100644 (file)
--- a/src/jalview/analysis/AlignmentUtils.java
+++ b/src/jalview/analysis/AlignmentUtils.java
@@ -29,6 +29,7 @@ import jalview.datamodel.Alignment;
  import jalview.datamodel.AlignmentAnnotation;
  import jalview.datamodel.AlignmentI;
  import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.GeneLociI;
  import jalview.datamodel.IncompleteCodonException;
  import jalview.datamodel.Mapping;
  import jalview.datamodel.Sequence;
@@ -394,7 +395,7 @@ public class AlignmentUtils
     * Answers true if the mappings include one between the given (dataset)
     * sequences.
     */
-  public static boolean mappingExists(List<AlignedCodonFrame> mappings,
+  protected static boolean mappingExists(List<AlignedCodonFrame> mappings,
            SequenceI aaSeq, SequenceI cdnaSeq)
    {
      if (mappings != null)
@@ -1633,7 +1634,7 @@ public class AlignmentUtils
            AlignmentI dataset, SequenceI[] products)
    {
      if (dataset == null || dataset.getDataset() != null)
-  {
+    {
        throw new IllegalArgumentException(
                "IMPLEMENTATION ERROR: dataset.getDataset() must be null!");
      }
@@ -1645,10 +1646,10 @@ public class AlignmentUtils
      {
        productSeqs = new HashSet<SequenceI>();
        for (SequenceI seq : products)
-    {
+      {
          productSeqs.add(seq.getDatasetSequence() == null ? seq : seq
                  .getDatasetSequence());
-    }
+      }
      }
  
      /*
@@ -1670,15 +1671,15 @@ public class AlignmentUtils
        List<AlignedCodonFrame> seqMappings = MappingUtils
                .findMappingsForSequence(dnaSeq, mappings);
        for (AlignedCodonFrame mapping : seqMappings)
-    {
+      {
          List<Mapping> mappingsFromSequence = mapping
                  .getMappingsFromSequence(dnaSeq);
  
          for (Mapping aMapping : mappingsFromSequence)
-      {
+        {
            MapList mapList = aMapping.getMap();
            if (mapList.getFromRatio() == 1)
-        {
+          {
              /*
               * not a dna-to-protein mapping (likely dna-to-cds)
               */
@@ -1704,15 +1705,15 @@ public class AlignmentUtils
            if (cdsSeq != null)
            {
              if (!foundSeqs.contains(cdsSeq))
-          {
+            {
                foundSeqs.add(cdsSeq);
                SequenceI derivedSequence = cdsSeq.deriveSequence();
                cdsSeqs.add(derivedSequence);
                if (!dataset.getSequences().contains(cdsSeq))
-            {
+              {
                  dataset.addSequence(cdsSeq);
+              }
              }
-          }
              continue;
            }
  
@@ -1763,7 +1764,7 @@ public class AlignmentUtils
             * add another mapping from original 'from' range to CDS
             */
            AlignedCodonFrame dnaToCdsMapping = new AlignedCodonFrame();
-          MapList dnaToCdsMap = new MapList(mapList.getFromRanges(),
+          final MapList dnaToCdsMap = new MapList(mapList.getFromRanges(),
                    cdsRange, 1, 1);
            dnaToCdsMapping.addMap(dnaSeq.getDatasetSequence(), cdsSeqDss,
                    dnaToCdsMap);
@@ -1773,6 +1774,13 @@ public class AlignmentUtils
            }
  
            /*
+           * transfer dna chromosomal loci (if known) to the CDS
+           * sequence (via the mapping)
+           */
+          final MapList cdsToDnaMap = dnaToCdsMap.getInverse();
+          transferGeneLoci(dnaSeq, cdsToDnaMap, cdsSeq);
+
+          /*
             * add DBRef with mapping from protein to CDS
             * (this enables Get Cross-References from protein alignment)
             * This is tricky because we can't have two DBRefs with the
@@ -1795,13 +1803,11 @@ public class AlignmentUtils
               * create a cross-reference from CDS to the source sequence's
               * primary reference and vice versa
               */
-
              String source = primRef.getSource();
              String version = primRef.getVersion();
              DBRefEntry cdsCrossRef = new DBRefEntry(source, source + ":"
                      + version, primRef.getAccessionId());
-            cdsCrossRef.setMap(new Mapping(dnaDss, new MapList(dnaToCdsMap
-                    .getInverse())));
+            cdsCrossRef.setMap(new Mapping(dnaDss, new MapList(cdsToDnaMap)));
              cdsSeqDss.addDBRef(cdsCrossRef);
  
              dnaSeq.addDBRef(new DBRefEntry(source, version, cdsSeq
@@ -1818,16 +1824,16 @@ public class AlignmentUtils
              proteinToCdsRef.setMap(new Mapping(cdsSeqDss, cdsToProteinMap
                      .getInverse()));
              proteinProduct.addDBRef(proteinToCdsRef);
-        }
+          }
  
            /*
             * transfer any features on dna that overlap the CDS
             */
            transferFeatures(dnaSeq, cdsSeq, dnaToCdsMap, null,
                    SequenceOntologyI.CDS);
+        }
        }
      }
-    }
  
      AlignmentI cds = new Alignment(cdsSeqs.toArray(new SequenceI[cdsSeqs
              .size()]));
@@ -1837,6 +1843,38 @@ public class AlignmentUtils
    }
  
    /**
+   * Tries to transfer gene loci (dbref to chromosome positions) from fromSeq to
+   * toSeq, mediated by the given mapping between the sequences
+   * 
+   * @param fromSeq
+   * @param targetToFrom
+   *          Map
+   * @param targetSeq
+   */
+  protected static void transferGeneLoci(SequenceI fromSeq,
+          MapList targetToFrom, SequenceI targetSeq)
+  {
+    if (targetSeq.getGeneLoci() != null)
+    {
+      // already have - don't override
+      return;
+    }
+    GeneLociI fromLoci = fromSeq.getGeneLoci();
+    if (fromLoci == null)
+    {
+      return;
+    }
+
+    MapList newMap = targetToFrom.traverse(fromLoci.getMap());
+
+    if (newMap != null)
+    {
+      targetSeq.setGeneLoci(fromLoci.getSpeciesId(),
+              fromLoci.getAssemblyId(), fromLoci.getChromosomeId(), newMap);
+    }
+  }
+
+  /**
     * A helper method that finds a CDS sequence in the alignment dataset that is
     * mapped to the given protein sequence, and either is, or has a mapping from,
     * the given dna sequence.
@@ -2004,19 +2042,19 @@ public class AlignmentUtils
    }
  
    /**
-   * add any DBRefEntrys to cdsSeq from contig that have a Mapping congruent to
+   * Adds any DBRefEntrys to cdsSeq from contig that have a Mapping congruent to
     * the given mapping.
     * 
     * @param cdsSeq
     * @param contig
+   * @param proteinProduct
     * @param mapping
-   * @return list of DBRefEntrys added.
+   * @return list of DBRefEntrys added
     */
-  public static List<DBRefEntry> propagateDBRefsToCDS(SequenceI cdsSeq,
+  protected static List<DBRefEntry> propagateDBRefsToCDS(SequenceI cdsSeq,
            SequenceI contig, SequenceI proteinProduct, Mapping mapping)
    {
-
-    // gather direct refs from contig congrent with mapping
+    // gather direct refs from contig congruent with mapping
      List<DBRefEntry> direct = new ArrayList<DBRefEntry>();
      HashSet<String> directSources = new HashSet<String>();
      if (contig.getDBRefs() != null)
@@ -2096,7 +2134,7 @@ public class AlignmentUtils
     *          subtypes in the Sequence Ontology)
     * @param omitting
     */
-  public static int transferFeatures(SequenceI fromSeq, SequenceI toSeq,
+  protected static int transferFeatures(SequenceI fromSeq, SequenceI toSeq,
            MapList mapping, String select, String... omitting)
    {
      SequenceI copyTo = toSeq;
@@ -2179,7 +2217,10 @@ public class AlignmentUtils
  
    /**
     * Returns a mapping from dna to protein by inspecting sequence features of
-   * type "CDS" on the dna.
+   * type "CDS" on the dna. A mapping is constructed if the total CDS feature
+   * length is 3 times the peptide length (optionally after dropping a trailing
+   * stop codon). This method does not check whether the CDS nucleotide sequence
+   * translates to the peptide sequence.
     * 
     * @param dnaSeq
     * @param proteinSeq
@@ -2191,6 +2232,15 @@ public class AlignmentUtils
      List<int[]> ranges = findCdsPositions(dnaSeq);
      int mappedDnaLength = MappingUtils.getLength(ranges);
  
+    /*
+     * if not a whole number of codons, something is wrong,
+     * abort mapping
+     */
+    if (mappedDnaLength % CODON_LENGTH > 0)
+    {
+      return null;
+    }
+
      int proteinLength = proteinSeq.getLength();
      int proteinStart = proteinSeq.getStart();
      int proteinEnd = proteinSeq.getEnd();
@@ -2214,8 +2264,12 @@ public class AlignmentUtils
      if (codesForResidues == (proteinLength + 1))
      {
        // assuming extra codon is for STOP and not in peptide
+      // todo: check trailing codon is indeed a STOP codon
        codesForResidues--;
+      mappedDnaLength -= CODON_LENGTH;
+      MappingUtils.removeEndPositions(CODON_LENGTH, ranges);
      }
+
      if (codesForResidues == proteinLength)
      {
        proteinRange.add(new int[] { proteinStart, proteinEnd });
@@ -2226,7 +2280,7 @@ public class AlignmentUtils
  
    /**
     * Returns a list of CDS ranges found (as sequence positions base 1), i.e. of
-   * start/end positions of sequence features of type "CDS" (or a sub-type of
+   * [start, end] positions of sequence features of type "CDS" (or a sub-type of
     * CDS in the Sequence Ontology). The ranges are sorted into ascending start
     * position order, so this method is only valid for linear CDS in the same
     * sense as the protein product.
@@ -2234,7 +2288,7 @@ public class AlignmentUtils
     * @param dnaSeq
     * @return
     */
-  public static List<int[]> findCdsPositions(SequenceI dnaSeq)
+  protected static List<int[]> findCdsPositions(SequenceI dnaSeq)
    {
      List<int[]> result = new ArrayList<int[]>();
  
@@ -2245,7 +2299,6 @@ public class AlignmentUtils
        return result;
      }
      SequenceFeatures.sortFeatures(sfs, true);
-    int startPhase = 0;
  
      for (SequenceFeature sf : sfs)
      {
@@ -2263,7 +2316,7 @@ public class AlignmentUtils
         */
        int begin = sf.getBegin();
        int end = sf.getEnd();
-      if (result.isEmpty())
+      if (result.isEmpty() && phase > 0)
        {
          begin += phase;
          if (begin > end)
@@ -2278,16 +2331,6 @@ public class AlignmentUtils
      }
  
      /*
-     * remove 'startPhase' positions (usually 0) from the first range 
-     * so we begin at the start of a complete codon
-     */
-    if (!result.isEmpty())
-    {
-      // TODO JAL-2022 correctly model start phase > 0
-      result.get(0)[0] += startPhase;
-    }
-
-    /*
       * Finally sort ranges by start position. This avoids a dependency on 
       * keeping features in order on the sequence (if they are in order anyway,
       * the sort will have almost no work to do). The implicit assumption is CDS
@@ -2519,7 +2562,10 @@ public class AlignmentUtils
  
    /**
     * Builds a map whose key is position in the protein sequence, and value is a
-   * list of the base and all variants for each corresponding codon position
+   * list of the base and all variants for each corresponding codon position.
+   * <p>
+   * This depends on dna variants being held as a comma-separated list as
+   * property "alleles" on variant features.
     * 
     * @param dnaSeq
     * @param dnaToProtein
@@ -2559,18 +2605,26 @@ public class AlignmentUtils
        }
  
        /*
-       * extract dna variants to a string array
+       * ignore variant if not a SNP
         */
        String alls = (String) sf.getValue(Gff3Helper.ALLELES);
        if (alls == null)
        {
          continue; // non-SNP VCF variant perhaps - can't process this
        }
+
        String[] alleles = alls.toUpperCase().split(",");
-      int i = 0;
+      boolean isSnp = true;
        for (String allele : alleles)
        {
-        alleles[i++] = allele.trim(); // lose any space characters "A, G"
+        if (allele.trim().length() > 1)
+        {
+          isSnp = false;
+        }
+      }
+      if (!isSnp)
+      {
+        continue;
        }
  
        int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);