JAL-653 AlignedCodonFrame collections changed from Set to List

[jalview.git] / src / jalview / analysis / AlignmentUtils.java
diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java

index 719dbfb..0e30d8c 100644 (file)
--- a/src/jalview/analysis/AlignmentUtils.java
+++ b/src/jalview/analysis/AlignmentUtils.java
@@ -45,7 +45,6 @@ import java.util.HashMap;
  import java.util.HashSet;
  import java.util.Iterator;
  import java.util.LinkedHashMap;
-import java.util.LinkedHashSet;
  import java.util.List;
  import java.util.Map;
  import java.util.Map.Entry;
@@ -338,12 +337,12 @@ public class AlignmentUtils
     * Answers true if the mappings include one between the given (dataset)
     * sequences.
     */
-  public static boolean mappingExists(Set<AlignedCodonFrame> set,
+  public static boolean mappingExists(List<AlignedCodonFrame> mappings,
            SequenceI aaSeq, SequenceI cdnaSeq)
    {
-    if (set != null)
+    if (mappings != null)
      {
-      for (AlignedCodonFrame acf : set)
+      for (AlignedCodonFrame acf : mappings)
        {
          if (cdnaSeq == acf.getDnaForAaSeq(aaSeq))
          {
@@ -514,8 +513,8 @@ public class AlignmentUtils
  
      /*
       * Locate the aligned source sequence whose dataset sequence is mapped. We
-     * just take the first match here (as we can't align cDNA like more than one
-     * protein sequence).
+     * just take the first match here (as we can't align a sequence like more
+     * than one other sequence).
       */
      SequenceI alignFrom = null;
      AlignedCodonFrame mapping = null;
@@ -541,8 +540,8 @@ public class AlignmentUtils
    /**
     * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to
     * match residues and codons. Flags control whether existing gaps in unmapped
-   * (intron) and mapped (exon) regions are preserved or not. Gaps linking intro
-   * and exon are only retained if both flags are set.
+   * (intron) and mapped (exon) regions are preserved or not. Gaps between
+   * intron and exon are only retained if both flags are set.
     * 
     * @param alignTo
     * @param alignFrom
@@ -558,9 +557,6 @@ public class AlignmentUtils
            boolean preserveUnmappedGaps)
    {
      // TODO generalise to work for Protein-Protein, dna-dna, dna-protein
-    final char[] thisSeq = alignTo.getSequence();
-    final char[] thatAligned = alignFrom.getSequence();
-    StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length);
  
      // aligned and dataset sequence positions, all base zero
      int thisSeqPos = 0;
@@ -570,13 +566,17 @@ public class AlignmentUtils
      char myGapChar = myGap.charAt(0);
      int ratio = myGap.length();
  
-    /*
-     * Traverse the aligned protein sequence.
-     */
      int fromOffset = alignFrom.getStart() - 1;
      int toOffset = alignTo.getStart() - 1;
      int sourceGapMappedLength = 0;
      boolean inExon = false;
+    final char[] thisSeq = alignTo.getSequence();
+    final char[] thatAligned = alignFrom.getSequence();
+    StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length);
+
+    /*
+     * Traverse the 'model' aligned sequence
+     */
      for (char sourceChar : thatAligned)
      {
        if (sourceChar == sourceGap)
@@ -586,7 +586,7 @@ public class AlignmentUtils
        }
  
        /*
-       * Found a residue. Locate its mapped codon (start) position.
+       * Found a non-gap character. Locate its mapped region if any.
         */
        sourceDsPos++;
        // Note mapping positions are base 1, our sequence positions base 0
@@ -595,11 +595,13 @@ public class AlignmentUtils
        if (mappedPos == null)
        {
          /*
-         * Abort realignment if unmapped protein. Or could ignore it??
+         * unmapped position; treat like a gap
           */
-        System.err.println("Can't align: no codon mapping to residue "
-                + sourceDsPos + "(" + sourceChar + ")");
-        return;
+        sourceGapMappedLength += ratio;
+        // System.err.println("Can't align: no codon mapping to residue "
+        // + sourceDsPos + "(" + sourceChar + ")");
+        // return;
+        continue;
        }
  
        int mappedCodonStart = mappedPos[0]; // position (1...) of codon start
@@ -669,8 +671,8 @@ public class AlignmentUtils
      }
  
      /*
-     * At end of protein sequence. Copy any remaining dna sequence, optionally
-     * including (intron) gaps. We do not copy trailing gaps in protein.
+     * At end of model aligned sequence. Copy any remaining target sequence, optionally
+     * including (intron) gaps.
       */
      while (thisSeqPos < thisSeq.length)
      {
@@ -679,6 +681,20 @@ public class AlignmentUtils
        {
          thisAligned.append(c);
        }
+      sourceGapMappedLength--;
+    }
+
+    /*
+     * finally add gaps to pad for any trailing source gaps or
+     * unmapped characters
+     */
+    if (preserveUnmappedGaps)
+    {
+      while (sourceGapMappedLength > 0)
+      {
+        thisAligned.append(myGapChar);
+        sourceGapMappedLength--;
+      }
      }
  
      /*
@@ -909,7 +925,7 @@ public class AlignmentUtils
      List<SequenceI> unmappedProtein = new ArrayList<SequenceI>();
      unmappedProtein.addAll(protein.getSequences());
  
-    Set<AlignedCodonFrame> mappings = protein.getCodonFrames();
+    List<AlignedCodonFrame> mappings = protein.getCodonFrames();
  
      /*
       * Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of
@@ -1048,7 +1064,7 @@ public class AlignmentUtils
      }
      AlignmentI dna = al1.isNucleotide() ? al1 : al2;
      AlignmentI protein = dna == al1 ? al2 : al1;
-    Set<AlignedCodonFrame> mappings = protein.getCodonFrames();
+    List<AlignedCodonFrame> mappings = protein.getCodonFrames();
      for (SequenceI dnaSeq : dna.getSequences())
      {
        for (SequenceI proteinSeq : protein.getSequences())
@@ -1072,7 +1088,7 @@ public class AlignmentUtils
     * @return
     */
    protected static boolean isMappable(SequenceI dnaSeq,
-          SequenceI proteinSeq, Set<AlignedCodonFrame> mappings)
+          SequenceI proteinSeq, List<AlignedCodonFrame> mappings)
    {
      if (dnaSeq == null || proteinSeq == null)
      {
@@ -1312,9 +1328,9 @@ public class AlignmentUtils
     *         sequences (or null if no exons are found)
     */
    public static AlignmentI makeExonAlignment(SequenceI[] dna,
-          Set<AlignedCodonFrame> mappings)
+          List<AlignedCodonFrame> mappings)
    {
-    Set<AlignedCodonFrame> newMappings = new LinkedHashSet<AlignedCodonFrame>();
+    List<AlignedCodonFrame> newMappings = new ArrayList<AlignedCodonFrame>();
      List<SequenceI> exonSequences = new ArrayList<SequenceI>();
  
      for (SequenceI dnaSeq : dna)