JAL-1619 load/align cDNA for protein (wip)

[jalview.git] / src / jalview / datamodel / Alignment.java
diff --git a/src/jalview/datamodel/Alignment.java b/src/jalview/datamodel/Alignment.java

index 91108bc..5d11a20 100755 (executable)
--- a/src/jalview/datamodel/Alignment.java
+++ b/src/jalview/datamodel/Alignment.java
@@ -20,6 +20,7 @@
   */
  package jalview.datamodel;
  
+import jalview.analysis.AlignmentUtils;
  import jalview.util.MessageManager;
  
  import java.util.ArrayList;
@@ -157,6 +158,17 @@ public class Alignment implements AlignmentI
    }
  
    /**
+   * Returns a map of lists of sequences keyed by sequence name.
+   * 
+   * @return
+   */
+  @Override
+  public Map<String, List<SequenceI>> getSequencesByName()
+  {
+    return AlignmentUtils.getSequencesByName(this);
+  }
+
+  /**
     * DOCUMENT ME!
     * 
     * @param i
@@ -1596,114 +1608,156 @@ public class Alignment implements AlignmentI
    }
  
    /**
-   * Answers true if the supplied alignment has the same number of sequences,
-   * and they are of equivalent length, ignoring gaps. Alignments should be of
-   * the same type (protein/nucleotide) or different types with 3:1 length
-   * scaling.
+   * Align this alignment 'the same as' the given one. Mapped sequences only are
+   * realigned. If both of the same type (nucleotide/protein) then align both
+   * identically. If this is nucleotide and the other is protein, make 3 gaps
+   * for each gap in the protein sequences. If this is protein and the other is
+   * nucleotide, insert a gap for each 3 gaps (or part thereof) between
+   * nucleotide bases. Does nothing if alignment of protein from cDNA is
+   * requested (not yet implemented).
     * 
     * @param al
     */
    @Override
-  public boolean isMappableTo(AlignmentI al)
+  public int alignAs(AlignmentI al)
    {
-    int thisCodonScale = this.isNucleotide() ? 1 : 3;
-    int thatCodonScale = al.isNucleotide() ? 1 : 3;
-    if (this == al || this.getHeight() != al.getHeight())
+    int count = 0;
+    boolean thisIsNucleotide = this.isNucleotide();
+    boolean thatIsProtein = !al.isNucleotide();
+    if (!thatIsProtein && !thisIsNucleotide)
      {
-      return false;
+      System.err
+              .println("Alignment of protein from cDNA not yet implemented");
+      return 0;
+      // todo: build it - a variant of Dna.CdnaTranslate()
      }
+    char thisGapChar = this.getGapCharacter();
+    char thatGapChar = al.getGapCharacter();
+    String gap = thisIsNucleotide && thatIsProtein ? String
+            .valueOf(new char[]
+            { thisGapChar, thisGapChar, thisGapChar }) : String
+            .valueOf(thisGapChar);
+    int ratio = thisIsNucleotide && thatIsProtein ? 3 : 1;
  
-    // TODO: match sequence ids, allow different sequence ordering?
-    // TODO: allow for stop/start codons?
-    // TODO: exclude introns
-    int i = 0;
-    for (SequenceI seq : this.getSequences())
-    {
-      final int thisSequenceDnaLength = seq.getDatasetSequence()
-              .getLength() * thisCodonScale;
-      final int thatSequenceDnaLength = al.getSequenceAt(i)
-              .getDatasetSequence().getLength()
-              * thatCodonScale;
-      if (thisSequenceDnaLength != thatSequenceDnaLength)
+    /*
+     * Get mappings from 'that' alignment's sequences to this.
+     */
+    for (SequenceI alignTo : getSequences())
+    {
+      AlignedCodonFrame[] mappings = al.getCodonFrame(alignTo);
+      if (mappings != null)
        {
-        return false;
+        for (AlignedCodonFrame mapping : mappings)
+        {
+          count += alignSequenceAs(alignTo, mapping, thatGapChar, gap,
+                  ratio) ? 1 : 0;
+        }
        }
-      i++;
      }
-    return true;
+    return count;
    }
  
    /**
-   * Align this alignment the same as the given one. If both of the same type
-   * (nucleotide/protein) then align both identically. If this is nucleotide and
-   * the other is protein, make 3 gaps for each gap in the protein sequences. If
-   * this is protein and the other is nucleotide, insert a gap for each 3 gaps
-   * (or part thereof) between nucleotide bases. The two alignments should be
-   * compatible in height and lengths, but if not, then discrepancies will be
-   * ignored with unpredictable results.
+   * Align sequence 'seq' the same way as 'other'. Note this currently assumes
+   * that we are aligned cDNA to match protein.
     * 
-   * @param al
-   * @throws UnsupportedOperation
-   *           if alignment of protein from cDNA is requested (not yet
-   *           implemented)
+   * @param seq
+   *          the sequence to be realigned
+   * @param mapping
+   *          holds mapping from the sequence whose alignment is to be 'copied'
+   * @param thatGapChar
+   *          gap character used in the 'other' sequence
+   * @param gap
+   *          character string represent a gap in the realigned sequence
+   * @param ratio
+   *          the number of positions in the realigned sequence corresponding to
+   *          one in the 'other'
+   * @return true if the sequence was realigned, false if it could not be
     */
-  @Override
-  public void alignAs(AlignmentI al)
-  {
-    boolean thisIsNucleotide = this.isNucleotide();
-    boolean thatIsProtein = !al.isNucleotide();
-    if (!thatIsProtein && !thisIsNucleotide)
+  protected boolean alignSequenceAs(SequenceI seq,
+          AlignedCodonFrame mapping,
+          char thatGapChar,
+          String gap, int ratio)
+  {
+    char myGapChar = gap.charAt(0);
+    // TODO rework this to use the mapping to match 'this' to 'that' residue
+    // position, to handle introns and exons correctly.
+    // TODO generalise to work for Protein-Protein, dna-dna, dna-protein
+    SequenceI alignFrom = mapping.getAaForDnaSeq(seq, false);
+    if (alignFrom == null)
      {
-      throw new UnsupportedOperationException(
-              "Alignment of protein from cDNA not implemented");
+      return false;
      }
-    char thisGapChar = this.getGapCharacter();
-    char thatGapChar = al.getGapCharacter();
-    String gap = thisIsNucleotide && thatIsProtein ? String
-            .valueOf(new char[]
-            { thisGapChar, thisGapChar, thisGapChar }) : String
-            .valueOf(thisGapChar);
-    int ratio = thisIsNucleotide && thatIsProtein ? 3 : 1;
-    int i = 0;
-    for (SequenceI seq : this.getSequences())
+    final char[] thisSeq = seq.getSequence();
+    final char[] thisDs = seq.getDatasetSequence().getSequence();
+    final char[] thatAligned = alignFrom.getSequence();
+    StringBuilder thisAligned = new StringBuilder(2 * thisDs.length);
+
+    /*
+     * Find the DNA dataset position that corresponds to the first protein
+     * residue (e.g. ignoring start codon in cDNA).
+     */
+    int[] dnaStart = mapping.getDnaPosition(seq.getDatasetSequence(), 1);
+    int thisDsPosition = dnaStart == null ? 0 : dnaStart[0] - 1;
+    int thisSeqPos = 0;
+
+    /*
+     * Copy aligned cDNA up to (excluding) the first mapped base.
+     */
+    int basesWritten = 0;
+    while (basesWritten < thisDsPosition && thisSeqPos < thisSeq.length)
+    {
+      char c = thisSeq[thisSeqPos++];
+      thisAligned.append(c);
+      if (c != myGapChar)
+      {
+        basesWritten++;
+      }
+    }
+
+    /*
+     * Now traverse the aligned protein mirroring its gaps in cDNA.
+     */
+    for (char thatChar : thatAligned)
      {
-      SequenceI other = al.getSequenceAt(i++);
-      if (other == null)
+      if (thatChar == thatGapChar)
        {
-        continue;
+        /*
+         * Add (equivalent of) a gap
+         */
+        thisAligned.append(gap);
        }
-      char[] thisDs = seq.getDatasetSequence().getSequence();
-      char[] thatDs = other.getSequence();
-      StringBuilder thisAligned = new StringBuilder(2 * thisDs.length);
-      int thisDsPosition = 0;
-      for (char thatChar : thatDs)
+      else
        {
-        if (thatChar == thatGapChar)
-        {
-          /*
-           * Add (equivalent of) a gap
-           */
-          thisAligned.append(gap);
-        }
-        else
+        /*
+         * Add (equivalent of) a residue
+         */
+        for (int j = 0; j < ratio && thisDsPosition < thisDs.length; j++)
          {
+          thisAligned.append(thisDs[thisDsPosition++]);
+
            /*
-           * Add (equivalent of) a residue
+           * Also advance over any gaps and the next residue in the old aligned
+           * sequence
             */
-          for (int j = 0; j < ratio && thisDsPosition < thisDs.length; j++)
+          while (thisSeq[thisSeqPos] == myGapChar
+                  && thisSeqPos < thisSeq.length)
            {
-            thisAligned.append(thisDs[thisDsPosition++]);
+            thisSeqPos++;
            }
+          thisSeqPos++;
          }
        }
-      /*
-       * Include any 'extra' residues (there shouldn't be).
-       */
-      while (thisDsPosition < thisDs.length)
-      {
-        thisAligned.append(thisDs[thisDsPosition++]);
-      }
-      seq.setSequence(new String(thisAligned));
      }
+
+    /*
+     * Finally copy any 'extra' aligned cDNA (e.g. stop codon, introns).
+     */
+    while (thisSeqPos < thisSeq.length)
+    {
+      thisAligned.append(thisSeq[thisSeqPos++]);
+    }
+    seq.setSequence(new String(thisAligned));
+    return true;
    }
  }