fix aligned DNA codon translation bug(s) and generate AlignedCodonFrame mappings...

[jalview.git] / src / jalview / analysis / Dna.java
diff --git a/src/jalview/analysis/Dna.java b/src/jalview/analysis/Dna.java

index 48d03b3..960a6db 100644 (file)
--- a/src/jalview/analysis/Dna.java
+++ b/src/jalview/analysis/Dna.java
@@ -3,34 +3,61 @@ package jalview.analysis;
  import java.util.Hashtable;\r
  import java.util.Vector;\r
  \r
+import jalview.datamodel.AlignedCodonFrame;\r
  import jalview.datamodel.Alignment;\r
  import jalview.datamodel.AlignmentAnnotation;\r
  import jalview.datamodel.AlignmentI;\r
  import jalview.datamodel.Annotation;\r
  import jalview.datamodel.ColumnSelection;\r
+import jalview.datamodel.FeatureProperties;\r
+import jalview.datamodel.Mapping;\r
  import jalview.datamodel.Sequence;\r
  import jalview.datamodel.SequenceFeature;\r
  import jalview.datamodel.SequenceI;\r
  import jalview.schemes.ResidueProperties;\r
  import jalview.util.MapList;\r
+import jalview.util.ShiftList;\r
  \r
-public class Dna {\r
+public class Dna\r
+{\r
    /**\r
     * \r
     * @param cdp1\r
     * @param cdp2\r
-   * @return -1 if cdp1 aligns before cdp2, 0 if in the same column or cdp2 is null, +1 if after cdp2\r
+   * @return -1 if cdp1 aligns before cdp2, 0 if in the same column or cdp2 is\r
+   *         null, +1 if after cdp2\r
     */\r
-  private static int compare_codonpos(int[] cdp1, int[] cdp2) {\r
-    if (cdp2==null || (cdp1[0]==cdp2[0] && cdp1[1] == cdp2[1] && cdp1[2] == cdp2[2]))\r
+  private static int compare_codonpos(int[] cdp1, int[] cdp2)\r
+  {\r
+    if (cdp2 == null\r
+            || (cdp1[0] == cdp2[0] && cdp1[1] == cdp2[1] && cdp1[2] == cdp2[2]))\r
        return 0;\r
-    if (cdp1[0]<cdp2[0] || cdp1[1]<cdp2[1] || cdp1[2]<cdp2[2])\r
-      return -1; // one base in cdp1 precedes the corresponding base in the other codon\r
-    return 1; // one base in cdp1 appears after the corresponding base in the other codon.\r
+    if (cdp1[0] < cdp2[0] || cdp1[1] < cdp2[1] || cdp1[2] < cdp2[2])\r
+      return -1; // one base in cdp1 precedes the corresponding base in the\r
+                  // other codon\r
+    return 1; // one base in cdp1 appears after the corresponding base in the\r
+              // other codon.\r
    }\r
+\r
+  /**\r
+   * DNA->mapped protein sequence alignment translation given set of sequences\r
+   * 1. id distinct coding regions within selected region for each sequence 2.\r
+   * generate peptides based on inframe (or given) translation or (optionally\r
+   * and where specified) out of frame translations (annotated appropriately) 3.\r
+   * align peptides based on codon alignment\r
+   */\r
+  /**\r
+   * id potential products from dna 1. search for distinct products within\r
+   * selected region for each selected sequence 2. group by associated DB type.\r
+   * 3. return as form for input into above function\r
+   */\r
+  /**\r
+   * \r
+   */\r
    /**\r
-   * create a new alignment of protein sequences\r
-   * by an inframe translation of the provided NA sequences\r
+   * create a new alignment of protein sequences by an inframe translation of\r
+   * the provided NA sequences\r
+   * \r
     * @param selection\r
     * @param seqstring\r
     * @param viscontigs\r
@@ -39,198 +66,343 @@ public class Dna {
     * @param aWidth\r
     * @return\r
     */\r
-  public static AlignmentI CdnaTranslate(SequenceI[] selection, String[] seqstring, int viscontigs[], char gapCharacter, \r
-      AlignmentAnnotation[] annotations, int aWidth) {\r
+  public static AlignmentI CdnaTranslate(SequenceI[] selection,\r
+          String[] seqstring, int viscontigs[], char gapCharacter,\r
+          AlignmentAnnotation[] annotations, int aWidth)\r
+  {\r
+    AlignedCodonFrame codons = new AlignedCodonFrame(aWidth); // stores hash of\r
+                                                              // subsequent\r
+                                                              // positions for\r
+                                                              // each codon\r
+                                                              // start position\r
+                                                              // in alignment\r
      int s, sSize = selection.length;\r
-    SequenceI [] newSeq = new SequenceI[sSize];\r
-    int res, resSize;\r
-    StringBuffer protein;\r
-    String seq;\r
-\r
-    int[][] codons = new int[aWidth][]; // stores hash of subsequent positions for each codon start position in alignment\r
-\r
-    for (res=0;res<aWidth;res++)\r
-      codons[res]=null;\r
-    int aslen=0; // final width of aligned translated aa sequences\r
-    for(s=0; s<sSize; s++)\r
+    Vector pepseqs = new Vector();\r
+    for (s = 0; s < sSize; s++)\r
      {\r
-      int vc,scontigs[]=new int[viscontigs.length];\r
-\r
-      for (vc=0;vc<scontigs.length; vc+=2)\r
+      SequenceI newseq = translateCodingRegion(selection[s], seqstring[s],\r
+              viscontigs, codons, gapCharacter);\r
+      if (newseq != null)\r
        {\r
-        scontigs[vc]=selection[s].findPosition(viscontigs[vc]); // not from 1!\r
-        scontigs[vc+1]=selection[s].findPosition(viscontigs[vc+1]-1); // exclusive\r
-        if (scontigs[vc+1]==selection[s].getEnd())\r
-          break;\r
+        pepseqs.addElement(newseq);\r
        }\r
-      if ((vc+2)<scontigs.length) {\r
-        int t[] = new int[vc+2];\r
-        System.arraycopy(scontigs, 0, t, 0, vc+2);\r
-        scontigs = t;\r
-      }\r
-      protein = new StringBuffer();\r
-      seq = seqstring[s].replace('U', 'T');\r
-      char codon[]=new char[3];\r
-      int cdp[]=new int[3],rf=0,gf=0,nend,npos;\r
-      int aspos=0;\r
-      resSize=0;\r
-      for (npos=0,nend=seq.length(); npos<nend; npos++) {\r
-        if (!jalview.util.Comparison.isGap(seq.charAt(npos))) { \r
-          cdp[rf] = npos; // store position\r
-          codon[rf++]=seq.charAt(npos); // store base\r
-        }\r
-        // filled an RF yet ?\r
-        if (rf==3) {\r
-          String aa = ResidueProperties.codonTranslate(new String(codon));\r
-          rf=0;\r
-          if(aa==null)\r
-            aa=String.valueOf(gapCharacter);\r
-          else {\r
-            if(aa.equals("STOP"))\r
-            {\r
-              aa="X";\r
-            }\r
-            resSize++;\r
-          }\r
-          // insert/delete gaps prior to this codon - if necessary\r
-          boolean findpos=true;\r
-          while (findpos) \r
-          {\r
-            // first ensure that the codons array is long enough.\r
-            if (codons.length<=aslen+1) {\r
-              // probably never have to do this ?\r
-              int[][] c = new int[codons.length+10][];\r
-              for (int i=0; i<codons.length; i++) {\r
-                c[i] = codons[i];\r
-                codons[i]=null;\r
-              }\r
-              codons = c;\r
-            }\r
-            // now check to see if we place the aa at the current aspos in the protein alignment\r
-            switch (Dna.compare_codonpos(cdp, codons[aspos])) \r
-            {\r
-            case -1:\r
-              // this aa appears before the aligned codons at aspos - so shift them.\r
-              aslen++;\r
-              for (int sq=0;sq<s; sq++) {\r
-                newSeq[sq].insertCharAt(aspos, gapCharacter);\r
-              }\r
-              System.arraycopy(codons, aspos, codons, aspos+1, aslen-aspos);\r
-              codons[aspos]=null; // clear so new codon position can be marked.\r
-              findpos=false;\r
-              break;\r
-            case +1:\r
-              // this aa appears after the aligned codons at aspos, so prefix it with a gap\r
-              aa = ""+gapCharacter+aa;\r
-              aspos++;\r
-              if (aspos>=aslen)\r
-                aslen=aspos+1;\r
-              break; // check the next position for alignment\r
-            case 0:\r
-              // codon aligns at aspos position.\r
-              findpos = false;\r
-            }\r
-          }\r
-          // codon aligns with all other sequence residues found at aspos\r
-          protein.append(aa);\r
-          if (codons[aspos]==null) \r
-          {\r
-            // mark this column as aligning to this aligned reading frame \r
-            codons[aspos] = new int[] { cdp[0], cdp[1], cdp[2] };\r
-          }\r
-          aspos++;\r
-          if (aspos>=aslen)\r
-            aslen=aspos+1;\r
-        }\r
-      }\r
-      if (resSize>0) \r
-      {\r
-        newSeq[s] = new Sequence(selection[s].getName(),\r
-            protein.toString());\r
-        if (rf!=0) \r
-        {\r
-          jalview.bin.Cache.log.debug("trimming contigs for incomplete terminal codon.");\r
-          // trim contigs\r
-          vc=scontigs.length-1;\r
-          nend-=rf;\r
-          // incomplete ORF could be broken over one or two visible contig intervals.\r
-          while (vc>0 && scontigs[vc]>nend)\r
-          {\r
-            if (scontigs[vc-1]>nend) \r
-            {\r
-              vc-=2;\r
-            } else {\r
-              // correct last interval in list.\r
-              scontigs[vc]=nend;\r
-            }\r
-          }\r
-          if ((vc+2)<scontigs.length) {\r
-            // truncate map list\r
-            int t[] = new int[vc+1];\r
-            System.arraycopy(scontigs,0,t,0,vc+1);\r
-            scontigs=t;\r
-          }\r
-        }\r
-        MapList map = new MapList(scontigs, new int[] { 1, resSize },3,1); // TODO: store mapping on newSeq for linked DNA/Protein viewing.\r
-      }\r
-      // register the mapping somehow\r
-      // \r
      }\r
-    if (aslen==0)\r
+    if (codons.aaWidth == 0)\r
        return null;\r
-    AlignmentI al = new Alignment(newSeq);\r
-    al.padGaps();  // ensure we look aligned.\r
+    SequenceI[] newseqs = new SequenceI[pepseqs.size()];\r
+    pepseqs.copyInto(newseqs);\r
+    AlignmentI al = new Alignment(newseqs);\r
+    al.padGaps(); // ensure we look aligned.\r
      al.setDataset(null);\r
+    translateAlignedAnnotations(annotations, al, codons);\r
+    al.addCodonFrame(codons);\r
+    return al;\r
+  }\r
  \r
-\r
-    ////////////////////////////////\r
+  /**\r
+   * translate na alignment annotations onto translated amino acid alignment al\r
+   * using codon mapping codons\r
+   * \r
+   * @param annotations\r
+   * @param al\r
+   * @param codons\r
+   */\r
+  public static void translateAlignedAnnotations(\r
+          AlignmentAnnotation[] annotations, AlignmentI al,\r
+          AlignedCodonFrame codons)\r
+  {\r
+    // //////////////////////////////\r
      // Copy annotations across\r
      //\r
      // Can only do this for columns with consecutive codons, or where\r
      // annotation is sequence associated.\r
-    \r
-    int pos,a,aSize;\r
-    if(annotations!=null)\r
+\r
+    int pos, a, aSize;\r
+    if (annotations != null)\r
      {\r
        for (int i = 0; i < annotations.length; i++)\r
        {\r
          // Skip any autogenerated annotation\r
-        if (annotations[i].autoCalculated) {\r
+        if (annotations[i].autoCalculated)\r
+        {\r
            continue;\r
          }\r
-        \r
-        aSize = aslen; // aa alignment width.\r
-        jalview.datamodel.Annotation[] anots = \r
-          (annotations[i].annotations==null) \r
-          ? null :\r
-            new jalview.datamodel.Annotation[aSize];\r
-        if (anots!=null)\r
+\r
+        aSize = codons.getaaWidth(); // aa alignment width.\r
+        jalview.datamodel.Annotation[] anots = (annotations[i].annotations == null) ? null\r
+                : new jalview.datamodel.Annotation[aSize];\r
+        if (anots != null)\r
          {\r
            for (a = 0; a < aSize; a++)\r
            {\r
              // process through codon map.\r
-            if (codons[a]!=null && codons[a][0]==(codons[a][2]-2))\r
+            if (codons.codons[a] != null\r
+                    && codons.codons[a][0] == (codons.codons[a][2] - 2))\r
              {\r
-              pos = codons[a][0];\r
+              pos = codons.codons[a][0];\r
                if (annotations[i].annotations[pos] == null\r
                        || annotations[i].annotations[pos] == null)\r
                  continue;\r
-            \r
+              // Only the annotation on the first base of the codon is copied\r
                anots[a] = new Annotation(annotations[i].annotations[pos]);\r
              }\r
            }\r
          }\r
  \r
-        jalview.datamodel.AlignmentAnnotation aa\r
-        = new jalview.datamodel.AlignmentAnnotation(annotations[i].label,\r
-            annotations[i].description, anots);\r
+        jalview.datamodel.AlignmentAnnotation aa = new jalview.datamodel.AlignmentAnnotation(\r
+                annotations[i].label, annotations[i].description, anots);\r
          if (annotations[i].hasScore)\r
          {\r
            aa.setScore(annotations[i].getScore());\r
          }\r
+        if (annotations[i].sequenceRef != null)\r
+        {\r
+          SequenceI aaSeq = codons\r
+                  .getAaForDnaSeq(annotations[i].sequenceRef);\r
+          if (aaSeq != null)\r
+          {\r
+            // aa.compactAnnotationArray(); // throw away alignment annotation\r
+            // positioning\r
+            aa.setSequenceRef(aaSeq);\r
+            aa.createSequenceMapping(aaSeq, aaSeq.getStart(), true); // rebuild\r
+                                                                      // mapping\r
+            aa.adjustForAlignment();\r
+            aaSeq.addAlignmentAnnotation(aa);\r
+          }\r
+\r
+        }\r
          al.addAnnotation(aa);\r
        }\r
      }\r
-    return al;\r
+  }\r
+\r
+  /**\r
+   * Translate a na sequence\r
+   * \r
+   * @param selection\r
+   * @param seqstring\r
+   * @param viscontigs\r
+   * @param codons\r
+   * @param gapCharacter\r
+   * @return sequence ready to be added to alignment, or null if no codon\r
+   *         could be translated or no contig mapping remains.\r
+   */\r
+  public static SequenceI translateCodingRegion(SequenceI selection,\r
+          String seqstring, int[] viscontigs, AlignedCodonFrame codons,\r
+          char gapCharacter)\r
+  {\r
+    ShiftList vismapping = new ShiftList(); // map from viscontigs to seqstring\r
+                                            // intervals\r
+    int vc, scontigs[] = new int[viscontigs.length];\r
+    int npos = 0;\r
+    for (vc = 0; vc < viscontigs.length; vc += 2)\r
+    {\r
+      vismapping.addShift(npos, viscontigs[vc]);\r
+      scontigs[vc] = npos;\r
+      npos += viscontigs[vc + 1];\r
+      scontigs[vc + 1] = npos;\r
+    }\r
+\r
+    StringBuffer protein = new StringBuffer();\r
+    String seq = seqstring.replace('U', 'T');\r
+    char codon[] = new char[3];\r
+    int cdp[] = new int[3], rf = 0, lastnpos = 0, nend;\r
+    int aspos = 0;\r
+    int resSize = 0;\r
+    for (npos = 0, nend = seq.length(); npos < nend; npos++)\r
+    {\r
+      if (!jalview.util.Comparison.isGap(seq.charAt(npos)))\r
+      {\r
+        cdp[rf] = npos; // store position\r
+        codon[rf++] = seq.charAt(npos); // store base\r
+      }\r
+      // filled an RF yet ?\r
+      if (rf == 3)\r
+      {\r
+        String aa = ResidueProperties.codonTranslate(new String(codon));\r
+        rf = 0;\r
+        if (aa == null)\r
+          aa = String.valueOf(gapCharacter);\r
+        else\r
+        {\r
+          if (aa.equals("STOP"))\r
+          {\r
+            aa = "X";\r
+          }\r
+          resSize++;\r
+        }\r
+        // insert/delete gaps prior to this codon - if necessary\r
+        boolean findpos = true;\r
+        while (findpos)\r
+        {\r
+          // first ensure that the codons array is long enough.\r
+          codons.checkCodonFrameWidth(aspos);\r
+          // now check to see if we place the aa at the current aspos in the\r
+          // protein alignment\r
+          switch (Dna.compare_codonpos(cdp, codons.codons[aspos]))\r
+          {\r
+          case -1:\r
+            codons.insertAAGap(aspos, gapCharacter);\r
+            findpos = false;\r
+            break;\r
+          case +1:\r
+            // this aa appears after the aligned codons at aspos, so prefix it\r
+            // with a gap\r
+            aa = "" + gapCharacter + aa;\r
+            aspos++;\r
+            if (aspos >= codons.aaWidth)\r
+              codons.aaWidth = aspos + 1;\r
+            break; // check the next position for alignment\r
+          case 0:\r
+            // codon aligns at aspos position.\r
+            findpos = false;\r
+          }\r
+        }\r
+        // codon aligns with all other sequence residues found at aspos\r
+        protein.append(aa);\r
+        lastnpos = npos;\r
+        if (codons.codons[aspos] == null)\r
+        {\r
+          // mark this column as aligning to this aligned reading frame\r
+          codons.codons[aspos] = new int[]\r
+          { cdp[0], cdp[1], cdp[2] };\r
+        }\r
+        aspos++;\r
+        if (aspos >= codons.aaWidth)\r
+          codons.aaWidth = aspos + 1;\r
+      }\r
+    }\r
+    if (resSize > 0)\r
+    {\r
+      SequenceI newseq = new Sequence(selection.getName(), protein\r
+              .toString());\r
+      if (rf != 0)\r
+      {\r
+        jalview.bin.Cache.log\r
+                .debug("trimming contigs for incomplete terminal codon.");\r
+        // map and trim contigs to ORF region\r
+        vc = scontigs.length - 1;\r
+        lastnpos = vismapping.shift(lastnpos); // place npos in context of\r
+                                                // whole dna alignment (rather\r
+                                                // than visible contigs)\r
+        // incomplete ORF could be broken over one or two visible contig\r
+        // intervals.\r
+        while (vc >= 0 && scontigs[vc] > lastnpos)\r
+        {\r
+          if (vc > 0 && scontigs[vc - 1] > lastnpos)\r
+          {\r
+            vc -= 2;\r
+          }\r
+          else\r
+          {\r
+            // correct last interval in list.\r
+            scontigs[vc] = lastnpos;\r
+          }\r
+        }\r
+\r
+        if (vc > 0 && (vc + 1) < scontigs.length)\r
+        {\r
+          // truncate map list to its first vc + 1 elements\r
+          int t[] = new int[vc + 1];\r
+          System.arraycopy(scontigs, 0, t, 0, vc + 1);\r
+          scontigs = t;\r
+        }\r
+        if (vc <= 0)\r
+          scontigs = null;\r
+      }\r
+      if (scontigs != null)\r
+      {\r
+        npos = 0;\r
+        // Find sequence position for scontigs positions on the nucleotide\r
+        // sequence string we were passed.\r
+        for (vc = 0; vc < viscontigs.length; vc += 2)\r
+        {\r
+          scontigs[vc] = selection.findPosition(scontigs[vc]); // not from 1!\r
+          npos += viscontigs[vc];\r
+          scontigs[vc + 1] = selection\r
+                  .findPosition(npos + scontigs[vc + 1]); // exclusive\r
+          if (scontigs[vc + 1] == selection.getEnd())\r
+            break;\r
+        }\r
+        // trim trailing empty intervals.\r
+        if ((vc + 2) < scontigs.length)\r
+        {\r
+          int t[] = new int[vc + 2];\r
+          System.arraycopy(scontigs, 0, t, 0, vc + 2);\r
+          scontigs = t;\r
+        }\r
+\r
+        MapList map = new MapList(scontigs, new int[]\r
+        { 1, resSize }, 3, 1); // TODO: store mapping on newSeq for linked\r
+                                // DNA/Protein viewing.\r
+        transferCodedFeatures(selection, newseq, map, null, null);\r
+        SequenceI rseq = newseq.deriveSequence(); // construct a dataset\r
+                                                  // sequence for our new\r
+                                                  // peptide, regardless.\r
+        // store a mapping (this actually stores a mapping between the dataset\r
+        // sequences for the two sequences\r
+        codons.addMap(selection, newseq, map);\r
+        return rseq;\r
+      }\r
+    }\r
+    // nothing was translated (resSize == 0) or no contigs remained after\r
+    // trimming, so there is no peptide or mapping to return\r
+    return null;\r
+  }\r
+\r
+  /**\r
+   * Given a peptide newly translated from a dna sequence, copy over and set any\r
+   * features on the peptide from the DNA. If featureTypes is null, all features\r
+   * on the dna sequence are searched (rather than just the displayed ones), and\r
+   * similarly for featureGroups.\r
+   * \r
+   * @param dna\r
+   * @param pep\r
+   * @param map\r
+   * @param featureTypes\r
+   *          hash whose keys are the displayed feature type strings\r
+   * @param featureGroups\r
+   *          hash where keys are feature groups and values are Boolean objects\r
+   *          indicating if they are displayed.\r
+   */\r
+  private static void transferCodedFeatures(SequenceI dna, SequenceI pep,\r
+          MapList map, Hashtable featureTypes, Hashtable featureGroups)\r
+  {\r
+    SequenceFeature[] sf = dna.getDatasetSequence().getSequenceFeatures();\r
+    Boolean fgstate;\r
+    jalview.datamodel.DBRefEntry[] dnarefs = jalview.util.DBRefUtils\r
+            .selectRefs(dna.getDBRef(),\r
+                    jalview.datamodel.DBRefSource.DNACODINGDBS);\r
+    if (dnarefs != null)\r
+    {\r
+      // intersect with pep\r
+      for (int d = 0; d < dnarefs.length; d++)\r
+      {\r
+        Mapping mp = dnarefs[d].getMap();\r
+        if (mp != null)\r
+        {\r
+        }\r
+      }\r
+    }\r
+    if (sf != null)\r
+    {\r
+      for (int f = 0; f < sf.length; f++)\r
+      {\r
+        fgstate = (featureGroups == null) ? null : ((Boolean) featureGroups\r
+                .get(sf[f].featureGroup));\r
+        if ((featureTypes == null || featureTypes.containsKey(sf[f]\r
+                .getType()))\r
+                && (fgstate == null || fgstate.booleanValue()))\r
+        {\r
+          if (FeatureProperties.isCodingFeature(null, sf[f].getType()))\r
+          {\r
+            // if (map.intersectsFrom(sf[f].begin, sf[f].end))\r
+            {\r
+\r
+            }\r
+          }\r
+        }\r
+      }\r
+    }\r
    }\r
  }\r