From 6173092ff5cb03f039cac674bfc8bc4f969976a5 Mon Sep 17 00:00:00 2001
From: jprocter <Jim Procter>
Date: Sun, 17 Jun 2007 13:02:19 +0000
Subject: [PATCH] fix aligned DNA codon translation bug(s) and generate
 AlignedCodonFrame mappings for each translated sequence

---
 src/jalview/analysis/Dna.java |  500 +++++++++++++++++++++++++++--------------
 1 file changed, 336 insertions(+), 164 deletions(-)
diff --git a/src/jalview/analysis/Dna.java b/src/jalview/analysis/Dna.java
index 48d03b3..960a6db 100644
--- a/src/jalview/analysis/Dna.java
+++ b/src/jalview/analysis/Dna.java
@@ -3,34 +3,61 @@ package jalview.analysis;
 import java.util.Hashtable;
 import java.util.Vector;
 
+import jalview.datamodel.AlignedCodonFrame;
 import jalview.datamodel.Alignment;
 import jalview.datamodel.AlignmentAnnotation;
 import jalview.datamodel.AlignmentI;
 import jalview.datamodel.Annotation;
 import jalview.datamodel.ColumnSelection;
+import jalview.datamodel.FeatureProperties;
+import jalview.datamodel.Mapping;
 import jalview.datamodel.Sequence;
 import jalview.datamodel.SequenceFeature;
 import jalview.datamodel.SequenceI;
 import jalview.schemes.ResidueProperties;
 import jalview.util.MapList;
+import jalview.util.ShiftList;
 
-public class Dna {
+public class Dna
+{
   /**
    * 
    * @param cdp1
    * @param cdp2
-   * @return -1 if cdp1 aligns before cdp2, 0 if in the same column or cdp2 is null, +1 if after cdp2
+   * @return -1 if cdp1 aligns before cdp2, 0 if in the same column or cdp2 is
+   *         null, +1 if after cdp2
    */
-  private static int compare_codonpos(int[] cdp1, int[] cdp2) {
-    if (cdp2==null || (cdp1[0]==cdp2[0] && cdp1[1] == cdp2[1] && cdp1[2] == cdp2[2]))
+  private static int compare_codonpos(int[] cdp1, int[] cdp2)
+  {
+    if (cdp2 == null
+            || (cdp1[0] == cdp2[0] && cdp1[1] == cdp2[1] && cdp1[2] == cdp2[2]))
       return 0;
-    if (cdp1[0]<cdp2[0] || cdp1[1]<cdp2[1] || cdp1[2]<cdp2[2])
-      return -1; // one base in cdp1 precedes the corresponding base in the other codon
-    return 1; // one base in cdp1 appears after the corresponding base in the other codon.
+    if (cdp1[0] < cdp2[0] || cdp1[1] < cdp2[1] || cdp1[2] < cdp2[2])
+      return -1; // one base in cdp1 precedes the corresponding base in the
+                  // other codon
+    return 1; // one base in cdp1 appears after the corresponding base in the
+              // other codon.
   }
+
+  /**
+   * DNA->mapped protein sequence alignment translation given set of sequences
+   * 1. id distinct coding regions within selected region for each sequence 2.
+   * generate peptides based on inframe (or given) translation or (optionally
+   * and where specified) out of frame translations (annotated appropriately) 3.
+   * align peptides based on codon alignment
+   */
+  /**
+   * id potential products from dna 1. search for distinct products within
+   * selected region for each selected sequence 2. group by associated DB type.
+   * 3. return as form for input into above function
+   */
+  /**
+   * 
+   */
   /**
-   * create a new alignment of protein sequences
-   * by an inframe translation of the provided NA sequences
+   * create a new alignment of protein sequences by an inframe translation of
+   * the provided NA sequences
+   * 
    * @param selection
    * @param seqstring
    * @param viscontigs
@@ -39,198 +66,343 @@ public class Dna {
    * @param aWidth
    * @return
    */
-  public static AlignmentI CdnaTranslate(SequenceI[] selection, String[] seqstring, int viscontigs[], char gapCharacter, 
-      AlignmentAnnotation[] annotations, int aWidth) {
+  public static AlignmentI CdnaTranslate(SequenceI[] selection,
+          String[] seqstring, int viscontigs[], char gapCharacter,
+          AlignmentAnnotation[] annotations, int aWidth)
+  {
+    AlignedCodonFrame codons = new AlignedCodonFrame(aWidth); // stores hash of
+                                                              // subsequent
+                                                              // positions for
+                                                              // each codon
+                                                              // start position
+                                                              // in alignment
     int s, sSize = selection.length;
-    SequenceI [] newSeq = new SequenceI[sSize];
-    int res, resSize;
-    StringBuffer protein;
-    String seq;
-
-    int[][] codons = new int[aWidth][]; // stores hash of subsequent positions for each codon start position in alignment
-
-    for (res=0;res<aWidth;res++)
-      codons[res]=null;
-    int aslen=0; // final width of aligned translated aa sequences
-    for(s=0; s<sSize; s++)
+    Vector pepseqs = new Vector();
+    for (s = 0; s < sSize; s++)
     {
-      int vc,scontigs[]=new int[viscontigs.length];
-
-      for (vc=0;vc<scontigs.length; vc+=2)
+      SequenceI newseq = translateCodingRegion(selection[s], seqstring[s],
+              viscontigs, codons, gapCharacter);
+      if (newseq != null)
       {
-        scontigs[vc]=selection[s].findPosition(viscontigs[vc]); // not from 1!
-        scontigs[vc+1]=selection[s].findPosition(viscontigs[vc+1]-1); // exclusive
-        if (scontigs[vc+1]==selection[s].getEnd())
-          break;
+        pepseqs.addElement(newseq);
       }
-      if ((vc+2)<scontigs.length) {
-        int t[] = new int[vc+2];
-        System.arraycopy(scontigs, 0, t, 0, vc+2);
-        scontigs = t;
-      }
-      protein = new StringBuffer();
-      seq = seqstring[s].replace('U', 'T');
-      char codon[]=new char[3];
-      int cdp[]=new int[3],rf=0,gf=0,nend,npos;
-      int aspos=0;
-      resSize=0;
-      for (npos=0,nend=seq.length(); npos<nend; npos++) {
-        if (!jalview.util.Comparison.isGap(seq.charAt(npos))) { 
-          cdp[rf] = npos; // store position
-          codon[rf++]=seq.charAt(npos); // store base
-        }
-        // filled an RF yet ?
-        if (rf==3) {
-          String aa = ResidueProperties.codonTranslate(new String(codon));
-          rf=0;
-          if(aa==null)
-            aa=String.valueOf(gapCharacter);
-          else {
-            if(aa.equals("STOP"))
-            {
-              aa="X";
-            }
-            resSize++;
-          }
-          // insert/delete gaps prior to this codon - if necessary
-          boolean findpos=true;
-          while (findpos) 
-          {
-            // first ensure that the codons array is long enough.
-            if (codons.length<=aslen+1) {
-              // probably never have to do this ?
-              int[][] c = new int[codons.length+10][];
-              for (int i=0; i<codons.length; i++) {
-                c[i] = codons[i];
-                codons[i]=null;
-              }
-              codons = c;
-            }
-            // now check to see if we place the aa at the current aspos in the protein alignment
-            switch (Dna.compare_codonpos(cdp, codons[aspos])) 
-            {
-            case -1:
-              // this aa appears before the aligned codons at aspos - so shift them.
-              aslen++;
-              for (int sq=0;sq<s; sq++) {
-                newSeq[sq].insertCharAt(aspos, gapCharacter);
-              }
-              System.arraycopy(codons, aspos, codons, aspos+1, aslen-aspos);
-              codons[aspos]=null; // clear so new codon position can be marked.
-              findpos=false;
-              break;
-            case +1:
-              // this aa appears after the aligned codons at aspos, so prefix it with a gap
-              aa = ""+gapCharacter+aa;
-              aspos++;
-              if (aspos>=aslen)
-                aslen=aspos+1;
-              break; // check the next position for alignment
-            case 0:
-              // codon aligns at aspos position.
-              findpos = false;
-            }
-          }
-          // codon aligns with all other sequence residues found at aspos
-          protein.append(aa);
-          if (codons[aspos]==null) 
-          {
-            // mark this column as aligning to this aligned reading frame 
-            codons[aspos] = new int[] { cdp[0], cdp[1], cdp[2] };
-          }
-          aspos++;
-          if (aspos>=aslen)
-            aslen=aspos+1;
-        }
-      }
-      if (resSize>0) 
-      {
-        newSeq[s] = new Sequence(selection[s].getName(),
-            protein.toString());
-        if (rf!=0) 
-        {
-          jalview.bin.Cache.log.debug("trimming contigs for incomplete terminal codon.");
-          // trim contigs
-          vc=scontigs.length-1;
-          nend-=rf;
-          // incomplete ORF could be broken over one or two visible contig intervals.
-          while (vc>0 && scontigs[vc]>nend)
-          {
-            if (scontigs[vc-1]>nend) 
-            {
-              vc-=2;
-            } else {
-              // correct last interval in list.
-              scontigs[vc]=nend;
-            }
-          }
-          if ((vc+2)<scontigs.length) {
-            // truncate map list
-            int t[] = new int[vc+1];
-            System.arraycopy(scontigs,0,t,0,vc+1);
-            scontigs=t;
-          }
-        }
-        MapList map = new MapList(scontigs, new int[] { 1, resSize },3,1); // TODO: store mapping on newSeq for linked DNA/Protein viewing.
-      }
-      // register the mapping somehow
-      // 
     }
-    if (aslen==0)
+    if (codons.aaWidth == 0)
       return null;
-    AlignmentI al = new Alignment(newSeq);
-    al.padGaps();  // ensure we look aligned.
+    SequenceI[] newseqs = new SequenceI[pepseqs.size()];
+    pepseqs.copyInto(newseqs);
+    AlignmentI al = new Alignment(newseqs);
+    al.padGaps(); // ensure we look aligned.
     al.setDataset(null);
+    translateAlignedAnnotations(annotations, al, codons);
+    al.addCodonFrame(codons);
+    return al;
+  }
 
-
-    ////////////////////////////////
+  /**
+   * translate na alignment annotations onto translated amino acid alignment al
+   * using codon mapping codons
+   * 
+   * @param annotations
+   * @param al
+   * @param codons
+   */
+  public static void translateAlignedAnnotations(
+          AlignmentAnnotation[] annotations, AlignmentI al,
+          AlignedCodonFrame codons)
+  {
+    // //////////////////////////////
     // Copy annotations across
     //
     // Can only do this for columns with consecutive codons, or where
     // annotation is sequence associated.
-    
-    int pos,a,aSize;
-    if(annotations!=null)
+
+    int pos, a, aSize;
+    if (annotations != null)
     {
       for (int i = 0; i < annotations.length; i++)
       {
         // Skip any autogenerated annotation
-        if (annotations[i].autoCalculated) {
+        if (annotations[i].autoCalculated)
+        {
           continue;
         }
-        
-        aSize = aslen; // aa alignment width.
-        jalview.datamodel.Annotation[] anots = 
-          (annotations[i].annotations==null) 
-          ? null :
-            new jalview.datamodel.Annotation[aSize];
-        if (anots!=null)
+
+        aSize = codons.getaaWidth(); // aa alignment width.
+        jalview.datamodel.Annotation[] anots = (annotations[i].annotations == null) ? null
+                : new jalview.datamodel.Annotation[aSize];
+        if (anots != null)
         {
           for (a = 0; a < aSize; a++)
           {
             // process through codon map.
-            if (codons[a]!=null && codons[a][0]==(codons[a][2]-2))
+            if (codons.codons[a] != null
+                    && codons.codons[a][0] == (codons.codons[a][2] - 2))
             {
-              pos = codons[a][0];
+              pos = codons.codons[a][0];
               if (annotations[i].annotations[pos] == null
                       || annotations[i].annotations[pos] == null)
                 continue;
-            
+              // We just take the annotation in the first base in the codon
               anots[a] = new Annotation(annotations[i].annotations[pos]);
             }
           }
         }
 
-        jalview.datamodel.AlignmentAnnotation aa
-        = new jalview.datamodel.AlignmentAnnotation(annotations[i].label,
-            annotations[i].description, anots);
+        jalview.datamodel.AlignmentAnnotation aa = new jalview.datamodel.AlignmentAnnotation(
+                annotations[i].label, annotations[i].description, anots);
         if (annotations[i].hasScore)
         {
           aa.setScore(annotations[i].getScore());
         }
+        if (annotations[i].sequenceRef != null)
+        {
+          SequenceI aaSeq = codons
+                  .getAaForDnaSeq(annotations[i].sequenceRef);
+          if (aaSeq != null)
+          {
+            // aa.compactAnnotationArray(); // throw away alignment annotation
+            // positioning
+            aa.setSequenceRef(aaSeq);
+            aa.createSequenceMapping(aaSeq, aaSeq.getStart(), true); // rebuild
+                                                                      // mapping
+            aa.adjustForAlignment();
+            aaSeq.addAlignmentAnnotation(aa);
+          }
+
+        }
         al.addAnnotation(aa);
       }
     }
-    return al;
+  }
+
+  /**
+   * Translate a na sequence
+   * 
+   * @param selection
+   * @param seqstring
+   * @param viscontigs
+   * @param codons
+   * @param gapCharacter
+   * @param newSeq
+   * @return sequence ready to be added to alignment.
+   */
+  public static SequenceI translateCodingRegion(SequenceI selection,
+          String seqstring, int[] viscontigs, AlignedCodonFrame codons,
+          char gapCharacter)
+  {
+    ShiftList vismapping = new ShiftList(); // map from viscontigs to seqstring
+                                            // intervals
+    int vc, scontigs[] = new int[viscontigs.length];
+    int npos = 0;
+    for (vc = 0; vc < viscontigs.length; vc += 2)
+    {
+      vismapping.addShift(npos, viscontigs[vc]);
+      scontigs[vc] = npos;
+      npos += viscontigs[vc + 1];
+      scontigs[vc + 1] = npos;
+    }
+
+    StringBuffer protein = new StringBuffer();
+    String seq = seqstring.replace('U', 'T');
+    char codon[] = new char[3];
+    int cdp[] = new int[3], rf = 0, lastnpos = 0, nend;
+    int aspos = 0;
+    int resSize = 0;
+    for (npos = 0, nend = seq.length(); npos < nend; npos++)
+    {
+      if (!jalview.util.Comparison.isGap(seq.charAt(npos)))
+      {
+        cdp[rf] = npos; // store position
+        codon[rf++] = seq.charAt(npos); // store base
+      }
+      // filled an RF yet ?
+      if (rf == 3)
+      {
+        String aa = ResidueProperties.codonTranslate(new String(codon));
+        rf = 0;
+        if (aa == null)
+          aa = String.valueOf(gapCharacter);
+        else
+        {
+          if (aa.equals("STOP"))
+          {
+            aa = "X";
+          }
+          resSize++;
+        }
+        // insert/delete gaps prior to this codon - if necessary
+        boolean findpos = true;
+        while (findpos)
+        {
+          // first ensure that the codons array is long enough.
+          codons.checkCodonFrameWidth(aspos);
+          // now check to see if we place the aa at the current aspos in the
+          // protein alignment
+          switch (Dna.compare_codonpos(cdp, codons.codons[aspos]))
+          {
+          case -1:
+            codons.insertAAGap(aspos, gapCharacter);
+            findpos = false;
+            break;
+          case +1:
+            // this aa appears after the aligned codons at aspos, so prefix it
+            // with a gap
+            aa = "" + gapCharacter + aa;
+            aspos++;
+            if (aspos >= codons.aaWidth)
+              codons.aaWidth = aspos + 1;
+            break; // check the next position for alignment
+          case 0:
+            // codon aligns at aspos position.
+            findpos = false;
+          }
+        }
+        // codon aligns with all other sequence residues found at aspos
+        protein.append(aa);
+        lastnpos = npos;
+        if (codons.codons[aspos] == null)
+        {
+          // mark this column as aligning to this aligned reading frame
+          codons.codons[aspos] = new int[]
+          { cdp[0], cdp[1], cdp[2] };
+        }
+        aspos++;
+        if (aspos >= codons.aaWidth)
+          codons.aaWidth = aspos + 1;
+      }
+    }
+    if (resSize > 0)
+    {
+      SequenceI newseq = new Sequence(selection.getName(), protein
+              .toString());
+      if (rf != 0)
+      {
+        jalview.bin.Cache.log
+                .debug("trimming contigs for incomplete terminal codon.");
+        // map and trim contigs to ORF region
+        vc = scontigs.length - 1;
+        lastnpos = vismapping.shift(lastnpos); // place npos in context of
+                                                // whole dna alignment (rather
+                                                // than visible contigs)
+        // incomplete ORF could be broken over one or two visible contig
+        // intervals.
+        while (vc >= 0 && scontigs[vc] > lastnpos)
+        {
+          if (vc > 0 && scontigs[vc - 1] > lastnpos)
+          {
+            vc -= 2;
+          }
+          else
+          {
+            // correct last interval in list.
+            scontigs[vc] = lastnpos;
+          }
+        }
+
+        if (vc > 0 && (vc + 1) < scontigs.length)
+        {
+          // truncate map list to just vc elements
+          int t[] = new int[vc + 1];
+          System.arraycopy(scontigs, 0, t, 0, vc + 1);
+          scontigs = t;
+        }
+        if (vc <= 0)
+          scontigs = null;
+      }
+      if (scontigs != null)
+      {
+        npos = 0;
+        // Find sequence position for scontigs positions on the nucleotide
+        // sequence string we were passed.
+        for (vc = 0; vc < viscontigs.length; vc += 2)
+        {
+          scontigs[vc] = selection.findPosition(scontigs[vc]); // not from 1!
+          npos += viscontigs[vc];
+          scontigs[vc + 1] = selection
+                  .findPosition(npos + scontigs[vc + 1]); // exclusive
+          if (scontigs[vc + 1] == selection.getEnd())
+            break;
+        }
+        // trim trailing empty intervals.
+        if ((vc + 2) < scontigs.length)
+        {
+          int t[] = new int[vc + 2];
+          System.arraycopy(scontigs, 0, t, 0, vc + 2);
+          scontigs = t;
+        }
+
+        MapList map = new MapList(scontigs, new int[]
+        { 1, resSize }, 3, 1); // TODO: store mapping on newSeq for linked
+                                // DNA/Protein viewing.
+        transferCodedFeatures(selection, newseq, map, null, null);
+        SequenceI rseq = newseq.deriveSequence(); // construct a dataset
+                                                  // sequence for our new
+                                                  // peptide, regardless.
+        // store a mapping (this actually stores a mapping between the dataset
+        // sequences for the two sequences
+        codons.addMap(selection, newseq, map);
+        return rseq;
+      }
+    }
+    // register the mapping somehow
+    // 
+    return null;
+  }
+
+  /**
+   * Given a peptide newly translated from a dna sequence, copy over and set any
+   * features on the peptide from the DNA. If featureTypes is null, all features
+   * on the dna sequence are searched (rather than just the displayed ones), and
+   * similarly for featureGroups.
+   * 
+   * @param dna
+   * @param pep
+   * @param map
+   * @param featureTypes
+   *          hash who's keys are the displayed feature type strings
+   * @param featureGroups
+   *          hash where keys are feature groups and values are Boolean objects
+   *          indicating if they are displayed.
+   */
+  private static void transferCodedFeatures(SequenceI dna, SequenceI pep,
+          MapList map, Hashtable featureTypes, Hashtable featureGroups)
+  {
+    SequenceFeature[] sf = dna.getDatasetSequence().getSequenceFeatures();
+    Boolean fgstate;
+    jalview.datamodel.DBRefEntry[] dnarefs = jalview.util.DBRefUtils
+            .selectRefs(dna.getDBRef(),
+                    jalview.datamodel.DBRefSource.DNACODINGDBS);
+    if (dnarefs != null)
+    {
+      // intersect with pep
+      for (int d = 0; d < dnarefs.length; d++)
+      {
+        Mapping mp = dnarefs[d].getMap();
+        if (mp != null)
+        {
+        }
+      }
+    }
+    if (sf != null)
+    {
+      for (int f = 0; f < sf.length; f++)
+      {
+        fgstate = (featureGroups == null) ? null : ((Boolean) featureGroups
+                .get(sf[f].featureGroup));
+        if ((featureTypes == null || featureTypes.containsKey(sf[f]
+                .getType()))
+                && (fgstate == null || fgstate.booleanValue()))
+        {
+          if (FeatureProperties.isCodingFeature(null, sf[f].getType()))
+          {
+            // if (map.intersectsFrom(sf[f].begin, sf[f].end))
+            {
+
+            }
+          }
+        }
+      }
+    }
   }
 }
-- 
1.7.10.2