src/jalview/datamodel/AlignedCodonFrame.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.2)
   3  * Copyright (C) 2014 The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.datamodel;
  22
  23 import jalview.util.MapList;
  24
  25 import java.util.ArrayList;
  26 import java.util.List;
  27
  28 /**
  29  * Stores mapping between the columns of a protein alignment and a DNA alignment
  30  * and a list of individual codon to amino acid mappings between sequences.
  31  */
  32 public class AlignedCodonFrame
  33 {
  34
  35   /**
  36    * <pre>
  37    * Aligned nucleotide positions for codons mapped to column positions of of aligned
  38    * proteins. e.g.
  39    * codons[3] = [12, 14, 15] means:
  40    *     column 4 in the protein alignment translates cols 13, 15, 16 in cDNA
  41    * codons[5] = null means column 6 in the protein alignment is a gap
  42    * </pre>
  43    */
  44   public int[][] codons = null;
  45
  46   /**
  47    * Width of protein sequence alignment (implicit assertion that codons.length
  48    * >= aaWidth)
  49    */
  50   public int aaWidth = 0;
  51
  52   /*
  53    * TODO: not an ideal solution - we reference the aligned amino acid sequences
  54    * in order to make insertions on them Better would be dnaAlignment and
  55    * aaAlignment reference....
  56    */
  57   private List<SequenceI> a_aaSeqs = new ArrayList<SequenceI>();
  58
  59   /*
  60    * tied array of na Sequence objects.
  61    */
  62   private SequenceI[] dnaSeqs = null;
  63
  64   /*
  65    * tied array of Mappings to protein sequence Objects and SequenceI[]
  66    * aaSeqs=null; MapLists where eac maps from the corresponding dnaSeqs element
  67    * to corresponding aaSeqs element
  68    */
  69   private Mapping[] dnaToProt = null;
  70
  71   /**
  72    * initialise codon frame with a nominal alignment width
  73    *
  74    * @param aWidth
  75    */
  76   public AlignedCodonFrame(int aWidth)
  77   {
  78     if (aWidth <= 0)
  79     {
  80       codons = null;
  81       return;
  82     }
  83     codons = new int[aWidth][];
  84     for (int res = 0; res < aWidth; res++)
  85     {
  86       codons[res] = null;
  87     }
  88   }
  89
  90   /**
  91    * Construct a 'near copy' of the given AlignedCodonFrame, that references the
  92    * same dataset sequences, but the given protein aligned sequences.
  93    *
  94    * @param acf
  95    * @param alignment
  96    * @throws IllegalStateException
  97    *           if the copied mapping references any dataset not in the alignment
  98    */
  99   public AlignedCodonFrame(AlignedCodonFrame acf, SequenceI[] alignment)
 100   {
 101     this.codons = acf.codons;
 102     this.dnaSeqs = acf.dnaSeqs;
 103     this.dnaToProt = acf.dnaToProt;
 104
 105     for (SequenceI seq : acf.a_aaSeqs)
 106     {
 107       boolean found = false;
 108       // TODO may not correctly handle the case where the same sequence appears
 109       // twice in the source alignment i.e. same dataset sequence
 110       // the copy will reference the first aligned sequence for both
 111       // ?not solvable if realignment may reorder the sequences
 112       // or check on sequence name as well????
 113       for (SequenceI newseq : alignment)
 114       {
 115         if (seq.getDatasetSequence() == newseq.getDatasetSequence())
 116         {
 117           this.a_aaSeqs.add(newseq);
 118           found = true;
 119           break;
 120         }
 121       }
 122       if (!found)
 123       {
 124         throw new IllegalStateException("Copying codon mapping for"
 125                 + seq.getSequenceAsString());
 126       }
 127     }
 128   }
 129
 130   /**
 131    * ensure that codons array is at least as wide as aslen residues
 132    *
 133    * @param aslen
 134    * @return (possibly newly expanded) codon array
 135    */
 136   public int[][] checkCodonFrameWidth(int aslen)
 137   {
 138     // TODO why not codons.length < aslen ?
 139     // should codons expand if length is 2 or 3 and aslen==2 ?
 140     if (codons.length <= aslen + 1)
 141     {
 142       // probably never have to do this ?
 143       int[][] c = new int[codons.length + 10][];
 144       for (int i = 0; i < codons.length; i++)
 145       {
 146         c[i] = codons[i];
 147         codons[i] = null;
 148       }
 149       codons = c;
 150     }
 151     return codons;
 152   }
 153
 154   /**
 155    * @return width of aligned translated amino acid residues
 156    */
 157   public int getaaWidth()
 158   {
 159     return aaWidth;
 160   }
 161
 162   /**
 163    * increase aaWidth by one and insert a new aligned codon position space at
 164    * aspos.
 165    *
 166    * @param aspos
 167    */
 168   public void insertAAGap(int aspos, char gapCharacter)
 169   {
 170     // this aa appears before the aligned codons at aspos - so shift them in
 171     // each pair of mapped sequences
 172     aaWidth++;
 173     // we actually have to modify the aligned sequences here, so use the
 174     // a_aaSeqs vector
 175     for (SequenceI seq : a_aaSeqs)
 176     {
 177       seq.insertCharAt(aspos, gapCharacter);
 178     }
 179
 180     if (aspos < aaWidth)
 181     {
 182       aaWidth++;
 183       System.arraycopy(codons, aspos, codons, aspos + 1, codons.length
 184               - aspos - 1);
 185       codons[aspos] = null; // clear so new codon position can be marked.
 186     }
 187   }
 188
 189   public void setAaWidth(int aapos)
 190   {
 191     aaWidth = aapos;
 192   }
 193
 194   /**
 195    * add a mapping between the dataset sequences for the associated dna and
 196    * protein sequence objects
 197    *
 198    * @param dnaseq
 199    * @param aaseq
 200    * @param map
 201    */
 202   public void addMap(SequenceI dnaseq, SequenceI aaseq, MapList map)
 203   {
 204     int nlen = 1;
 205     if (dnaSeqs != null)
 206     {
 207       nlen = dnaSeqs.length + 1;
 208     }
 209     SequenceI[] ndna = new SequenceI[nlen];
 210     Mapping[] ndtp = new Mapping[nlen];
 211     if (dnaSeqs != null)
 212     {
 213       System.arraycopy(dnaSeqs, 0, ndna, 0, dnaSeqs.length);
 214       System.arraycopy(dnaToProt, 0, ndtp, 0, dnaSeqs.length);
 215     }
 216     dnaSeqs = ndna;
 217     dnaToProt = ndtp;
 218     nlen--;
 219     dnaSeqs[nlen] = (dnaseq.getDatasetSequence() == null) ? dnaseq : dnaseq
 220             .getDatasetSequence();
 221     Mapping mp = new Mapping(map);
 222     // JBPNote DEBUG! THIS !
 223     // dnaseq.transferAnnotation(aaseq, mp);
 224     // aaseq.transferAnnotation(dnaseq, new Mapping(map.getInverse()));
 225     mp.to = (aaseq.getDatasetSequence() == null) ? aaseq : aaseq
 226             .getDatasetSequence();
 227     a_aaSeqs.add(aaseq);
 228     dnaToProt[nlen] = mp;
 229   }
 230
 231   public SequenceI[] getdnaSeqs()
 232   {
 233     return dnaSeqs;
 234   }
 235
 236   public SequenceI[] getAaSeqs()
 237   {
 238     if (dnaToProt == null)
 239     {
 240       return null;
 241     }
 242     SequenceI[] sqs = new SequenceI[dnaToProt.length];
 243     for (int sz = 0; sz < dnaToProt.length; sz++)
 244     {
 245       sqs[sz] = dnaToProt[sz].to;
 246     }
 247     return sqs;
 248   }
 249
 250   public MapList[] getdnaToProt()
 251   {
 252     if (dnaToProt == null)
 253     {
 254       return null;
 255     }
 256     MapList[] sqs = new MapList[dnaToProt.length];
 257     for (int sz = 0; sz < dnaToProt.length; sz++)
 258     {
 259       sqs[sz] = dnaToProt[sz].map;
 260     }
 261     return sqs;
 262   }
 263
 264   public Mapping[] getProtMappings()
 265   {
 266     return dnaToProt;
 267   }
 268
 269   /**
 270    *
 271    * @param sequenceRef
 272    * @return null or corresponding aaSeq dataset sequence for dnaSeq entry
 273    */
 274   public SequenceI getAaForDnaSeq(SequenceI dnaSeqRef)
 275   {
 276     return getAaForDnaSeq(dnaSeqRef, true);
 277   }
 278
 279   /**
 280    * Return the corresponding aligned or dataset aa sequence for given dna
 281    * sequence, null if not found.
 282    *
 283    * @param sequenceRef
 284    * @param returnDataset
 285    *          if true, return the aa dataset, else the aligned sequence
 286    * @return
 287    */
 288   public SequenceI getAaForDnaSeq(SequenceI dnaSeqRef, boolean returnDataset)
 289   {
 290     if (dnaSeqs == null)
 291     {
 292       return null;
 293     }
 294     SequenceI dnads = dnaSeqRef.getDatasetSequence();
 295     for (int ds = 0; ds < dnaSeqs.length; ds++)
 296     {
 297       if (dnaSeqs[ds] == dnaSeqRef || dnaSeqs[ds] == dnads)
 298       {
 299         if (returnDataset)
 300         {
 301           return dnaToProt[ds].to;
 302         }
 303         else
 304         {
 305           // TODO very fragile - depends on dnaSeqs, dnaToProt, a_aaSeqs moving
 306           // in parallel; revise data model to guarantee this
 307           return a_aaSeqs.get(ds);
 308         }
 309       }
 310     }
 311     return null;
 312   }
 313
 314   /**
 315    *
 316    * @param sequenceRef
 317    * @return null or corresponding aaSeq entry for dnaSeq entry
 318    */
 319   public SequenceI getDnaForAaSeq(SequenceI aaSeqRef)
 320   {
 321     if (dnaToProt == null)
 322     {
 323       return null;
 324     }
 325     SequenceI aads = aaSeqRef.getDatasetSequence();
 326     for (int as = 0; as < dnaToProt.length; as++)
 327     {
 328       if (dnaToProt[as].to == aaSeqRef || dnaToProt[as].to == aads)
 329       {
 330         return dnaSeqs[as];
 331       }
 332     }
 333     return null;
 334   }
 335
 336   /**
 337    * test to see if codon frame involves seq in any way
 338    *
 339    * @param seq
 340    *          a nucleotide or protein sequence
 341    * @return true if a mapping exists to or from this sequence to any translated
 342    *         sequence
 343    */
 344   public boolean involvesSequence(SequenceI seq)
 345   {
 346     return getAaForDnaSeq(seq) != null || getDnaForAaSeq(seq) != null;
 347   }
 348
 349   /**
 350    * Add search results for regions in other sequences that translate or are
 351    * translated from a particular position in seq
 352    *
 353    * @param seq
 354    * @param index
 355    *          position in seq
 356    * @param results
 357    *          where highlighted regions go
 358    */
 359   public void markMappedRegion(SequenceI seq, int index,
 360           SearchResults results)
 361   {
 362     if (dnaToProt == null)
 363     {
 364       return;
 365     }
 366     int[] codon;
 367     SequenceI ds = seq.getDatasetSequence();
 368     for (int mi = 0; mi < dnaToProt.length; mi++)
 369     {
 370       if (dnaSeqs[mi] == seq || dnaSeqs[mi] == ds)
 371       {
 372         // DEBUG System.err.println("dna pos "+index);
 373         codon = dnaToProt[mi].map.locateInTo(index, index);
 374         if (codon != null)
 375         {
 376           for (int i = 0; i < codon.length; i += 2)
 377           {
 378             results.addResult(dnaToProt[mi].to, codon[i], codon[i + 1]);
 379           }
 380         }
 381       }
 382       else if (dnaToProt[mi].to == seq || dnaToProt[mi].to == ds)
 383       {
 384         // DEBUG System.err.println("aa pos "+index);
 385         {
 386           codon = dnaToProt[mi].map.locateInFrom(index, index);
 387           if (codon != null)
 388           {
 389             for (int i = 0; i < codon.length; i += 2)
 390             {
 391               results.addResult(dnaSeqs[mi], codon[i], codon[i + 1]);
 392             }
 393           }
 394         }
 395       }
 396     }
 397   }
 398
 399   /**
 400    * Returns the DNA codon positions (base 1) for the given position (base 1) in
 401    * a mapped protein sequence, or null if no mapping is found.
 402    *
 403    * Intended for use in aligning cDNA to match aligned protein. Only the first
 404    * mapping found is returned, so not suitable for use if multiple protein
 405    * sequences are mapped to the same cDNA (but aligning cDNA as protein is
 406    * ill-defined for this case anyway).
 407    *
 408    * @param seq
 409    *          the DNA dataset sequence
 410    * @param aaPos
 411    *          residue position (base 1) in a protein sequence
 412    * @return
 413    */
 414   public int[] getDnaPosition(SequenceI seq, int aaPos)
 415   {
 416     /*
 417      * Adapted from markMappedRegion().
 418      */
 419     MapList ml = null;
 420     for (int i = 0; i < dnaToProt.length; i++)
 421     {
 422       if (dnaSeqs[i] == seq)
 423       {
 424         ml = getdnaToProt()[i];
 425         break;
 426       }
 427     }
 428     return ml == null ? null : ml.locateInFrom(aaPos, aaPos);
 429   }
 430 }