src/jalview/datamodel/AlignedCodonFrame.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.2)
   3  * Copyright (C) 2014 The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.datamodel;
  22
  23 import jalview.util.MapList;
  24
  25 import java.util.ArrayList;
  26 import java.util.List;
  27
  28 /**
  29  * Stores mapping between the columns of a protein alignment and a DNA alignment
  30  * and a list of individual codon to amino acid mappings between sequences.
  31  */
  32 public class AlignedCodonFrame
  33 {
  34
  35   /**
  36    * <pre>
  37    * Aligned nucleotide positions for codons mapped to column positions of of aligned
  38    * proteins. e.g.
  39    * codons[3] = [12, 14, 15] means:
  40    *     column 4 in the protein alignment translates cols 13, 15, 16 in cDNA
  41    * codons[5] = null means column 6 in the protein alignment is a gap
  42    * </pre>
  43    */
  44   public int[][] codons = null;
  45
  46   /**
  47    * Width of protein sequence alignment (implicit assertion that codons.length
  48    * >= aaWidth)
  49    */
  50   public int aaWidth = 0;
  51
  52   /*
  53    * TODO: not an ideal solution - we reference the aligned amino acid sequences
  54    * in order to make insertions on them Better would be dnaAlignment and
  55    * aaAlignment reference....
  56    */
  57   private List<SequenceI> a_aaSeqs = new ArrayList<SequenceI>();
  58
  59   /*
  60    * tied array of na Sequence objects.
  61    */
  62   private SequenceI[] dnaSeqs = null;
  63
  64   /*
  65    * tied array of Mappings to protein sequence Objects and SequenceI[]
  66    * aaSeqs=null; MapLists where eac maps from the corresponding dnaSeqs element
  67    * to corresponding aaSeqs element
  68    */
  69   private Mapping[] dnaToProt = null;
  70
  71   /**
  72    * initialise codon frame with a nominal alignment width
  73    *
  74    * @param aWidth
  75    */
  76   public AlignedCodonFrame()
  77   {
  78   }
  79
  80   /**
  81    * Construct a 'near copy' of the given AlignedCodonFrame, that references the
  82    * same dataset sequences, but the given protein aligned sequences.
  83    *
  84    * @param acf
  85    * @param alignment
  86    * @throws IllegalStateException
  87    *           if the copied mapping references any dataset not in the alignment
  88    */
  89   public AlignedCodonFrame(AlignedCodonFrame acf, SequenceI[] alignment)
  90   {
  91     this.codons = acf.codons;
  92     this.dnaSeqs = acf.dnaSeqs;
  93     this.dnaToProt = acf.dnaToProt;
  94
  95     for (SequenceI seq : acf.a_aaSeqs)
  96     {
  97       boolean found = false;
  98       // TODO may not correctly handle the case where the same sequence appears
  99       // twice in the source alignment i.e. same dataset sequence
 100       // the copy will reference the first aligned sequence for both
 101       // ?not solvable if realignment may reorder the sequences
 102       // or check on sequence name as well????
 103       for (SequenceI newseq : alignment)
 104       {
 105         if (seq.getDatasetSequence() == newseq.getDatasetSequence())
 106         {
 107           this.a_aaSeqs.add(newseq);
 108           found = true;
 109           break;
 110         }
 111       }
 112       if (!found)
 113       {
 114         throw new IllegalStateException("Copying codon mapping for"
 115                 + seq.getSequenceAsString());
 116       }
 117     }
 118   }
 119
 120   /**
 121    * ensure that codons array is at least as wide as aslen residues
 122    *
 123    * @param aslen
 124    * @return (possibly newly expanded) codon array
 125    */
 126   public int[][] checkCodonFrameWidth(int aslen)
 127   {
 128     // TODO why not codons.length < aslen ?
 129     // should codons expand if length is 2 or 3 and aslen==2 ?
 130     if (codons.length <= aslen + 1)
 131     {
 132       // probably never have to do this ?
 133       int[][] c = new int[codons.length + 10][];
 134       for (int i = 0; i < codons.length; i++)
 135       {
 136         c[i] = codons[i];
 137         codons[i] = null;
 138       }
 139       codons = c;
 140     }
 141     return codons;
 142   }
 143
 144   /**
 145    * @return width of aligned translated amino acid residues
 146    */
 147   public int getaaWidth()
 148   {
 149     return aaWidth;
 150   }
 151
 152   /**
 153    * increase aaWidth by one and insert a new aligned codon position space at
 154    * aspos.
 155    *
 156    * @param aspos
 157    */
 158   public void insertAAGap(int aspos, char gapCharacter)
 159   {
 160     // this aa appears before the aligned codons at aspos - so shift them in
 161     // each pair of mapped sequences
 162     aaWidth++;
 163     // we actually have to modify the aligned sequences here, so use the
 164     // a_aaSeqs vector
 165     for (SequenceI seq : a_aaSeqs)
 166     {
 167       seq.insertCharAt(aspos, gapCharacter);
 168     }
 169
 170     if (aspos < aaWidth)
 171     {
 172       aaWidth++;
 173       System.arraycopy(codons, aspos, codons, aspos + 1, codons.length
 174               - aspos - 1);
 175       codons[aspos] = null; // clear so new codon position can be marked.
 176     }
 177   }
 178
 179   public void setAaWidth(int aapos)
 180   {
 181     aaWidth = aapos;
 182   }
 183
 184   /**
 185    * add a mapping between the dataset sequences for the associated dna and
 186    * protein sequence objects
 187    *
 188    * @param dnaseq
 189    * @param aaseq
 190    * @param map
 191    */
 192   public void addMap(SequenceI dnaseq, SequenceI aaseq, MapList map)
 193   {
 194     int nlen = 1;
 195     if (dnaSeqs != null)
 196     {
 197       nlen = dnaSeqs.length + 1;
 198     }
 199     SequenceI[] ndna = new SequenceI[nlen];
 200     Mapping[] ndtp = new Mapping[nlen];
 201     if (dnaSeqs != null)
 202     {
 203       System.arraycopy(dnaSeqs, 0, ndna, 0, dnaSeqs.length);
 204       System.arraycopy(dnaToProt, 0, ndtp, 0, dnaSeqs.length);
 205     }
 206     dnaSeqs = ndna;
 207     dnaToProt = ndtp;
 208     nlen--;
 209     dnaSeqs[nlen] = (dnaseq.getDatasetSequence() == null) ? dnaseq : dnaseq
 210             .getDatasetSequence();
 211     Mapping mp = new Mapping(map);
 212     // JBPNote DEBUG! THIS !
 213     // dnaseq.transferAnnotation(aaseq, mp);
 214     // aaseq.transferAnnotation(dnaseq, new Mapping(map.getInverse()));
 215     mp.to = (aaseq.getDatasetSequence() == null) ? aaseq : aaseq
 216             .getDatasetSequence();
 217     a_aaSeqs.add(aaseq);
 218     dnaToProt[nlen] = mp;
 219   }
 220
 221   public SequenceI[] getdnaSeqs()
 222   {
 223     return dnaSeqs;
 224   }
 225
 226   public SequenceI[] getAaSeqs()
 227   {
 228     if (dnaToProt == null)
 229     {
 230       return null;
 231     }
 232     SequenceI[] sqs = new SequenceI[dnaToProt.length];
 233     for (int sz = 0; sz < dnaToProt.length; sz++)
 234     {
 235       sqs[sz] = dnaToProt[sz].to;
 236     }
 237     return sqs;
 238   }
 239
 240   public MapList[] getdnaToProt()
 241   {
 242     if (dnaToProt == null)
 243     {
 244       return null;
 245     }
 246     MapList[] sqs = new MapList[dnaToProt.length];
 247     for (int sz = 0; sz < dnaToProt.length; sz++)
 248     {
 249       sqs[sz] = dnaToProt[sz].map;
 250     }
 251     return sqs;
 252   }
 253
 254   public Mapping[] getProtMappings()
 255   {
 256     return dnaToProt;
 257   }
 258
 259   /**
 260    *
 261    * @param sequenceRef
 262    * @return null or corresponding aaSeq dataset sequence for dnaSeq entry
 263    */
 264   public SequenceI getAaForDnaSeq(SequenceI dnaSeqRef)
 265   {
 266     return getAaForDnaSeq(dnaSeqRef, true);
 267   }
 268
 269   /**
 270    * Return the corresponding aligned or dataset aa sequence for given dna
 271    * sequence, null if not found.
 272    *
 273    * @param sequenceRef
 274    * @param returnDataset
 275    *          if true, return the aa dataset, else the aligned sequence
 276    * @return
 277    */
 278   public SequenceI getAaForDnaSeq(SequenceI dnaSeqRef, boolean returnDataset)
 279   {
 280     if (dnaSeqs == null)
 281     {
 282       return null;
 283     }
 284     SequenceI dnads = dnaSeqRef.getDatasetSequence();
 285     for (int ds = 0; ds < dnaSeqs.length; ds++)
 286     {
 287       if (dnaSeqs[ds] == dnaSeqRef || dnaSeqs[ds] == dnads)
 288       {
 289         if (returnDataset)
 290         {
 291           return dnaToProt[ds].to;
 292         }
 293         else
 294         {
 295           // TODO very fragile - depends on dnaSeqs, dnaToProt, a_aaSeqs moving
 296           // in parallel; revise data model to guarantee this
 297           return a_aaSeqs.get(ds);
 298         }
 299       }
 300     }
 301     return null;
 302   }
 303
 304   /**
 305    *
 306    * @param sequenceRef
 307    * @return null or corresponding aaSeq entry for dnaSeq entry
 308    */
 309   public SequenceI getDnaForAaSeq(SequenceI aaSeqRef)
 310   {
 311     if (dnaToProt == null)
 312     {
 313       return null;
 314     }
 315     SequenceI aads = aaSeqRef.getDatasetSequence();
 316     for (int as = 0; as < dnaToProt.length; as++)
 317     {
 318       if (dnaToProt[as].to == aaSeqRef || dnaToProt[as].to == aads)
 319       {
 320         return dnaSeqs[as];
 321       }
 322     }
 323     return null;
 324   }
 325
 326   /**
 327    * test to see if codon frame involves seq in any way
 328    *
 329    * @param seq
 330    *          a nucleotide or protein sequence
 331    * @return true if a mapping exists to or from this sequence to any translated
 332    *         sequence
 333    */
 334   public boolean involvesSequence(SequenceI seq)
 335   {
 336     return getAaForDnaSeq(seq) != null || getDnaForAaSeq(seq) != null;
 337   }
 338
 339   /**
 340    * Add search results for regions in other sequences that translate or are
 341    * translated from a particular position in seq
 342    *
 343    * @param seq
 344    * @param index
 345    *          position in seq
 346    * @param results
 347    *          where highlighted regions go
 348    */
 349   public void markMappedRegion(SequenceI seq, int index,
 350           SearchResults results)
 351   {
 352     if (dnaToProt == null)
 353     {
 354       return;
 355     }
 356     int[] codon;
 357     SequenceI ds = seq.getDatasetSequence();
 358     for (int mi = 0; mi < dnaToProt.length; mi++)
 359     {
 360       if (dnaSeqs[mi] == seq || dnaSeqs[mi] == ds)
 361       {
 362         // DEBUG System.err.println("dna pos "+index);
 363         codon = dnaToProt[mi].map.locateInTo(index, index);
 364         if (codon != null)
 365         {
 366           for (int i = 0; i < codon.length; i += 2)
 367           {
 368             results.addResult(dnaToProt[mi].to, codon[i], codon[i + 1]);
 369           }
 370         }
 371       }
 372       else if (dnaToProt[mi].to == seq || dnaToProt[mi].to == ds)
 373       {
 374         // DEBUG System.err.println("aa pos "+index);
 375         {
 376           codon = dnaToProt[mi].map.locateInFrom(index, index);
 377           if (codon != null)
 378           {
 379             for (int i = 0; i < codon.length; i += 2)
 380             {
 381               results.addResult(dnaSeqs[mi], codon[i], codon[i + 1]);
 382             }
 383           }
 384         }
 385       }
 386     }
 387   }
 388
 389   /**
 390    * Returns the DNA codon positions (base 1) for the given position (base 1) in
 391    * a mapped protein sequence, or null if no mapping is found.
 392    *
 393    * Intended for use in aligning cDNA to match aligned protein. Only the first
 394    * mapping found is returned, so not suitable for use if multiple protein
 395    * sequences are mapped to the same cDNA (but aligning cDNA as protein is
 396    * ill-defined for this case anyway).
 397    *
 398    * @param seq
 399    *          the DNA dataset sequence
 400    * @param aaPos
 401    *          residue position (base 1) in a protein sequence
 402    * @return
 403    */
 404   public int[] getDnaPosition(SequenceI seq, int aaPos)
 405   {
 406     /*
 407      * Adapted from markMappedRegion().
 408      */
 409     MapList ml = null;
 410     for (int i = 0; i < dnaToProt.length; i++)
 411     {
 412       if (dnaSeqs[i] == seq)
 413       {
 414         ml = getdnaToProt()[i];
 415         break;
 416       }
 417     }
 418     return ml == null ? null : ml.locateInFrom(aaPos, aaPos);
 419   }
 420 }