2 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.2)
3 * Copyright (C) 2014 The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import jalview.bin.Cache;
24 import jalview.datamodel.AlignedCodonFrame;
25 import jalview.datamodel.Alignment;
26 import jalview.datamodel.AlignmentAnnotation;
27 import jalview.datamodel.AlignmentI;
28 import jalview.datamodel.Annotation;
29 import jalview.datamodel.DBRefEntry;
30 import jalview.datamodel.FeatureProperties;
31 import jalview.datamodel.GraphLine;
32 import jalview.datamodel.Mapping;
33 import jalview.datamodel.Sequence;
34 import jalview.datamodel.SequenceFeature;
35 import jalview.datamodel.SequenceI;
36 import jalview.schemes.ResidueProperties;
37 import jalview.util.Comparison;
38 import jalview.util.DBRefUtils;
39 import jalview.util.MapList;
40 import jalview.util.ShiftList;
42 import java.util.ArrayList;
43 import java.util.Arrays;
44 import java.util.Hashtable;
45 import java.util.List;
50 * Test whether codon positions cdp1 should align before, with, or after cdp2.
51 * Returns zero if all positions match (or either argument is null). Returns
52 * -1 if any position in the first codon precedes the corresponding position
53 * in the second codon. Else returns +1 (some position in the second codon
54 * precedes the corresponding position in the first).
56 * Note this is not necessarily symmetric, for example:
58 * <li>compareCodonPos([2,5,6], [3,4,5]) returns -1</li>
59 * <li>compareCodonPos([3,4,5], [2,5,6]) also returns -1</li>
66 public static int compareCodonPos(int[] cdp1, int[] cdp2)
70 || (cdp1[0] == cdp2[0] && cdp1[1] == cdp2[1] && cdp1[2] == cdp2[2]))
74 if (cdp1[0] < cdp2[0] || cdp1[1] < cdp2[1] || cdp1[2] < cdp2[2])
76 // one base in cdp1 precedes the corresponding base in the other codon
79 // one base in cdp1 appears after the corresponding base in the other codon.
84 * DNA->mapped protein sequence alignment translation given set of sequences
85 * 1. id distinct coding regions within selected region for each sequence 2.
86 * generate peptides based on inframe (or given) translation or (optionally
87 * and where specified) out of frame translations (annotated appropriately) 3.
88 * align peptides based on codon alignment
91 * id potential products from dna 1. search for distinct products within
92 * selected region for each selected sequence 2. group by associated DB type.
93 * 3. return as form for input into above function
99 * create a new alignment of protein sequences by an inframe translation of
100 * the provided NA sequences
105 * @param gapCharacter
109 * destination dataset for translated sequences and mappings
112 public static AlignmentI cdnaTranslate(SequenceI[] selection,
113 String[] seqstring, int viscontigs[], char gapCharacter,
114 AlignmentAnnotation[] annotations, int aWidth, Alignment dataset)
116 return cdnaTranslate(Arrays.asList(selection), seqstring, null,
117 viscontigs, gapCharacter, annotations, aWidth, dataset);
125 * - array of DbRefEntry objects from which exon map in seqstring is
128 * @param gapCharacter
134 public static AlignmentI cdnaTranslate(List<SequenceI> cdnaseqs,
135 String[] seqstring, DBRefEntry[] product, int viscontigs[],
136 char gapCharacter, AlignmentAnnotation[] annotations, int aWidth,
139 AlignedCodonFrame acf = new AlignedCodonFrame(aWidth);
142 * This array will be built up so that position i holds the codon positions
143 * e.g. [7, 9, 10] that match column i (base 0) in the aligned translation.
144 * Note this implies a contract that if two codons do not align exactly,
145 * their translated products must occupy different column positions.
147 int[][] alignedCodons = new int[aWidth][];
150 int sSize = cdnaseqs.size();
151 List<SequenceI> pepseqs = new ArrayList<SequenceI>();
152 for (s = 0; s < sSize; s++)
154 SequenceI newseq = translateCodingRegion(cdnaseqs.get(s),
155 seqstring[s], viscontigs, acf, alignedCodons, gapCharacter,
161 SequenceI ds = newseq;
164 while (ds.getDatasetSequence() != null)
166 ds = ds.getDatasetSequence();
168 dataset.addSequence(ds);
173 SequenceI[] newseqs = pepseqs.toArray(new SequenceI[pepseqs.size()]);
174 AlignmentI al = new Alignment(newseqs);
175 // ensure we look aligned.
177 // link the protein translation to the DNA dataset
178 al.setDataset(dataset);
179 translateAlignedAnnotations(annotations, al, acf, alignedCodons);
180 al.addCodonFrame(acf);
185 * fake the collection of DbRefs with associated exon mappings to identify if
186 * a translation would generate distinct product in the currently selected
193 public static boolean canTranslate(SequenceI[] selection,
196 for (int gd = 0; gd < selection.length; gd++)
198 SequenceI dna = selection[gd];
199 DBRefEntry[] dnarefs = DBRefUtils
200 .selectRefs(dna.getDBRef(),
201 jalview.datamodel.DBRefSource.DNACODINGDBS);
204 // intersect with pep
205 List<DBRefEntry> mappedrefs = new ArrayList<DBRefEntry>();
206 DBRefEntry[] refs = dna.getDBRef();
207 for (int d = 0; d < refs.length; d++)
209 if (refs[d].getMap() != null && refs[d].getMap().getMap() != null
210 && refs[d].getMap().getMap().getFromRatio() == 3
211 && refs[d].getMap().getMap().getToRatio() == 1)
213 mappedrefs.add(refs[d]); // add translated protein maps
216 dnarefs = mappedrefs.toArray(new DBRefEntry[mappedrefs.size()]);
217 for (int d = 0; d < dnarefs.length; d++)
219 Mapping mp = dnarefs[d].getMap();
222 for (int vc = 0; vc < viscontigs.length; vc += 2)
224 int[] mpr = mp.locateMappedRange(viscontigs[vc],
239 * Translate na alignment annotations onto translated amino acid alignment al
240 * using codon mapping codons
246 protected static void translateAlignedAnnotations(
247 AlignmentAnnotation[] annotations, AlignmentI al,
248 AlignedCodonFrame acf, int[][] codons)
250 // Can only do this for columns with consecutive codons, or where
251 // annotation is sequence associated.
253 if (annotations != null)
255 for (AlignmentAnnotation annotation : annotations)
258 * Skip hidden or autogenerated annotation. Also (for now), RNA
259 * secondary structure annotation. If we want to show this against
260 * protein we need a smarter way to 'translate' without generating
261 * invalid (unbalanced) structure annotation.
263 if (annotation.autoCalculated || !annotation.visible
264 || annotation.isRNA())
269 int aSize = acf.getaaWidth(); // aa alignment width.
270 Annotation[] anots = (annotation.annotations == null) ? null
271 : new Annotation[aSize];
274 for (int a = 0; a < aSize; a++)
276 // process through codon map.
277 if (a < codons.length && codons[a] != null
278 && codons[a][0] == (codons[a][2] - 2))
280 anots[a] = getCodonAnnotation(codons[a],
281 annotation.annotations);
286 AlignmentAnnotation aa = new AlignmentAnnotation(annotation.label,
287 annotation.description, anots);
288 aa.graph = annotation.graph;
289 aa.graphGroup = annotation.graphGroup;
290 aa.graphHeight = annotation.graphHeight;
291 if (annotation.getThreshold() != null)
293 aa.setThreshold(new GraphLine(annotation
296 if (annotation.hasScore)
298 aa.setScore(annotation.getScore());
301 final SequenceI seqRef = annotation.sequenceRef;
304 SequenceI aaSeq = acf.getAaForDnaSeq(seqRef);
307 // aa.compactAnnotationArray(); // throw away alignment annotation
309 aa.setSequenceRef(aaSeq);
311 aa.createSequenceMapping(aaSeq, aaSeq.getStart(), true);
312 aa.adjustForAlignment();
313 aaSeq.addAlignmentAnnotation(aa);
316 al.addAnnotation(aa);
321 private static Annotation getCodonAnnotation(int[] is,
322 Annotation[] annotations)
324 // Have a look at all the codon positions for annotation and put the first
325 // one found into the translated annotation pos.
327 Annotation annot = null;
328 for (int p = 0; p < 3; p++)
330 if (annotations[is[p]] != null)
334 annot = new Annotation(annotations[is[p]]);
340 Annotation cpy = new Annotation(annotations[is[p]]);
341 if (annot.colour == null)
343 annot.colour = cpy.colour;
345 if (annot.description == null || annot.description.length() == 0)
347 annot.description = cpy.description;
349 if (annot.displayCharacter == null)
351 annot.displayCharacter = cpy.displayCharacter;
353 if (annot.secondaryStructure == 0)
355 annot.secondaryStructure = cpy.secondaryStructure;
357 annot.value += cpy.value;
364 annot.value /= contrib;
370 * Translate a na sequence
373 * sequence displayed under viscontigs visible columns
375 * ORF read in some global alignment reference frame
377 * mapping from global reference frame to visible seqstring ORF read
379 * Definition of global ORF alignment reference frame
380 * @param alignedCodons
381 * @param gapCharacter
383 * when true stop codons will translate as '*', otherwise as 'X'
384 * @return sequence ready to be added to alignment.
386 protected static SequenceI translateCodingRegion(SequenceI selection,
387 String seqstring, int[] viscontigs, AlignedCodonFrame acf,
388 int[][] alignedCodons, char gapCharacter,
389 final boolean starForStop)
391 List<int[]> skip = new ArrayList<int[]>();
392 int skipint[] = null;
393 ShiftList vismapping = new ShiftList(); // map from viscontigs to seqstring
395 int vc, scontigs[] = new int[viscontigs.length];
397 for (vc = 0; vc < viscontigs.length; vc += 2)
401 vismapping.addShift(npos, viscontigs[vc]);
406 vismapping.addShift(npos, viscontigs[vc] - viscontigs[vc - 1] + 1);
408 scontigs[vc] = viscontigs[vc];
409 scontigs[vc + 1] = viscontigs[vc + 1];
412 // allocate a roughly sized buffer for the protein sequence
413 StringBuilder protein = new StringBuilder(seqstring.length() / 2);
414 String seq = seqstring.replace('U', 'T').replace('u', 'T');
415 char codon[] = new char[3];
416 int cdp[] = new int[3], rf = 0, lastnpos = 0, nend;
419 for (npos = 0, nend = seq.length(); npos < nend; npos++)
421 if (!Comparison.isGap(seq.charAt(npos)))
423 cdp[rf] = npos; // store position
424 codon[rf++] = seq.charAt(npos); // store base
429 * Filled up a reading frame...
431 String aa = ResidueProperties.codonTranslate(new String(codon));
433 final String gapString = String.valueOf(gapCharacter);
449 skipint[0] = vismapping.shift(skipint[0]);
450 skipint[1] = vismapping.shift(skipint[1]);
451 for (vc = 0; vc < scontigs.length;)
453 if (scontigs[vc + 1] < skipint[0])
455 // before skipint starts
459 if (scontigs[vc] > skipint[1])
461 // finished editing so
464 // Edit the contig list to include the skipped region which did
467 // from : s1 e1 s2 e2 s3 e3
468 // to s: s1 e1 s2 k0 k1 e2 s3 e3
469 // list increases by one unless one boundary (s2==k0 or e2==k1)
470 // matches, and decreases by one if skipint intersects whole
472 if (scontigs[vc] <= skipint[0])
474 if (skipint[0] == scontigs[vc])
476 // skipint at start of contig
477 // shift the start of this contig
478 if (scontigs[vc + 1] > skipint[1])
480 scontigs[vc] = skipint[1];
485 if (scontigs[vc + 1] == skipint[1])
488 t = new int[scontigs.length - 2];
491 System.arraycopy(scontigs, 0, t, 0, vc - 1);
493 if (vc + 2 < t.length)
495 System.arraycopy(scontigs, vc + 2, t, vc, t.length
502 // truncate contig to before the skipint region
503 scontigs[vc + 1] = skipint[0] - 1;
510 // scontig starts before start of skipint
511 if (scontigs[vc + 1] < skipint[1])
513 // skipint truncates end of scontig
514 scontigs[vc + 1] = skipint[0] - 1;
519 // divide region to new contigs
520 t = new int[scontigs.length + 2];
521 System.arraycopy(scontigs, 0, t, 0, vc + 1);
522 t[vc + 1] = skipint[0];
523 t[vc + 2] = skipint[1];
524 System.arraycopy(scontigs, vc + 1, t, vc + 3,
525 scontigs.length - (vc + 1));
535 if (aa.equals("STOP"))
537 aa = starForStop ? "*" : "X";
541 // insert gaps prior to this codon - if necessary
542 boolean findpos = true;
545 // expand the codons array if necessary
546 alignedCodons = checkCodonFrameWidth(alignedCodons, aspos);
549 * Compare this codon's base positions with those currently aligned to
550 * this column in the translation.
552 final int compareCodonPos = Dna.compareCodonPos(cdp,
553 alignedCodons[aspos]);
555 // System.out.println(seq + "/" + aa + " codons: "
556 // + Arrays.deepToString(alignedCodons));
558 // .println(("Compare " + Arrays.toString(cdp) + " at pos "
559 // + aspos + " with "
560 // + Arrays.toString(alignedCodons[aspos]) + " got " +
561 // compareCodonPos));
563 switch (compareCodonPos)
568 * This codon should precede the mapped positions - need to insert a
569 * gap in all prior sequences.
571 acf.insertAAGap(aspos, gapCharacter);
578 * This codon belongs after the aligned codons at aspos. Prefix it
579 * with a gap and try the next position.
588 * Exact match - codon 'belongs' at this translated position.
595 if (alignedCodons[aspos] == null)
597 // mark this column as aligning to this aligned reading frame
598 alignedCodons[aspos] = new int[]
599 { cdp[0], cdp[1], cdp[2] };
601 else if (!Arrays.equals(alignedCodons[aspos], cdp))
603 throw new IllegalStateException("Tried to coalign "
604 + Arrays.asList(alignedCodons[aspos], cdp));
606 if (aspos >= acf.aaWidth)
608 // update maximum alignment width
609 // (we can do this without calling checkCodonFrameWidth because it was
610 // already done above)
611 acf.setAaWidth(aspos);
613 // ready for next translated reading frame alignment position (if any)
619 SequenceI newseq = new Sequence(selection.getName(),
623 final String errMsg = "trimming contigs for incomplete terminal codon.";
624 if (Cache.log != null)
626 Cache.log.debug(errMsg);
630 System.err.println(errMsg);
632 // map and trim contigs to ORF region
633 vc = scontigs.length - 1;
634 lastnpos = vismapping.shift(lastnpos); // place npos in context of
635 // whole dna alignment (rather
636 // than visible contigs)
637 // incomplete ORF could be broken over one or two visible contig
639 while (vc >= 0 && scontigs[vc] > lastnpos)
641 if (vc > 0 && scontigs[vc - 1] > lastnpos)
647 // correct last interval in list.
648 scontigs[vc] = lastnpos;
652 if (vc > 0 && (vc + 1) < scontigs.length)
654 // truncate map list to just vc elements
655 int t[] = new int[vc + 1];
656 System.arraycopy(scontigs, 0, t, 0, vc + 1);
664 if (scontigs != null)
667 // map scontigs to actual sequence positions on selection
668 for (vc = 0; vc < scontigs.length; vc += 2)
670 scontigs[vc] = selection.findPosition(scontigs[vc]); // not from 1!
671 scontigs[vc + 1] = selection.findPosition(scontigs[vc + 1]); // exclusive
672 if (scontigs[vc + 1] == selection.getEnd())
677 // trim trailing empty intervals.
678 if ((vc + 2) < scontigs.length)
680 int t[] = new int[vc + 2];
681 System.arraycopy(scontigs, 0, t, 0, vc + 2);
685 * delete intervals in scontigs which are not translated. 1. map skip
686 * into sequence position intervals 2. truncate existing ranges and add
687 * new ranges to exclude untranslated regions. if (skip.size()>0) {
688 * Vector narange = new Vector(); for (vc=0; vc<scontigs.length; vc++) {
689 * narange.addElement(new int[] {scontigs[vc]}); } int sint=0,iv[]; vc =
690 * 0; while (sint<skip.size()) { skipint = (int[]) skip.elementAt(sint);
691 * do { iv = (int[]) narange.elementAt(vc); if (iv[0]>=skipint[0] &&
692 * iv[0]<=skipint[1]) { if (iv[0]==skipint[0]) { // delete beginning of
693 * range } else { // truncate range and create new one if necessary iv =
694 * (int[]) narange.elementAt(vc+1); if (iv[0]<=skipint[1]) { // truncate
695 * range iv[0] = skipint[1]; } else { } } } else if (iv[0]<skipint[0]) {
696 * iv = (int[]) narange.elementAt(vc+1); } } while (iv[0]) } }
698 MapList map = new MapList(scontigs, new int[]
699 { 1, resSize }, 3, 1);
701 transferCodedFeatures(selection, newseq, map, null, null);
702 SequenceI rseq = newseq.deriveSequence(); // construct a dataset
703 // sequence for our new
704 // peptide, regardless.
705 // store a mapping (this actually stores a mapping between the dataset
706 // sequences for the two sequences
707 acf.addMap(selection, rseq, map);
711 // register the mapping somehow
717 * Check the codons array is big enough to accommodate the given position, if
720 * @param alignedCodons
722 * @return the resized array (or the original if no resize needed)
724 protected static int[][] checkCodonFrameWidth(int[][] alignedCodons,
727 // TODO why not codons.length < aspos ?
728 // should codons expand if length is 2 or 3 and aslen==2 ?
729 if (alignedCodons.length <= aspos + 1)
731 // probably never have to do this ?
732 int[][] c = new int[alignedCodons.length + 10][];
733 for (int i = 0; i < alignedCodons.length; i++)
735 c[i] = alignedCodons[i];
739 return alignedCodons;
743 * Given a peptide newly translated from a dna sequence, copy over and set any
744 * features on the peptide from the DNA. If featureTypes is null, all features
745 * on the dna sequence are searched (rather than just the displayed ones), and
746 * similarly for featureGroups.
751 * @param featureTypes
752 * hash who's keys are the displayed feature type strings
753 * @param featureGroups
754 * hash where keys are feature groups and values are Boolean objects
755 * indicating if they are displayed.
757 private static void transferCodedFeatures(SequenceI dna, SequenceI pep,
758 MapList map, Hashtable featureTypes, Hashtable featureGroups)
760 SequenceFeature[] sf = (dna.getDatasetSequence() != null ? dna
761 .getDatasetSequence() : dna).getSequenceFeatures();
763 jalview.datamodel.DBRefEntry[] dnarefs = jalview.util.DBRefUtils
764 .selectRefs(dna.getDBRef(),
765 jalview.datamodel.DBRefSource.DNACODINGDBS);
768 // intersect with pep
769 for (int d = 0; d < dnarefs.length; d++)
771 Mapping mp = dnarefs[d].getMap();
779 for (int f = 0; f < sf.length; f++)
781 fgstate = (featureGroups == null) ? null : ((Boolean) featureGroups
782 .get(sf[f].featureGroup));
783 if ((featureTypes == null || featureTypes.containsKey(sf[f]
784 .getType())) && (fgstate == null || fgstate.booleanValue()))
786 if (FeatureProperties.isCodingFeature(null, sf[f].getType()))
788 // if (map.intersectsFrom(sf[f].begin, sf[f].end))