/* * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.2) * Copyright (C) 2014 The Jalview Authors * * This file is part of Jalview. * * Jalview is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * Jalview is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Jalview. If not, see . * The Jalview Authors are detailed in the 'AUTHORS' file. */ package jalview.analysis; import jalview.bin.Cache; import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.AlignmentI; import jalview.datamodel.Annotation; import jalview.datamodel.DBRefEntry; import jalview.datamodel.FeatureProperties; import jalview.datamodel.GraphLine; import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.schemes.ResidueProperties; import jalview.util.Comparison; import jalview.util.DBRefUtils; import jalview.util.MapList; import jalview.util.ShiftList; import java.util.ArrayList; import java.util.Arrays; import java.util.Hashtable; import java.util.List; public class Dna { /** * Test whether codon positions cdp1 should align before, with, or after cdp2. * Returns zero if all positions match (or either argument is null). Returns * -1 if any position in the first codon precedes the corresponding position * in the second codon. Else returns +1 (some position in the second codon * precedes the corresponding position in the first). * * Note this is not necessarily symmetric, for example: *

compareCodonPos([2,5,6], [3,4,5]) returns -1
compareCodonPos([3,4,5], [2,5,6]) also returns -1

* * @param cdp1 * @param cdp2 * @return */ public static int compareCodonPos(int[] cdp1, int[] cdp2) { if (cdp1 == null || cdp2 == null || (cdp1[0] == cdp2[0] && cdp1[1] == cdp2[1] && cdp1[2] == cdp2[2])) { return 0; } if (cdp1[0] < cdp2[0] || cdp1[1] < cdp2[1] || cdp1[2] < cdp2[2]) { // one base in cdp1 precedes the corresponding base in the other codon return -1; } // one base in cdp1 appears after the corresponding base in the other codon. return 1; } /** * DNA->mapped protein sequence alignment translation given set of sequences * 1. id distinct coding regions within selected region for each sequence 2. * generate peptides based on inframe (or given) translation or (optionally * and where specified) out of frame translations (annotated appropriately) 3. * align peptides based on codon alignment */ /** * id potential products from dna 1. search for distinct products within * selected region for each selected sequence 2. group by associated DB type. * 3. return as form for input into above function */ /** * */ /** * create a new alignment of protein sequences by an inframe translation of * the provided NA sequences * * @param selection * @param seqstring * @param viscontigs * @param gapCharacter * @param annotations * @param aWidth * @param dataset * destination dataset for translated sequences and mappings * @return */ public static AlignmentI cdnaTranslate(SequenceI[] selection, String[] seqstring, int viscontigs[], char gapCharacter, AlignmentAnnotation[] annotations, int aWidth, Alignment dataset) { return cdnaTranslate(Arrays.asList(selection), seqstring, null, viscontigs, gapCharacter, annotations, aWidth, dataset); } /** * * @param cdnaseqs * @param seqstring * @param product * - array of DbRefEntry objects from which exon map in seqstring is * derived * @param viscontigs * @param gapCharacter * @param annotations * @param aWidth * @param dataset * @return */ public static AlignmentI cdnaTranslate(List cdnaseqs, String[] seqstring, DBRefEntry[] product, int viscontigs[], char gapCharacter, AlignmentAnnotation[] annotations, int aWidth, Alignment dataset) { AlignedCodonFrame acf = new AlignedCodonFrame(aWidth); /* * This array will be built up so that position i holds the codon positions * e.g. [7, 9, 10] that match column i (base 0) in the aligned translation. * Note this implies a contract that if two codons do not align exactly, * their translated products must occupy different column positions. */ int[][] alignedCodons = new int[aWidth][]; int s; int sSize = cdnaseqs.size(); List pepseqs = new ArrayList(); for (s = 0; s < sSize; s++) { SequenceI newseq = translateCodingRegion(cdnaseqs.get(s), seqstring[s], viscontigs, acf, alignedCodons, gapCharacter, false); if (newseq != null) { pepseqs.add(newseq); SequenceI ds = newseq; if (dataset != null) { while (ds.getDatasetSequence() != null) { ds = ds.getDatasetSequence(); } dataset.addSequence(ds); } } } SequenceI[] newseqs = pepseqs.toArray(new SequenceI[pepseqs.size()]); AlignmentI al = new Alignment(newseqs); // ensure we look aligned. al.padGaps(); // link the protein translation to the DNA dataset al.setDataset(dataset); translateAlignedAnnotations(annotations, al, acf, alignedCodons); al.addCodonFrame(acf); return al; } /** * fake the collection of DbRefs with associated exon mappings to identify if * a translation would generate distinct product in the currently selected * region. * * @param selection * @param viscontigs * @return */ public static boolean canTranslate(SequenceI[] selection, int viscontigs[]) { for (int gd = 0; gd < selection.length; gd++) { SequenceI dna = selection[gd]; DBRefEntry[] dnarefs = DBRefUtils .selectRefs(dna.getDBRef(), jalview.datamodel.DBRefSource.DNACODINGDBS); if (dnarefs != null) { // intersect with pep List mappedrefs = new ArrayList(); DBRefEntry[] refs = dna.getDBRef(); for (int d = 0; d < refs.length; d++) { if (refs[d].getMap() != null && refs[d].getMap().getMap() != null && refs[d].getMap().getMap().getFromRatio() == 3 && refs[d].getMap().getMap().getToRatio() == 1) { mappedrefs.add(refs[d]); // add translated protein maps } } dnarefs = mappedrefs.toArray(new DBRefEntry[mappedrefs.size()]); for (int d = 0; d < dnarefs.length; d++) { Mapping mp = dnarefs[d].getMap(); if (mp != null) { for (int vc = 0; vc < viscontigs.length; vc += 2) { int[] mpr = mp.locateMappedRange(viscontigs[vc], viscontigs[vc + 1]); if (mpr != null) { return true; } } } } } } return false; } /** * Translate na alignment annotations onto translated amino acid alignment al * using codon mapping codons * * @param annotations * @param al * @param acf */ protected static void translateAlignedAnnotations( AlignmentAnnotation[] annotations, AlignmentI al, AlignedCodonFrame acf, int[][] codons) { // Can only do this for columns with consecutive codons, or where // annotation is sequence associated. if (annotations != null) { for (AlignmentAnnotation annotation : annotations) { /* * Skip hidden or autogenerated annotation. Also (for now), RNA * secondary structure annotation. If we want to show this against * protein we need a smarter way to 'translate' without generating * invalid (unbalanced) structure annotation. */ if (annotation.autoCalculated || !annotation.visible || annotation.isRNA()) { continue; } int aSize = acf.getaaWidth(); // aa alignment width. Annotation[] anots = (annotation.annotations == null) ? null : new Annotation[aSize]; if (anots != null) { for (int a = 0; a < aSize; a++) { // process through codon map. if (a < codons.length && codons[a] != null && codons[a][0] == (codons[a][2] - 2)) { anots[a] = getCodonAnnotation(codons[a], annotation.annotations); } } } AlignmentAnnotation aa = new AlignmentAnnotation(annotation.label, annotation.description, anots); aa.graph = annotation.graph; aa.graphGroup = annotation.graphGroup; aa.graphHeight = annotation.graphHeight; if (annotation.getThreshold() != null) { aa.setThreshold(new GraphLine(annotation .getThreshold())); } if (annotation.hasScore) { aa.setScore(annotation.getScore()); } final SequenceI seqRef = annotation.sequenceRef; if (seqRef != null) { SequenceI aaSeq = acf.getAaForDnaSeq(seqRef); if (aaSeq != null) { // aa.compactAnnotationArray(); // throw away alignment annotation // positioning aa.setSequenceRef(aaSeq); // rebuild mapping aa.createSequenceMapping(aaSeq, aaSeq.getStart(), true); aa.adjustForAlignment(); aaSeq.addAlignmentAnnotation(aa); } } al.addAnnotation(aa); } } } private static Annotation getCodonAnnotation(int[] is, Annotation[] annotations) { // Have a look at all the codon positions for annotation and put the first // one found into the translated annotation pos. int contrib = 0; Annotation annot = null; for (int p = 0; p < 3; p++) { if (annotations[is[p]] != null) { if (annot == null) { annot = new Annotation(annotations[is[p]]); contrib = 1; } else { // merge with last Annotation cpy = new Annotation(annotations[is[p]]); if (annot.colour == null) { annot.colour = cpy.colour; } if (annot.description == null || annot.description.length() == 0) { annot.description = cpy.description; } if (annot.displayCharacter == null) { annot.displayCharacter = cpy.displayCharacter; } if (annot.secondaryStructure == 0) { annot.secondaryStructure = cpy.secondaryStructure; } annot.value += cpy.value; contrib++; } } } if (contrib > 1) { annot.value /= contrib; } return annot; } /** * Translate a na sequence * * @param selection * sequence displayed under viscontigs visible columns * @param seqstring * ORF read in some global alignment reference frame * @param viscontigs * mapping from global reference frame to visible seqstring ORF read * @param acf * Definition of global ORF alignment reference frame * @param alignedCodons * @param gapCharacter * @param starForStop * when true stop codons will translate as '*', otherwise as 'X' * @return sequence ready to be added to alignment. */ protected static SequenceI translateCodingRegion(SequenceI selection, String seqstring, int[] viscontigs, AlignedCodonFrame acf, int[][] alignedCodons, char gapCharacter, final boolean starForStop) { List skip = new ArrayList(); int skipint[] = null; ShiftList vismapping = new ShiftList(); // map from viscontigs to seqstring // intervals int vc, scontigs[] = new int[viscontigs.length]; int npos = 0; for (vc = 0; vc < viscontigs.length; vc += 2) { if (vc == 0) { vismapping.addShift(npos, viscontigs[vc]); } else { // hidden region vismapping.addShift(npos, viscontigs[vc] - viscontigs[vc - 1] + 1); } scontigs[vc] = viscontigs[vc]; scontigs[vc + 1] = viscontigs[vc + 1]; } // allocate a roughly sized buffer for the protein sequence StringBuilder protein = new StringBuilder(seqstring.length() / 2); String seq = seqstring.replace('U', 'T').replace('u', 'T'); char codon[] = new char[3]; int cdp[] = new int[3], rf = 0, lastnpos = 0, nend; int aspos = 0; int resSize = 0; for (npos = 0, nend = seq.length(); npos < nend; npos++) { if (!Comparison.isGap(seq.charAt(npos))) { cdp[rf] = npos; // store position codon[rf++] = seq.charAt(npos); // store base } if (rf == 3) { /* * Filled up a reading frame... */ String aa = ResidueProperties.codonTranslate(new String(codon)); rf = 0; final String gapString = String.valueOf(gapCharacter); if (aa == null) { aa = gapString; if (skipint == null) { skipint = new int[] { cdp[0], cdp[2] }; } skipint[1] = cdp[2]; } else { if (skipint != null) { // edit scontigs skipint[0] = vismapping.shift(skipint[0]); skipint[1] = vismapping.shift(skipint[1]); for (vc = 0; vc < scontigs.length;) { if (scontigs[vc + 1] < skipint[0]) { // before skipint starts vc += 2; continue; } if (scontigs[vc] > skipint[1]) { // finished editing so break; } // Edit the contig list to include the skipped region which did // not translate int[] t; // from : s1 e1 s2 e2 s3 e3 // to s: s1 e1 s2 k0 k1 e2 s3 e3 // list increases by one unless one boundary (s2==k0 or e2==k1) // matches, and decreases by one if skipint intersects whole // visible contig if (scontigs[vc] <= skipint[0]) { if (skipint[0] == scontigs[vc]) { // skipint at start of contig // shift the start of this contig if (scontigs[vc + 1] > skipint[1]) { scontigs[vc] = skipint[1]; vc += 2; } else { if (scontigs[vc + 1] == skipint[1]) { // remove the contig t = new int[scontigs.length - 2]; if (vc > 0) { System.arraycopy(scontigs, 0, t, 0, vc - 1); } if (vc + 2 < t.length) { System.arraycopy(scontigs, vc + 2, t, vc, t.length - vc + 2); } scontigs = t; } else { // truncate contig to before the skipint region scontigs[vc + 1] = skipint[0] - 1; vc += 2; } } } else { // scontig starts before start of skipint if (scontigs[vc + 1] < skipint[1]) { // skipint truncates end of scontig scontigs[vc + 1] = skipint[0] - 1; vc += 2; } else { // divide region to new contigs t = new int[scontigs.length + 2]; System.arraycopy(scontigs, 0, t, 0, vc + 1); t[vc + 1] = skipint[0]; t[vc + 2] = skipint[1]; System.arraycopy(scontigs, vc + 1, t, vc + 3, scontigs.length - (vc + 1)); scontigs = t; vc += 4; } } } } skip.add(skipint); skipint = null; } if (aa.equals("STOP")) { aa = starForStop ? "*" : "X"; } resSize++; } // insert gaps prior to this codon - if necessary boolean findpos = true; while (findpos) { // expand the codons array if necessary alignedCodons = checkCodonFrameWidth(alignedCodons, aspos); /* * Compare this codon's base positions with those currently aligned to * this column in the translation. */ final int compareCodonPos = Dna.compareCodonPos(cdp, alignedCodons[aspos]); // debug // System.out.println(seq + "/" + aa + " codons: " // + Arrays.deepToString(alignedCodons)); // System.out // .println(("Compare " + Arrays.toString(cdp) + " at pos " // + aspos + " with " // + Arrays.toString(alignedCodons[aspos]) + " got " + // compareCodonPos)); // end debug switch (compareCodonPos) { case -1: /* * This codon should precede the mapped positions - need to insert a * gap in all prior sequences. */ acf.insertAAGap(aspos, gapCharacter); findpos = false; break; case +1: /* * This codon belongs after the aligned codons at aspos. Prefix it * with a gap and try the next position. */ aa = gapString + aa; aspos++; break; case 0: /* * Exact match - codon 'belongs' at this translated position. */ findpos = false; } } protein.append(aa); lastnpos = npos; if (alignedCodons[aspos] == null) { // mark this column as aligning to this aligned reading frame alignedCodons[aspos] = new int[] { cdp[0], cdp[1], cdp[2] }; } else if (!Arrays.equals(alignedCodons[aspos], cdp)) { throw new IllegalStateException("Tried to coalign " + Arrays.asList(alignedCodons[aspos], cdp)); } if (aspos >= acf.aaWidth) { // update maximum alignment width // (we can do this without calling checkCodonFrameWidth because it was // already done above) acf.setAaWidth(aspos); } // ready for next translated reading frame alignment position (if any) aspos++; } } if (resSize > 0) { SequenceI newseq = new Sequence(selection.getName(), protein.toString()); if (rf != 0) { final String errMsg = "trimming contigs for incomplete terminal codon."; if (Cache.log != null) { Cache.log.debug(errMsg); } else { System.err.println(errMsg); } // map and trim contigs to ORF region vc = scontigs.length - 1; lastnpos = vismapping.shift(lastnpos); // place npos in context of // whole dna alignment (rather // than visible contigs) // incomplete ORF could be broken over one or two visible contig // intervals. while (vc >= 0 && scontigs[vc] > lastnpos) { if (vc > 0 && scontigs[vc - 1] > lastnpos) { vc -= 2; } else { // correct last interval in list. scontigs[vc] = lastnpos; } } if (vc > 0 && (vc + 1) < scontigs.length) { // truncate map list to just vc elements int t[] = new int[vc + 1]; System.arraycopy(scontigs, 0, t, 0, vc + 1); scontigs = t; } if (vc <= 0) { scontigs = null; } } if (scontigs != null) { npos = 0; // map scontigs to actual sequence positions on selection for (vc = 0; vc < scontigs.length; vc += 2) { scontigs[vc] = selection.findPosition(scontigs[vc]); // not from 1! scontigs[vc + 1] = selection.findPosition(scontigs[vc + 1]); // exclusive if (scontigs[vc + 1] == selection.getEnd()) { break; } } // trim trailing empty intervals. if ((vc + 2) < scontigs.length) { int t[] = new int[vc + 2]; System.arraycopy(scontigs, 0, t, 0, vc + 2); scontigs = t; } /* * delete intervals in scontigs which are not translated. 1. map skip * into sequence position intervals 2. truncate existing ranges and add * new ranges to exclude untranslated regions. if (skip.size()>0) { * Vector narange = new Vector(); for (vc=0; vc=skipint[0] && * iv[0]<=skipint[1]) { if (iv[0]==skipint[0]) { // delete beginning of * range } else { // truncate range and create new one if necessary iv = * (int[]) narange.elementAt(vc+1); if (iv[0]<=skipint[1]) { // truncate * range iv[0] = skipint[1]; } else { } } } else if (iv[0]