From f41f5deaf0c8c1bbfb3fe4ca8be208d27b8e11d7 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Fri, 1 Apr 2016 11:34:58 +0100 Subject: [PATCH] JAL-1705 include stop codon in mapped CDS range so it is included in the generated CDS sequence --- src/jalview/datamodel/xdb/embl/EmblEntry.java | 106 +++++++++++++++---------- 1 file changed, 65 insertions(+), 41 deletions(-) diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java index fda042e..691a4c9 100644 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java @@ -33,6 +33,7 @@ import jalview.util.MapList; import jalview.util.MappingUtils; import jalview.util.StringUtils; +import java.util.Arrays; import java.util.Hashtable; import java.util.List; import java.util.Map; @@ -400,7 +401,6 @@ public class EmblEntry } } - // SequenceI product = null; DBRefEntry protEMBLCDS = null; exon = MappingUtils.removeStartPositions(codonStart - 1, exon); boolean noProteinDbref = true; @@ -436,7 +436,8 @@ public class EmblEntry .println("Not allowing for additional stop codon at end of cDNA fragment... !"); // this might occur for CDS sequences where no features are // marked. - exon = new int[] { dna.getStart() + (codonStart - 1), dna.getEnd() }; + exon = new int[] { dna.getStart() + (codonStart - 1), + dna.getEnd() }; map = new Mapping(product, exon, new int[] { 1, prseq.length() }, 3, 1); } @@ -469,9 +470,9 @@ public class EmblEntry // TODO should from range include stop codon even if not in protein // in order to include stop codon in CDS sequence (as done for // Ensembl)? - int[] cdsRanges = adjustForProteinLength(prseq.length(), - exon); - map = new Mapping(product, cdsRanges, new int[] { 1, prseq.length() }, 3, 1); + int[] cdsRanges = adjustForProteinLength(prseq.length(), exon); + map = new Mapping(product, cdsRanges, new int[] { 1, + prseq.length() }, 3, 1); // reconstruct the EMBLCDS entry // TODO: this is only necessary when there codon annotation is // complete (I think JBPNote) @@ -536,8 +537,7 @@ public class EmblEntry if (proteinSeq == null) { proteinSeq = new Sequence(proteinSeqName, - product - .getSequenceAsString()); + product.getSequenceAsString()); matcher.add(proteinSeq); peptides.add(proteinSeq); } @@ -623,11 +623,11 @@ public class EmblEntry SequenceFeature sf = new SequenceFeature(); sf.setBegin(Math.min(exons[exonStartIndex], exons[exonStartIndex + 1])); sf.setEnd(Math.max(exons[exonStartIndex], exons[exonStartIndex + 1])); - sf.setDescription(String.format( - "Exon %d for protein '%s' EMBLCDS:%s", exonNumber, proteinName, - proteinAccessionId)); + sf.setDescription(String.format("Exon %d for protein '%s' EMBLCDS:%s", + exonNumber, proteinName, proteinAccessionId)); sf.setPhase(String.valueOf(codonStart - 1)); - sf.setStrand(exons[exonStartIndex] <= exons[exonStartIndex + 1] ? "+" : "-"); + sf.setStrand(exons[exonStartIndex] <= exons[exonStartIndex + 1] ? "+" + : "-"); sf.setValue(FeatureProperties.EXONPOS, exonNumber); sf.setValue(FeatureProperties.EXONPRODUCT, proteinName); if (!vals.isEmpty()) @@ -689,45 +689,69 @@ public class EmblEntry * @param exon * @return new exon */ - private int[] adjustForProteinLength(int prlength, int[] exon) + static int[] adjustForProteinLength(int prlength, int[] exon) { + if (prlength <= 0 || exon == null) + { + return exon; + } + int desiredCdsLength = prlength * 3; + int exonLength = MappingUtils.getLength(Arrays.asList(exon)); + + /* + * assuming here exon might include stop codon in addition to protein codons + */ + if (desiredCdsLength == exonLength + || desiredCdsLength == exonLength - 3) + { + return exon; + } - int origxon[], sxpos = -1, endxon = 0, cdslength = prlength * 3; - // first adjust range for codon start attribute - if (prlength >= 1 && exon != null) + int origxon[]; + int sxpos = -1; + int endxon = 0; + origxon = new int[exon.length]; + System.arraycopy(exon, 0, origxon, 0, exon.length); + int cdspos = 0; + for (int x = 0; x < exon.length; x += 2) { - origxon = new int[exon.length]; - System.arraycopy(exon, 0, origxon, 0, exon.length); - int cdspos = 0; - for (int x = 0; x < exon.length && sxpos == -1; x += 2) + cdspos += Math.abs(exon[x + 1] - exon[x]) + 1; + if (desiredCdsLength <= cdspos) { - cdspos += Math.abs(exon[x + 1] - exon[x]) + 1; - if (cdslength <= cdspos) + // advanced beyond last codon. + sxpos = x; + if (desiredCdsLength != cdspos) { - // advanced beyond last codon. - sxpos = x; - if (cdslength != cdspos) - { - // System.err - // .println("Truncating final exon interval on region by " - // + (cdspos - cdslength)); - } - // locate the new end boundary of final exon as endxon - endxon = exon[x + 1] - cdspos + cdslength; - break; + // System.err + // .println("Truncating final exon interval on region by " + // + (cdspos - cdslength)); } - } - if (sxpos != -1) - { - // and trim the exon interval set if necessary - int[] nxon = new int[sxpos + 2]; - System.arraycopy(exon, 0, nxon, 0, sxpos + 2); - nxon[sxpos + 1] = endxon; // update the end boundary for the new exon - // set - exon = nxon; + /* + * shrink the final exon - reduce end position if forward + * strand, increase it if reverse + */ + if (exon[x + 1] >= exon[x]) + { + endxon = exon[x + 1] - cdspos + desiredCdsLength; + } + else + { + endxon = exon[x + 1] + cdspos - desiredCdsLength; + } + break; } } + + if (sxpos != -1) + { + // and trim the exon interval set if necessary + int[] nxon = new int[sxpos + 2]; + System.arraycopy(exon, 0, nxon, 0, sxpos + 2); + nxon[sxpos + 1] = endxon; // update the end boundary for the new exon + // set + exon = nxon; + } return exon; } } -- 1.7.10.2