X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fdatamodel%2Fxdb%2Fembl%2FEmblEntry.java;h=c3d4e668e2157025041851a32497407fac32bc03;hb=ffa5c07d90b4a933762a5d9faa0578c11693d63a;hp=7da6d6ca6a99402e75f9074ba0b3109105a53e5e;hpb=4ad19b786f19aeadaf7a841e43ff8e490a39589d;p=jalview.git diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java index 7da6d6c..c3d4e66 100644 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java @@ -21,6 +21,7 @@ package jalview.datamodel.xdb.embl; import jalview.analysis.SequenceIdMatcher; +import jalview.bin.Cache; import jalview.datamodel.DBRefEntry; import jalview.datamodel.DBRefSource; import jalview.datamodel.FeatureProperties; @@ -29,10 +30,13 @@ import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.util.DBRefUtils; +import jalview.util.DnaUtils; import jalview.util.MapList; import jalview.util.MappingUtils; import jalview.util.StringUtils; +import java.text.ParseException; +import java.util.Arrays; import java.util.Hashtable; import java.util.List; import java.util.Map; @@ -44,9 +48,7 @@ import java.util.regex.Pattern; * Data model for one entry returned from an EMBL query, as marshalled by a * Castor binding file * - * For example: - * http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ena_sequence&id=J03321 - * &format=emblxml + * For example: http://www.ebi.ac.uk/ena/data/view/J03321&display=xml * * @see embl_mapping.xml */ @@ -56,17 +58,29 @@ public class EmblEntry String accession; - String version; + String entryVersion; - String taxDivision; + String sequenceVersion; - String desc; + String dataClass; - String rCreated; + String moleculeType; - String rLastUpdated; + String topology; - String lastUpdated; + String sequenceLength; + + String taxonomicDivision; + + String description; + + String firstPublicDate; + + String firstPublicRelease; + + String lastUpdatedDate; + + String lastUpdatedRelease; Vector keywords; @@ -111,23 +125,6 @@ public class EmblEntry } /** - * @return the desc - */ - public String getDesc() - { - return desc; - } - - /** - * @param desc - * the desc to set - */ - public void setDesc(String desc) - { - this.desc = desc; - } - - /** * @return the features */ public Vector getFeatures() @@ -162,57 +159,6 @@ public class EmblEntry } /** - * @return the lastUpdated - */ - public String getLastUpdated() - { - return lastUpdated; - } - - /** - * @param lastUpdated - * the lastUpdated to set - */ - public void setLastUpdated(String lastUpdated) - { - this.lastUpdated = lastUpdated; - } - - /** - * @return the releaseCreated - */ - public String getRCreated() - { - return rCreated; - } - - /** - * @param releaseCreated - * the releaseCreated to set - */ - public void setRCreated(String releaseCreated) - { - this.rCreated = releaseCreated; - } - - /** - * @return the releaseLastUpdated - */ - public String getRLastUpdated() - { - return rLastUpdated; - } - - /** - * @param releaseLastUpdated - * the releaseLastUpdated to set - */ - public void setRLastUpdated(String releaseLastUpdated) - { - this.rLastUpdated = releaseLastUpdated; - } - - /** * @return the sequence */ public EmblSequence getSequence() @@ -230,40 +176,6 @@ public class EmblEntry } /** - * @return the taxDivision - */ - public String getTaxDivision() - { - return taxDivision; - } - - /** - * @param taxDivision - * the taxDivision to set - */ - public void setTaxDivision(String taxDivision) - { - this.taxDivision = taxDivision; - } - - /** - * @return the version - */ - public String getVersion() - { - return version; - } - - /** - * @param version - * the version to set - */ - public void setVersion(String version) - { - this.version = version; - } - - /** * Recover annotated sequences from EMBL file * * @param sourceDb @@ -273,38 +185,40 @@ public class EmblEntry */ public SequenceI getSequence(String sourceDb, List peptides) { - SequenceI dna = new Sequence(sourceDb + "|" + accession, - sequence.getSequence()); - dna.setDescription(desc); - DBRefEntry retrievedref = new DBRefEntry(sourceDb, version, accession); + SequenceI dna = makeSequence(sourceDb); + if (dna == null) + { + return null; + } + dna.setDescription(description); + DBRefEntry retrievedref = new DBRefEntry(sourceDb, + getSequenceVersion(), accession); dna.addDBRef(retrievedref); // add map to indicate the sequence is a valid coordinate frame for the // dbref retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1, 1)); - // TODO: transform EMBL Database refs to canonical form + + /* + * transform EMBL Database refs to canonical form + */ if (dbRefs != null) { for (DBRefEntry dbref : dbRefs) { + dbref.setSource(DBRefUtils.getCanonicalName(dbref.getSource())); dna.addDBRef(dbref); } } + SequenceIdMatcher matcher = new SequenceIdMatcher(peptides); try { for (EmblFeature feature : features) { - if (feature.dbRefs != null) - { - for (DBRefEntry dbref : feature.dbRefs) - { - dna.addDBRef(dbref); - } - } if (FeatureProperties.isCodingFeature(sourceDb, feature.getName())) { - parseCodingFeature(feature, sourceDb, dna, peptides); + parseCodingFeature(feature, sourceDb, dna, peptides, matcher); } } } catch (Exception e) @@ -321,6 +235,23 @@ public class EmblEntry } /** + * @param sourceDb + * @return + */ + SequenceI makeSequence(String sourceDb) + { + if (sequence == null) + { + System.err.println("No sequence was returned for ENA accession " + + accession); + return null; + } + SequenceI dna = new Sequence(sourceDb + "|" + accession, + sequence.getSequence()); + return dna; + } + + /** * Extracts coding region and product from a CDS feature and properly decorate * it with annotations. * @@ -332,19 +263,20 @@ public class EmblEntry * parent dna sequence for this record * @param peptides * list of protein product sequences for Embl entry + * @param matcher + * helper to match xrefs in already retrieved sequences */ void parseCodingFeature(EmblFeature feature, String sourceDb, - SequenceI dna, List peptides) + SequenceI dna, List peptides, SequenceIdMatcher matcher) { boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS); - int[] exon = getCdsRanges(feature); + int[] exons = getCdsRanges(feature); - String prseq = null; - String prname = ""; - String prid = null; + String translation = null; + String proteinName = ""; + String proteinId = null; Map vals = new Hashtable(); - SequenceIdMatcher matcher = new SequenceIdMatcher(peptides); /* * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS @@ -364,17 +296,18 @@ public class EmblEntry if (qname.equals("translation")) { // remove all spaces (precompiled String.replaceAll(" ", "")) - prseq = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll(""); + translation = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll( + ""); } else if (qname.equals("protein_id")) { - prid = q.getValues()[0]; + proteinId = q.getValues()[0].trim(); } else if (qname.equals("codon_start")) { try { - codonStart = Integer.parseInt(q.getValues()[0]); + codonStart = Integer.parseInt(q.getValues()[0].trim()); } catch (NumberFormatException e) { System.err.println("Invalid codon_start in XML for " @@ -384,7 +317,7 @@ public class EmblEntry else if (qname.equals("product")) { // sometimes name is returned e.g. for V00488 - prname = q.getValues()[0]; + proteinName = q.getValues()[0].trim(); } else { @@ -400,54 +333,59 @@ public class EmblEntry } } - // SequenceI product = null; - DBRefEntry protEMBLCDS = null; - exon = MappingUtils.removeStartPositions(codonStart - 1, exon); - boolean noProteinDbref = true; + DBRefEntry proteinToEmblProteinRef = null; + exons = MappingUtils.removeStartPositions(codonStart - 1, exons); SequenceI product = null; - Mapping map = null; - if (prseq != null && prname != null && prid != null) + Mapping dnaToProteinMapping = null; + if (translation != null && proteinName != null && proteinId != null) { + int translationLength = translation.length(); + /* * look for product in peptides list, if not found, add it */ - product = matcher.findIdMatch(prid); + product = matcher.findIdMatch(proteinId); if (product == null) { - product = new Sequence(prid, prseq, 1, prseq.length()); - product.setDescription(((prname.length() == 0) ? "Protein Product from " + product = new Sequence(proteinId, translation, 1, translationLength); + product.setDescription(((proteinName.length() == 0) ? "Protein Product from " + sourceDb - : prname)); + : proteinName)); peptides.add(product); matcher.add(product); } // we have everything - create the mapping and perhaps the protein // sequence - if (exon == null || exon.length == 0) + if (exons == null || exons.length == 0) { + /* + * workaround until we handle dna location for CDS sequence + * e.g. location="X53828.1:60..1058" correctly + */ System.err .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect (" + sourceDb + ":" + getAccession() + ")"); - if (prseq.length() * 3 == (1 - codonStart + dna.getSequence().length)) + if (translationLength * 3 == (1 - codonStart + dna.getSequence().length)) { System.err .println("Not allowing for additional stop codon at end of cDNA fragment... !"); - // this might occur for CDS sequences where no features are - // marked. - exon = new int[] { dna.getStart() + (codonStart - 1), dna.getEnd() }; - map = new Mapping(product, exon, new int[] { 1, prseq.length() }, - 3, 1); + // this might occur for CDS sequences where no features are marked + exons = new int[] { dna.getStart() + (codonStart - 1), + dna.getEnd() }; + dnaToProteinMapping = new Mapping(product, exons, new int[] { 1, + translationLength }, 3, 1); } - if ((prseq.length() + 1) * 3 == (1 - codonStart + dna.getSequence().length)) + if ((translationLength + 1) * 3 == (1 - codonStart + dna + .getSequence().length)) { System.err .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!"); - exon = new int[] { dna.getStart() + (codonStart - 1), + exons = new int[] { dna.getStart() + (codonStart - 1), dna.getEnd() - 3 }; - map = new Mapping(product, exon, new int[] { 1, prseq.length() }, - 3, 1); + dnaToProteinMapping = new Mapping(product, exons, new int[] { 1, + translationLength }, 3, 1); } } else @@ -466,133 +404,172 @@ public class EmblEntry else { // final product length truncation check - // TODO should from range include stop codon even if not in protein - // in order to include stop codon in CDS sequence (as done for - // Ensembl)? - int[] cdsRanges = adjustForProteinLength(prseq.length(), - exon); - map = new Mapping(product, cdsRanges, new int[] { 1, prseq.length() }, 3, 1); - // reconstruct the EMBLCDS entry - // TODO: this is only necessary when there codon annotation is - // complete (I think JBPNote) - DBRefEntry pcdnaref = new DBRefEntry(); - pcdnaref.setAccessionId(prid); - pcdnaref.setSource(DBRefSource.EMBLCDS); - pcdnaref.setVersion(getVersion()); // same as parent EMBL version. - MapList mp = new MapList(new int[] { 1, prseq.length() }, - new int[] { 1 + (codonStart - 1), - (codonStart - 1) + 3 * prseq.length() }, 1, 3); - pcdnaref.setMap(new Mapping(mp)); + int[] cdsRanges = adjustForProteinLength(translationLength, exons); + dnaToProteinMapping = new Mapping(product, cdsRanges, new int[] { + 1, translationLength }, 3, 1); if (product != null) { - product.addDBRef(pcdnaref); - protEMBLCDS = new DBRefEntry(pcdnaref); - protEMBLCDS.setSource(DBRefSource.EMBLCDSProduct); - product.addDBRef(protEMBLCDS); + /* + * make xref with mapping from protein to EMBL dna + */ + DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL, + getSequenceVersion(), proteinId, new Mapping( + dnaToProteinMapping.getMap().getInverse())); + product.addDBRef(proteinToEmblRef); + + /* + * make xref from protein to EMBLCDS; we assume here that the + * CDS sequence version is same as dna sequence (?!) + */ + MapList proteinToCdsMapList = new MapList(new int[] { 1, + translationLength }, new int[] { 1 + (codonStart - 1), + (codonStart - 1) + 3 * translationLength }, 1, 3); + DBRefEntry proteinToEmblCdsRef = new DBRefEntry( + DBRefSource.EMBLCDS, getSequenceVersion(), proteinId, + new Mapping(proteinToCdsMapList)); + product.addDBRef(proteinToEmblCdsRef); + + /* + * make 'direct' xref from protein to EMBLCDSPROTEIN + */ + proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef); + proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct); + proteinToEmblProteinRef.setMap(null); + product.addDBRef(proteinToEmblProteinRef); } } } - // add cds feature to dna seq - this may include the stop codon - for (int xint = 0; exon != null && xint < exon.length; xint += 2) + + /* + * add cds features to dna sequence + */ + String cds = feature.getName(); // "CDS" + for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2) { - SequenceFeature sf = makeCdsFeature(exon, xint, prname, prid, vals, - codonStart); - sf.setType(feature.getName()); // "CDS" - sf.setFeatureGroup(sourceDb); + int exonStart = exons[xint]; + int exonEnd = exons[xint + 1]; + int begin = Math.min(exonStart, exonEnd); + int end = Math.max(exonStart, exonEnd); + int exonNumber = xint / 2 + 1; + String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s", + exonNumber, proteinName, proteinId); + + SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, + sourceDb, vals); + + sf.setEnaLocation(feature.getLocation()); + boolean forwardStrand = exonStart <= exonEnd; + sf.setStrand(forwardStrand ? "+" : "-"); + sf.setPhase(String.valueOf(codonStart - 1)); + sf.setValue(FeatureProperties.EXONPOS, exonNumber); + sf.setValue(FeatureProperties.EXONPRODUCT, proteinName); + dna.addSequenceFeature(sf); } } - // add dbRefs to sequence + + /* + * add feature dbRefs to sequence, and mappings for Uniprot xrefs + */ + boolean hasUniprotDbref = false; if (feature.dbRefs != null) { - boolean productMapped = false; + boolean mappingUsed = false; for (DBRefEntry ref : feature.dbRefs) { - ref.setSource(DBRefUtils.getCanonicalName(ref.getSource())); - // Hard code the kind of protein product accessions that EMBL cite - if (ref.getSource().equals(DBRefSource.UNIPROT)) + /* + * ensure UniProtKB/Swiss-Prot converted to UNIPROT + */ + String source = DBRefUtils.getCanonicalName(ref.getSource()); + ref.setSource(source); + DBRefEntry proteinDbRef = new DBRefEntry(ref.getSource(), + ref.getVersion(), ref.getAccessionId()); + if (source.equals(DBRefSource.UNIPROT)) { - String refSeqName = DBRefSource.UNIPROT + "|" + String proteinSeqName = DBRefSource.UNIPROT + "|" + ref.getAccessionId(); - ref.setMap(map); - if (map != null && map.getTo() != null) + if (dnaToProteinMapping != null + && dnaToProteinMapping.getTo() != null) { - // if (!productMapped) - // { - // map.getTo().setName(refSeqName); - // map.getTo().addDBRef( - // new DBRefEntry(ref.getSource(), ref.getVersion(), ref - // .getAccessionId())); // don't copy map over. - // // if (map.getTo().getName().startsWith(prid)) - // productMapped = true; - // } - // else - // { + if (mappingUsed) + { /* - * an alternate UNIPROT product for CDS - same mapping - * but to a sequence with a different name + * two or more Uniprot xrefs for the same CDS - + * each needs a distinct Mapping (as to a different sequence) */ - SequenceI newSeq = matcher.findIdMatch(refSeqName); - if (newSeq == null) - { - newSeq = new Sequence(refSeqName, map.getTo() - .getSequenceAsString()); - matcher.add(newSeq); - peptides.add(newSeq); - } - Mapping newMap = new Mapping(newSeq, map.getMap()); - ref.setMap(newMap); - // } + dnaToProteinMapping = new Mapping(dnaToProteinMapping); + } + mappingUsed = true; + + /* + * try to locate the protein mapped to (possibly by a + * previous CDS feature); if not found, construct it from + * the EMBL translation + */ + SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName); + if (proteinSeq == null) + { + proteinSeq = new Sequence(proteinSeqName, + product.getSequenceAsString()); + matcher.add(proteinSeq); + peptides.add(proteinSeq); + } + dnaToProteinMapping.setTo(proteinSeq); + dnaToProteinMapping.setMappedFromId(proteinId); + proteinSeq.addDBRef(proteinDbRef); + ref.setMap(dnaToProteinMapping); } - noProteinDbref = false; + hasUniprotDbref = true; } if (product != null) { - DBRefEntry pref = new DBRefEntry(ref.getSource(), - ref.getVersion(), ref.getAccessionId()); + /* + * copy feature dbref to our protein product + */ + DBRefEntry pref = proteinDbRef; pref.setMap(null); // reference is direct product.addDBRef(pref); // Add converse mapping reference - if (map != null) + if (dnaToProteinMapping != null) { - Mapping pmap = new Mapping(dna, map.getMap().getInverse()); - pref = new DBRefEntry(sourceDb, getVersion(), + Mapping pmap = new Mapping(dna, dnaToProteinMapping.getMap() + .getInverse()); + pref = new DBRefEntry(sourceDb, getSequenceVersion(), this.getAccession()); pref.setMap(pmap); - if (map.getTo() != null) + if (dnaToProteinMapping.getTo() != null) { - map.getTo().addDBRef(pref); + dnaToProteinMapping.getTo().addDBRef(pref); } } } dna.addDBRef(ref); } - if (noProteinDbref && product != null) + } + + /* + * if we have a product (translation) but no explicit Uniprot dbref + * (example: EMBL AAFI02000057 protein_id EAL65544.1) + * then construct mappings to an assumed EMBLCDSPROTEIN accession + */ + if (!hasUniprotDbref && product != null) + { + if (proteinToEmblProteinRef == null) { - // add protein coding reference to dna sequence so xref matches - if (protEMBLCDS == null) - { - protEMBLCDS = new DBRefEntry(); - protEMBLCDS.setAccessionId(prid); - protEMBLCDS.setSource(DBRefSource.EMBLCDSProduct); - protEMBLCDS.setVersion(getVersion()); - protEMBLCDS - .setMap(new Mapping(product, map.getMap().getInverse())); - } - product.addDBRef(protEMBLCDS); + // assuming CDSPROTEIN sequence version = dna version (?!) + proteinToEmblProteinRef = new DBRefEntry( + DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId); + } + product.addDBRef(proteinToEmblProteinRef); - // Add converse mapping reference - if (map != null) - { - Mapping pmap = new Mapping(product, protEMBLCDS.getMap().getMap() - .getInverse()); - DBRefEntry ncMap = new DBRefEntry(protEMBLCDS); - ncMap.setMap(pmap); - if (map.getTo() != null) - { - dna.addDBRef(ncMap); - } - } + if (dnaToProteinMapping != null + && dnaToProteinMapping.getTo() != null) + { + DBRefEntry dnaToEmblProteinRef = new DBRefEntry( + DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId); + dnaToEmblProteinRef.setMap(dnaToProteinMapping); + dnaToProteinMapping.setMappedFromId(proteinId); + dna.addDBRef(dnaToEmblProteinRef); } } } @@ -600,33 +577,25 @@ public class EmblEntry /** * Helper method to construct a SequenceFeature for one cds range * - * @param exons - * array of cds [start, end, ...] positions - * @param exonStartIndex - * offset into the exons array - * @param proteinName - * @param proteinAccessionId + * @param type + * feature type ("CDS") + * @param desc + * description + * @param begin + * start position + * @param end + * end position + * @param group + * feature group * @param vals * map of 'miscellaneous values' for feature - * @param codonStart - * codon start position for CDS (1/2/3, normally 1) * @return */ - protected SequenceFeature makeCdsFeature(int[] exons, int exonStartIndex, - String proteinName, String proteinAccessionId, - Map vals, int codonStart) - { - int exonNumber = exonStartIndex / 2 + 1; - SequenceFeature sf = new SequenceFeature(); - sf.setBegin(Math.min(exons[exonStartIndex], exons[exonStartIndex + 1])); - sf.setEnd(Math.max(exons[exonStartIndex], exons[exonStartIndex + 1])); - sf.setDescription(String.format( - "Exon %d for protein '%s' EMBLCDS:%s", exonNumber, proteinName, - proteinAccessionId)); - sf.setPhase(String.valueOf(codonStart - 1)); - sf.setStrand(exons[exonStartIndex] <= exons[exonStartIndex + 1] ? "+" : "-"); - sf.setValue(FeatureProperties.EXONPOS, exonNumber); - sf.setValue(FeatureProperties.EXONPRODUCT, proteinName); + protected SequenceFeature makeCdsFeature(String type, String desc, + int begin, int end, String group, Map vals) + { + SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group); + if (!vals.isEmpty()) { StringBuilder sb = new StringBuilder(); @@ -647,7 +616,7 @@ public class EmblEntry } /** - * Returns the CDS positions as a list of [start, end, start, end...] + * Returns the CDS positions as a single array of [start, end, start, end...] * positions. If on the reverse strand, these will be in descending order. * * @param feature @@ -655,76 +624,237 @@ public class EmblEntry */ protected int[] getCdsRanges(EmblFeature feature) { - if (feature.locations == null) + if (feature.location == null) { return new int[] {}; } - int cdsBoundaryCount = 0; // count of all start/stop locations - int[][] cdsLocations = new int[feature.locations.size()][]; - int locationNumber = 0; - for (EmblFeatureLocations loc : feature.locations) + + try { - int[] locationRanges = loc.getElementRanges(accession); - cdsLocations[locationNumber++] = locationRanges; - cdsBoundaryCount += locationRanges.length; - } - int[] cdsRanges = new int[cdsBoundaryCount]; - int copyTo = 0; - for (int[] ranges : cdsLocations) + List ranges = DnaUtils.parseLocation(feature.location); + return listToArray(ranges); + } catch (ParseException e) { - System.arraycopy(ranges, 0, cdsRanges, copyTo, ranges.length); - copyTo += ranges.length; + Cache.log.warn(String.format( + "Not parsing inexact CDS location %s in ENA %s", + feature.location, this.accession)); + return new int[] {}; } - return cdsRanges; + } + /** + * Converts a list of [start, end] ranges to a single array of [start, end, + * start, end ...] + * + * @param ranges + * @return + */ + int[] listToArray(List ranges) + { + int[] result = new int[ranges.size() * 2]; + int i = 0; + for (int[] range : ranges) + { + result[i++] = range[0]; + result[i++] = range[1]; + } + return result; } /** - * truncate the last exon interval to the prlength'th codon + * Truncates (if necessary) the exon intervals to match 3 times the length of + * the protein; also accepts 3 bases longer (for stop codon not included in + * protein) * - * @param prlength + * @param proteinLength * @param exon - * @return new exon + * an array of [start, end, start, end...] intervals + * @return the same array (if unchanged) or a truncated copy */ - private int[] adjustForProteinLength(int prlength, int[] exon) + static int[] adjustForProteinLength(int proteinLength, int[] exon) { + if (proteinLength <= 0 || exon == null) + { + return exon; + } + int expectedCdsLength = proteinLength * 3; + int exonLength = MappingUtils.getLength(Arrays.asList(exon)); - int origxon[], sxpos = -1, endxon = 0, cdslength = prlength * 3; - // first adjust range for codon start attribute - if (prlength >= 1 && exon != null) + /* + * if exon length matches protein, or is shorter, or longer by the + * length of a stop codon (3 bases), then leave it unchanged + */ + if (expectedCdsLength >= exonLength + || expectedCdsLength == exonLength - 3) { - origxon = new int[exon.length]; - System.arraycopy(exon, 0, origxon, 0, exon.length); - int cdspos = 0; - for (int x = 0; x < exon.length && sxpos == -1; x += 2) + return exon; + } + + int origxon[]; + int sxpos = -1; + int endxon = 0; + origxon = new int[exon.length]; + System.arraycopy(exon, 0, origxon, 0, exon.length); + int cdspos = 0; + for (int x = 0; x < exon.length; x += 2) + { + cdspos += Math.abs(exon[x + 1] - exon[x]) + 1; + if (expectedCdsLength <= cdspos) { - cdspos += Math.abs(exon[x + 1] - exon[x]) + 1; - if (cdslength <= cdspos) + // advanced beyond last codon. + sxpos = x; + if (expectedCdsLength != cdspos) { - // advanced beyond last codon. - sxpos = x; - if (cdslength != cdspos) - { - System.err - .println("Truncating final exon interval on region by " - + (cdspos - cdslength)); - } - // locate the new end boundary of final exon as endxon - endxon = exon[x + 1] - cdspos + cdslength; - break; + // System.err + // .println("Truncating final exon interval on region by " + // + (cdspos - cdslength)); } - } - if (sxpos != -1) - { - // and trim the exon interval set if necessary - int[] nxon = new int[sxpos + 2]; - System.arraycopy(exon, 0, nxon, 0, sxpos + 2); - nxon[sxpos + 1] = endxon; // update the end boundary for the new exon - // set - exon = nxon; + /* + * shrink the final exon - reduce end position if forward + * strand, increase it if reverse + */ + if (exon[x + 1] >= exon[x]) + { + endxon = exon[x + 1] - cdspos + expectedCdsLength; + } + else + { + endxon = exon[x + 1] + cdspos - expectedCdsLength; + } + break; } } + + if (sxpos != -1) + { + // and trim the exon interval set if necessary + int[] nxon = new int[sxpos + 2]; + System.arraycopy(exon, 0, nxon, 0, sxpos + 2); + nxon[sxpos + 1] = endxon; // update the end boundary for the new exon + // set + exon = nxon; + } return exon; } + + public String getSequenceVersion() + { + return sequenceVersion; + } + + public void setSequenceVersion(String sequenceVersion) + { + this.sequenceVersion = sequenceVersion; + } + + public String getSequenceLength() + { + return sequenceLength; + } + + public void setSequenceLength(String sequenceLength) + { + this.sequenceLength = sequenceLength; + } + + public String getEntryVersion() + { + return entryVersion; + } + + public void setEntryVersion(String entryVersion) + { + this.entryVersion = entryVersion; + } + + public String getMoleculeType() + { + return moleculeType; + } + + public void setMoleculeType(String moleculeType) + { + this.moleculeType = moleculeType; + } + + public String getTopology() + { + return topology; + } + + public void setTopology(String topology) + { + this.topology = topology; + } + + public String getTaxonomicDivision() + { + return taxonomicDivision; + } + + public void setTaxonomicDivision(String taxonomicDivision) + { + this.taxonomicDivision = taxonomicDivision; + } + + public String getDescription() + { + return description; + } + + public void setDescription(String description) + { + this.description = description; + } + + public String getFirstPublicDate() + { + return firstPublicDate; + } + + public void setFirstPublicDate(String firstPublicDate) + { + this.firstPublicDate = firstPublicDate; + } + + public String getFirstPublicRelease() + { + return firstPublicRelease; + } + + public void setFirstPublicRelease(String firstPublicRelease) + { + this.firstPublicRelease = firstPublicRelease; + } + + public String getLastUpdatedDate() + { + return lastUpdatedDate; + } + + public void setLastUpdatedDate(String lastUpdatedDate) + { + this.lastUpdatedDate = lastUpdatedDate; + } + + public String getLastUpdatedRelease() + { + return lastUpdatedRelease; + } + + public void setLastUpdatedRelease(String lastUpdatedRelease) + { + this.lastUpdatedRelease = lastUpdatedRelease; + } + + public String getDataClass() + { + return dataClass; + } + + public void setDataClass(String dataClass) + { + this.dataClass = dataClass; + } }