From 20c3a3bb8feb78f4b4ccd02f3e7906b3775eb870 Mon Sep 17 00:00:00 2001 From: jprocter Date: Fri, 19 Jun 2009 09:54:59 +0000 Subject: [PATCH] parent db references and mappings added to retrieved sequences, debugging of codon_start>1 for coding region and stop codon inclusion/exclusion detection --- src/jalview/datamodel/xdb/embl/EmblEntry.java | 115 +++++++++++++++++-------- 1 file changed, 77 insertions(+), 38 deletions(-) diff --git a/src/jalview/datamodel/xdb/embl/EmblEntry.java b/src/jalview/datamodel/xdb/embl/EmblEntry.java index 770bf7c..763e64a 100644 --- a/src/jalview/datamodel/xdb/embl/EmblEntry.java +++ b/src/jalview/datamodel/xdb/embl/EmblEntry.java @@ -67,7 +67,7 @@ public class EmblEntry /** * @param accession - * the accession to set + * the accession to set */ public void setAccession(String accession) { @@ -84,7 +84,7 @@ public class EmblEntry /** * @param dbRefs - * the dbRefs to set + * the dbRefs to set */ public void setDbRefs(Vector dbRefs) { @@ -101,7 +101,7 @@ public class EmblEntry /** * @param desc - * the desc to set + * the desc to set */ public void setDesc(String desc) { @@ -118,7 +118,7 @@ public class EmblEntry /** * @param features - * the features to set + * the features to set */ public void setFeatures(Vector features) { @@ -135,7 +135,7 @@ public class EmblEntry /** * @param keywords - * the keywords to set + * the keywords to set */ public void setKeywords(Vector keywords) { @@ -152,7 +152,7 @@ public class EmblEntry /** * @param lastUpdated - * the lastUpdated to set + * the lastUpdated to set */ public void setLastUpdated(String lastUpdated) { @@ -169,7 +169,7 @@ public class EmblEntry /** * @param refs - * the refs to set + * the refs to set */ public void setRefs(Vector refs) { @@ -186,7 +186,7 @@ public class EmblEntry /** * @param releaseCreated - * the releaseCreated to set + * the releaseCreated to set */ public void setRcreated(String releaseCreated) { @@ -203,7 +203,7 @@ public class EmblEntry /** * @param releaseLastUpdated - * the releaseLastUpdated to set + * the releaseLastUpdated to set */ public void setRLastUpdated(String releaseLastUpdated) { @@ -220,7 +220,7 @@ public class EmblEntry /** * @param sequence - * the sequence to set + * the sequence to set */ public void setSequence(EmblSequence sequence) { @@ -237,7 +237,7 @@ public class EmblEntry /** * @param taxDivision - * the taxDivision to set + * the taxDivision to set */ public void setTaxDivision(String taxDivision) { @@ -254,7 +254,7 @@ public class EmblEntry /** * @param version - * the version to set + * the version to set */ public void setVersion(String version) { @@ -265,8 +265,8 @@ public class EmblEntry * EMBL Feature support is limited. The text below is included for the benefit * of any developer working on improving EMBL feature import in Jalview. * Extract from EMBL feature specification see - * http://www.embl-ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html - * 3.5 Location 3.5.1 Purpose + * http://www.embl-ebi.ac.uk/embl/Documentation + * /FT_definitions/feature_table.html 3.5 Location 3.5.1 Purpose * * The location indicates the region of the presented sequence which * corresponds to a feature. @@ -390,18 +390,16 @@ public class EmblEntry * * join(1..100,J00194.1:100..202) Joins region 1..100 of the existing entry * with the region 100..202 of remote entry J00194 - * */ /** * Recover annotated sequences from EMBL file * * @param noNa - * don't return nucleic acid sequences + * don't return nucleic acid sequences * @param sourceDb - * TODO + * TODO * @param noProtein - * don't return any translated protein sequences marked in - * features + * don't return any translated protein sequences marked in features * @return dataset sequences with DBRefs and features - DNA always comes first */ public jalview.datamodel.SequenceI[] getSequences(boolean noNa, @@ -416,8 +414,13 @@ public class EmblEntry // pointer exception dna = new Sequence(sourceDb + "|" + accession, sequence.getSequence()); dna.setDescription(desc); - dna.addDBRef(new DBRefEntry(sourceDb, version, accession)); - // TODO: add mapping for parentAccession attribute + DBRefEntry retrievedref = new DBRefEntry(sourceDb, version, accession); + dna.addDBRef(retrievedref); + // add map to indicate the sequence is a valid coordinate frame for the + // dbref + retrievedref.setMap(new Mapping(null, new int[] + { 1, dna.getLength() }, new int[] + { 1, dna.getLength() }, 1, 1)); // TODO: transform EMBL Database refs to canonical form if (dbRefs != null) for (Iterator i = dbRefs.iterator(); i.hasNext(); dna @@ -483,15 +486,15 @@ public class EmblEntry * decorate it with annotations. * * @param feature - * coding feature + * coding feature * @param sourceDb - * source database for the EMBLXML + * source database for the EMBLXML * @param seqs - * place where sequences go + * place where sequences go * @param dna - * parent dna sequence for this record + * parent dna sequence for this record * @param noPeptide - * flag for generation of Peptide sequence objects + * flag for generation of Peptide sequence objects */ private void parseCodingFeature(EmblFeature feature, String sourceDb, Vector seqs, Sequence dna, boolean noPeptide) @@ -577,11 +580,39 @@ public class EmblEntry } } Sequence product = null; + int origxon[], sxpos = -1, sxstart, sxstop; + // first adjust range for codon start attribute + if (prstart > 1) + { + origxon = new int[exon.length]; + System.arraycopy(exon, 0, origxon, 0, exon.length); + int cdspos = 0; + for (int x = 0; x < exon.length && sxpos == -1; x += 2) + { + cdspos += exon[x + 1] - exon[x] + 1; + if (prstart <= cdspos) + { + sxpos = x; + sxstart = exon[x]; + sxstop = exon[x + 1]; + // and adjust start boundary of first exon. + exon[x] = exon[x + 1] - cdspos + prstart; + break; + } + } + + if (sxpos > 0) + { + int[] nxon = new int[exon.length - sxpos]; + System.arraycopy(exon, sxpos, nxon, 0, exon.length - sxpos); + exon = nxon; + } + } + if (prseq != null && prname != null && prid != null) { // extract proteins. - product = new Sequence(prid, prseq, prstart, prstart + prseq.length() - - 1); + product = new Sequence(prid, prseq, 1, prseq.length()); product .setDescription(((prname.length() == 0) ? "Protein Product from " + sourceDb @@ -599,21 +630,25 @@ public class EmblEntry System.err .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect (" + sourceDb + ":" + getAccession() + ")"); - if (prseq.length() * 3 == dna.getSequence().length) + if (prseq.length() * 3 == (1-prstart + dna.getSequence().length)) { + System.err + .println("Not allowing for additional stop codon at end of cDNA fragment... !"); // this might occur for CDS sequences where no features are // marked. exon = new int[] - { dna.getStart(), dna.getEnd() }; + { dna.getStart() + (prstart - 1), dna.getEnd() }; map = new jalview.datamodel.Mapping(product, exon, new int[] - { prstart, prstart + prseq.length() - 1 }, 3, 1); + { 1, prseq.length() }, 3, 1); } - if ((prseq.length() + 1) * 3 == dna.getSequence().length) + if ((prseq.length() + 1) * 3 == (1-prstart + dna.getSequence().length)) { + System.err + .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!"); exon = new int[] - { dna.getStart(), dna.getEnd() - 3 }; + { dna.getStart() + (prstart - 1), dna.getEnd() - 3 }; map = new jalview.datamodel.Mapping(product, exon, new int[] - { prstart, prstart + prseq.length() - 1 }, 3, 1); + { 1, prseq.length() }, 3, 1); } } else @@ -629,16 +664,20 @@ public class EmblEntry else { map = new jalview.datamodel.Mapping(product, exon, new int[] - { prstart, prstart + prseq.length() - 1 }, 3, 1); + { 1, prseq.length() }, 3, 1); // reconstruct the EMBLCDS entry DBRefEntry pcdnaref = new DBRefEntry(); pcdnaref.setAccessionId(prid); pcdnaref.setSource(DBRefSource.EMBLCDS); pcdnaref.setVersion(getVersion()); // same as parent EMBL version. jalview.util.MapList mp = new jalview.util.MapList(new int[] - { 1 + (prstart - 1) * 3, - 1 + (prstart - 1) * 3 + (prseq.length() - 1) * 3 }, new int[] - { prstart, prstart + prseq.length() - 1 }, 3, 1); + { 1, prseq.length() }, + new int[] + { 1 + (prstart - 1), + (prstart - 1) + 3 * prseq.length() }, 1, 3); + // { 1 + (prstart - 1) * 3, + // 1 + (prstart - 1) * 3 + prseq.length() * 3 - 1 }, new int[] + // { 1prstart, prstart + prseq.length() - 1 }, 3, 1); pcdnaref.setMap(new Mapping(mp)); if (product != null) product.addDBRef(pcdnaref); -- 1.7.10.2