* Data model for one entry returned from an EMBL query, as marshalled by a
* Castor binding file
*
- * For example:
- * http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ena_sequence&id=J03321
- * &format=emblxml
+ * For example: http://www.ebi.ac.uk/ena/data/view/J03321&display=xml
*
* @see embl_mapping.xml
*/
*/
public SequenceI getSequence(String sourceDb, List<SequenceI> peptides)
{
- SequenceI dna = new Sequence(sourceDb + "|" + accession,
- sequence.getSequence());
+ SequenceI dna = makeSequence(sourceDb);
+ if (dna == null)
+ {
+ return null;
+ }
dna.setDescription(description);
- DBRefEntry retrievedref = new DBRefEntry(sourceDb,
- getSequenceVersion(), accession);
+ DBRefEntry retrievedref = new DBRefEntry(sourceDb, getSequenceVersion(),
+ accession);
dna.addDBRef(retrievedref);
// add map to indicate the sequence is a valid coordinate frame for the
// dbref
- retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() },
- new int[] { 1, dna.getLength() }, 1, 1));
-
+ retrievedref
+ .setMap(new Mapping(null, new int[]
+ { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1,
+ 1));
/*
* transform EMBL Database refs to canonical form
}
/**
+ * Builds the nucleotide sequence object for this entry, named as
+ * {@code sourceDb|accession}. If this entry holds no sequence data, a message
+ * is written to stderr and null is returned.
+ *
+ * @param sourceDb
+ *          database name used as the prefix of the sequence name
+ * @return the new sequence, or null if this entry has no sequence data
+ */
+ SequenceI makeSequence(String sourceDb)
+ {
+ if (sequence != null)
+ {
+ String name = sourceDb + "|" + accession;
+ return new Sequence(name, sequence.getSequence());
+ }
+ // nothing to build a sequence from - report it and let the caller decide
+ System.err.println(
+ "No sequence was returned for ENA accession " + accession);
+ return null;
+ }
+
+ /**
* Extracts coding region and product from a CDS feature and properly decorates
* it with annotations.
*
* helper to match xrefs in already retrieved sequences
*/
void parseCodingFeature(EmblFeature feature, String sourceDb,
- SequenceI dna, List<SequenceI> peptides, SequenceIdMatcher matcher)
+ SequenceI dna, List<SequenceI> peptides,
+ SequenceIdMatcher matcher)
{
boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
String translation = null;
String proteinName = "";
String proteinId = null;
- Map<String, String> vals = new Hashtable<String, String>();
+ Map<String, String> vals = new Hashtable<>();
/*
* codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
if (qname.equals("translation"))
{
// remove all spaces (precompiled String.replaceAll(" ", ""))
- translation = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll("");
+ translation = SPACE_PATTERN.matcher(q.getValues()[0])
+ .replaceAll("");
}
else if (qname.equals("protein_id"))
{
codonStart = Integer.parseInt(q.getValues()[0].trim());
} catch (NumberFormatException e)
{
- System.err.println("Invalid codon_start in XML for "
- + accession + ": " + e.getMessage());
+ System.err.println("Invalid codon_start in XML for " + accession
+ + ": " + e.getMessage());
}
}
else if (qname.equals("product"))
Mapping dnaToProteinMapping = null;
if (translation != null && proteinName != null && proteinId != null)
{
+ int translationLength = translation.length();
+
/*
* look for product in peptides list, if not found, add it
*/
product = matcher.findIdMatch(proteinId);
if (product == null)
{
- product = new Sequence(proteinId, translation, 1, translation.length());
- product.setDescription(((proteinName.length() == 0) ? "Protein Product from "
- + sourceDb
+ product = new Sequence(proteinId, translation, 1,
+ translationLength);
+ product.setDescription(((proteinName.length() == 0)
+ ? "Protein Product from " + sourceDb
: proteinName));
peptides.add(product);
matcher.add(product);
// sequence
if (exons == null || exons.length == 0)
{
- System.err
- .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
+ /*
+ * workaround until we handle dna location for CDS sequence
+ * e.g. location="X53828.1:60..1058" correctly
+ */
+ System.err.println(
+ "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
+ sourceDb + ":" + getAccession() + ")");
- if (translation.length() * 3 == (1 - codonStart + dna.getSequence().length))
+ int dnaLength = dna.getLength();
+ if (translationLength * 3 == (1 - codonStart + dnaLength))
{
- System.err
- .println("Not allowing for additional stop codon at end of cDNA fragment... !");
- // this might occur for CDS sequences where no features are
- // marked.
+ System.err.println(
+ "Not allowing for additional stop codon at end of cDNA fragment... !");
+ // this might occur for CDS sequences where no features are marked
exons = new int[] { dna.getStart() + (codonStart - 1),
dna.getEnd() };
- dnaToProteinMapping = new Mapping(product, exons, new int[] { 1,
- translation.length() },
- 3, 1);
+ dnaToProteinMapping = new Mapping(product, exons,
+ new int[]
+ { 1, translationLength }, 3, 1);
}
- if ((translation.length() + 1) * 3 == (1 - codonStart + dna.getSequence().length))
+ if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength))
{
- System.err
- .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
+ System.err.println(
+ "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
exons = new int[] { dna.getStart() + (codonStart - 1),
dna.getEnd() - 3 };
- dnaToProteinMapping = new Mapping(product, exons, new int[] { 1,
- translation.length() },
- 3, 1);
+ dnaToProteinMapping = new Mapping(product, exons,
+ new int[]
+ { 1, translationLength }, 3, 1);
}
}
else
else
{
// final product length truncation check
- int[] cdsRanges = adjustForProteinLength(translation.length(), exons);
- dnaToProteinMapping = new Mapping(product, cdsRanges, new int[] { 1,
- translation.length() }, 3, 1);
+ int[] cdsRanges = adjustForProteinLength(translationLength,
+ exons);
+ dnaToProteinMapping = new Mapping(product, cdsRanges,
+ new int[]
+ { 1, translationLength }, 3, 1);
if (product != null)
{
/*
- * make xrefs from protein to EMBLCDS and EMBLCDSPROTEIN
+ * make xref with mapping from protein to EMBL dna
*/
- DBRefEntry proteinToEmblCdsRef = new DBRefEntry();
- proteinToEmblCdsRef.setAccessionId(proteinId);
- proteinToEmblCdsRef.setSource(DBRefSource.EMBLCDS);
- proteinToEmblCdsRef.setVersion(getSequenceVersion()); // same as
- // parent EMBL
- // version.
- MapList mp = new MapList(new int[] { 1, translation.length() },
- new int[] { 1 + (codonStart - 1),
- (codonStart - 1) + 3 * translation.length() }, 1, 3);
- proteinToEmblCdsRef.setMap(new Mapping(mp));
+ DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
+ getSequenceVersion(), proteinId,
+ new Mapping(dnaToProteinMapping.getMap().getInverse()));
+ product.addDBRef(proteinToEmblRef);
+
+ /*
+ * make xref from protein to EMBLCDS; we assume here that the
+ * CDS sequence version is same as dna sequence (?!)
+ */
+ MapList proteinToCdsMapList = new MapList(
+ new int[]
+ { 1, translationLength },
+ new int[]
+ { 1 + (codonStart - 1),
+ (codonStart - 1) + 3 * translationLength },
+ 1, 3);
+ DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
+ DBRefSource.EMBLCDS, getSequenceVersion(), proteinId,
+ new Mapping(proteinToCdsMapList));
product.addDBRef(proteinToEmblCdsRef);
+
+ /*
+ * make 'direct' xref from protein to EMBLCDSPROTEIN
+ */
proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
- MapList mp2 = new MapList(
- new int[] { 1, translation.length() }, new int[] { 1,
- translation.length() }, 1, 1);
- proteinToEmblProteinRef.setMap(new Mapping(mp2));
proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
+ proteinToEmblProteinRef.setMap(null);
product.addDBRef(proteinToEmblProteinRef);
}
}
/*
* add cds features to dna sequence
*/
- for (int xint = 0; exons != null && xint < exons.length; xint += 2)
+ String cds = feature.getName(); // "CDS"
+ for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
{
- SequenceFeature sf = makeCdsFeature(exons, xint, proteinName, proteinId, vals,
- codonStart);
- sf.setType(feature.getName()); // "CDS"
+ int exonStart = exons[xint];
+ int exonEnd = exons[xint + 1];
+ int begin = Math.min(exonStart, exonEnd);
+ int end = Math.max(exonStart, exonEnd);
+ int exonNumber = xint / 2 + 1;
+ String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s",
+ exonNumber, proteinName, proteinId);
+
+ SequenceFeature sf = makeCdsFeature(cds, desc, begin, end,
+ sourceDb, vals);
+
sf.setEnaLocation(feature.getLocation());
- sf.setFeatureGroup(sourceDb);
+ boolean forwardStrand = exonStart <= exonEnd;
+ sf.setStrand(forwardStrand ? "+" : "-");
+ sf.setPhase(String.valueOf(codonStart - 1));
+ sf.setValue(FeatureProperties.EXONPOS, exonNumber);
+ sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
+
dna.addSequenceFeature(sf);
}
}
*/
String source = DBRefUtils.getCanonicalName(ref.getSource());
ref.setSource(source);
- DBRefEntry proteinToDnaRef = new DBRefEntry(ref.getSource(), ref.getVersion(), ref
- .getAccessionId());
+ DBRefEntry proteinDbRef = new DBRefEntry(ref.getSource(),
+ ref.getVersion(), ref.getAccessionId());
if (source.equals(DBRefSource.UNIPROT))
{
String proteinSeqName = DBRefSource.UNIPROT + "|"
+ ref.getAccessionId();
- if (dnaToProteinMapping != null && dnaToProteinMapping.getTo() != null)
+ if (dnaToProteinMapping != null
+ && dnaToProteinMapping.getTo() != null)
{
if (mappingUsed)
{
peptides.add(proteinSeq);
}
dnaToProteinMapping.setTo(proteinSeq);
- proteinSeq.addDBRef(proteinToDnaRef);
+ dnaToProteinMapping.setMappedFromId(proteinId);
+ proteinSeq.addDBRef(proteinDbRef);
ref.setMap(dnaToProteinMapping);
}
hasUniprotDbref = true;
/*
* copy feature dbref to our protein product
*/
- DBRefEntry pref = proteinToDnaRef;
+ DBRefEntry pref = proteinDbRef;
pref.setMap(null); // reference is direct
product.addDBRef(pref);
// Add converse mapping reference
if (dnaToProteinMapping != null)
{
- Mapping pmap = new Mapping(dna, dnaToProteinMapping.getMap()
- .getInverse());
+ Mapping pmap = new Mapping(dna,
+ dnaToProteinMapping.getMap().getInverse());
pref = new DBRefEntry(sourceDb, getSequenceVersion(),
this.getAccession());
pref.setMap(pmap);
dna.addDBRef(ref);
}
}
+
/*
* if we have a product (translation) but no explicit Uniprot dbref
- * (example: EMBL AAFI02000057 protein_id EAL65544.1
- * construct mappings to an assumed EMBLCDSPROTEIN accession
+ * (example: EMBL AAFI02000057 protein_id EAL65544.1)
+ * then construct mappings to an assumed EMBLCDSPROTEIN accession
*/
if (!hasUniprotDbref && product != null)
{
if (proteinToEmblProteinRef == null)
{
- proteinToEmblProteinRef = new DBRefEntry();
- proteinToEmblProteinRef.setAccessionId(proteinId);
- proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
- proteinToEmblProteinRef.setVersion(getSequenceVersion());
- proteinToEmblProteinRef.setMap(new Mapping(product,
- dnaToProteinMapping.getMap().getInverse()));
+ // assuming CDSPROTEIN sequence version = dna version (?!)
+ proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct,
+ getSequenceVersion(), proteinId);
}
product.addDBRef(proteinToEmblProteinRef);
&& dnaToProteinMapping.getTo() != null)
{
DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
- proteinToEmblProteinRef);
+ DBRefSource.EMBLCDSProduct, getSequenceVersion(),
+ proteinId);
dnaToEmblProteinRef.setMap(dnaToProteinMapping);
+ dnaToProteinMapping.setMappedFromId(proteinId);
dna.addDBRef(dnaToEmblProteinRef);
}
}
/**
* Helper method to construct a SequenceFeature for one cds range
*
- * @param exons
- * array of cds [start, end, ...] positions
- * @param exonStartIndex
- * offset into the exons array
- * @param proteinName
- * @param proteinAccessionId
+ * @param type
+ * feature type ("CDS")
+ * @param desc
+ * description
+ * @param begin
+ * start position
+ * @param end
+ * end position
+ * @param group
+ * feature group
* @param vals
* map of 'miscellaneous values' for feature
- * @param codonStart
- * codon start position for CDS (1/2/3, normally 1)
* @return
*/
- protected SequenceFeature makeCdsFeature(int[] exons, int exonStartIndex,
- String proteinName, String proteinAccessionId,
- Map<String, String> vals, int codonStart)
- {
- int exonNumber = exonStartIndex / 2 + 1;
- SequenceFeature sf = new SequenceFeature();
- sf.setBegin(Math.min(exons[exonStartIndex], exons[exonStartIndex + 1]));
- sf.setEnd(Math.max(exons[exonStartIndex], exons[exonStartIndex + 1]));
- sf.setDescription(String.format("Exon %d for protein '%s' EMBLCDS:%s",
- exonNumber, proteinName, proteinAccessionId));
- sf.setPhase(String.valueOf(codonStart - 1));
- sf.setStrand(exons[exonStartIndex] <= exons[exonStartIndex + 1] ? "+"
- : "-");
- sf.setValue(FeatureProperties.EXONPOS, exonNumber);
- sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
+ protected SequenceFeature makeCdsFeature(String type, String desc,
+ int begin, int end, String group, Map<String, String> vals)
+ {
+ SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group);
if (!vals.isEmpty())
{
StringBuilder sb = new StringBuilder();
return listToArray(ranges);
} catch (ParseException e)
{
- Cache.log.warn(String.format(
- "Not parsing inexact CDS location %s in ENA %s",
- feature.location, this.accession));
+ Cache.log.warn(
+ String.format("Not parsing inexact CDS location %s in ENA %s",
+ feature.location, this.accession));
return new int[] {};
}
}