* Data model for one entry returned from an EMBL query, as marshalled by a
* Castor binding file
*
- * For example:
- * http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ena_sequence&id=J03321
- * &format=emblxml
+ * For example: http://www.ebi.ac.uk/ena/data/view/J03321&display=xml
*
* @see embl_mapping.xml
*/
*/
public SequenceI getSequence(String sourceDb, List<SequenceI> peptides)
{
- SequenceI dna = new Sequence(sourceDb + "|" + accession,
- sequence.getSequence());
+ SequenceI dna = makeSequence(sourceDb);
+ if (dna == null)
+ {
+ return null;
+ }
dna.setDescription(description);
DBRefEntry retrievedref = new DBRefEntry(sourceDb,
getSequenceVersion(), accession);
retrievedref.setMap(new Mapping(null, new int[] { 1, dna.getLength() },
new int[] { 1, dna.getLength() }, 1, 1));
-
/*
* transform EMBL Database refs to canonical form
*/
}
/**
+ * Builds the nucleotide sequence object for this entry. The new sequence is
+ * named <code>sourceDb|accession</code> and holds the residues returned by
+ * <code>sequence.getSequence()</code>. If this entry has no sequence data
+ * (<code>sequence</code> is null), a message naming the accession is written
+ * to <code>System.err</code> and null is returned, so callers must check for
+ * a null result.
+ *
+ * @param sourceDb
+ *          the database name used as the prefix of the sequence name
+ * @return the newly constructed sequence, or null if no sequence was
+ *         returned for this accession
+ */
+ SequenceI makeSequence(String sourceDb)
+ {
+ if (sequence == null)
+ {
+ System.err.println("No sequence was returned for ENA accession "
+ + accession);
+ return null;
+ }
+ SequenceI dna = new Sequence(sourceDb + "|" + accession,
+ sequence.getSequence());
+ return dna;
+ }
+
+ /**
* Extracts coding region and product from a CDS feature and properly decorates
* it with annotations.
*
if (qname.equals("translation"))
{
// remove all spaces (precompiled String.replaceAll(" ", ""))
- translation = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll("");
+ translation = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll(
+ "");
}
else if (qname.equals("protein_id"))
{
Mapping dnaToProteinMapping = null;
if (translation != null && proteinName != null && proteinId != null)
{
+ int translationLength = translation.length();
+
/*
* look for product in peptides list, if not found, add it
*/
product = matcher.findIdMatch(proteinId);
if (product == null)
{
- product = new Sequence(proteinId, translation, 1, translation.length());
+ product = new Sequence(proteinId, translation, 1, translationLength);
product.setDescription(((proteinName.length() == 0) ? "Protein Product from "
+ sourceDb
: proteinName));
// sequence
if (exons == null || exons.length == 0)
{
+ /*
+ * workaround until we handle dna location for CDS sequence
+ * e.g. location="X53828.1:60..1058" correctly
+ */
System.err
.println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
+ sourceDb + ":" + getAccession() + ")");
- if (translation.length() * 3 == (1 - codonStart + dna.getSequence().length))
+ if (translationLength * 3 == (1 - codonStart + dna.getSequence().length))
{
System.err
.println("Not allowing for additional stop codon at end of cDNA fragment... !");
- // this might occur for CDS sequences where no features are
- // marked.
+ // this might occur for CDS sequences where no features are marked
exons = new int[] { dna.getStart() + (codonStart - 1),
dna.getEnd() };
dnaToProteinMapping = new Mapping(product, exons, new int[] { 1,
- translation.length() },
- 3, 1);
+ translationLength }, 3, 1);
}
- if ((translation.length() + 1) * 3 == (1 - codonStart + dna.getSequence().length))
+ if ((translationLength + 1) * 3 == (1 - codonStart + dna
+ .getSequence().length))
{
System.err
.println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
exons = new int[] { dna.getStart() + (codonStart - 1),
dna.getEnd() - 3 };
dnaToProteinMapping = new Mapping(product, exons, new int[] { 1,
- translation.length() },
- 3, 1);
+ translationLength }, 3, 1);
}
}
else
else
{
// final product length truncation check
- int[] cdsRanges = adjustForProteinLength(translation.length(), exons);
- dnaToProteinMapping = new Mapping(product, cdsRanges, new int[] { 1,
- translation.length() }, 3, 1);
+ int[] cdsRanges = adjustForProteinLength(translationLength, exons);
+ dnaToProteinMapping = new Mapping(product, cdsRanges, new int[] {
+ 1, translationLength }, 3, 1);
if (product != null)
{
/*
- * make xrefs from protein to EMBLCDS and EMBLCDSPROTEIN
+ * make xref with mapping from protein to EMBL dna
*/
- DBRefEntry proteinToEmblCdsRef = new DBRefEntry();
- proteinToEmblCdsRef.setAccessionId(proteinId);
- proteinToEmblCdsRef.setSource(DBRefSource.EMBLCDS);
- proteinToEmblCdsRef.setVersion(getSequenceVersion()); // same as
- // parent EMBL
- // version.
- MapList mp = new MapList(new int[] { 1, translation.length() },
- new int[] { 1 + (codonStart - 1),
- (codonStart - 1) + 3 * translation.length() }, 1, 3);
- proteinToEmblCdsRef.setMap(new Mapping(mp));
+ DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
+ getSequenceVersion(), proteinId, new Mapping(
+ dnaToProteinMapping.getMap().getInverse()));
+ product.addDBRef(proteinToEmblRef);
+
+ /*
+ * make xref from protein to EMBLCDS; we assume here that the
+ * CDS sequence version is same as dna sequence (?!)
+ */
+ MapList proteinToCdsMapList = new MapList(new int[] { 1,
+ translationLength }, new int[] { 1 + (codonStart - 1),
+ (codonStart - 1) + 3 * translationLength }, 1, 3);
+ DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
+ DBRefSource.EMBLCDS, getSequenceVersion(), proteinId,
+ new Mapping(proteinToCdsMapList));
product.addDBRef(proteinToEmblCdsRef);
+
+ /*
+ * make 'direct' xref from protein to EMBLCDSPROTEIN
+ */
proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
- MapList mp2 = new MapList(
- new int[] { 1, translation.length() }, new int[] { 1,
- translation.length() }, 1, 1);
- proteinToEmblProteinRef.setMap(new Mapping(mp2));
proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
+ proteinToEmblProteinRef.setMap(null);
product.addDBRef(proteinToEmblProteinRef);
}
}
*/
for (int xint = 0; exons != null && xint < exons.length; xint += 2)
{
- SequenceFeature sf = makeCdsFeature(exons, xint, proteinName, proteinId, vals,
- codonStart);
+ SequenceFeature sf = makeCdsFeature(exons, xint, proteinName,
+ proteinId, vals, codonStart);
sf.setType(feature.getName()); // "CDS"
sf.setEnaLocation(feature.getLocation());
sf.setFeatureGroup(sourceDb);
*/
String source = DBRefUtils.getCanonicalName(ref.getSource());
ref.setSource(source);
- DBRefEntry proteinToDnaRef = new DBRefEntry(ref.getSource(), ref.getVersion(), ref
- .getAccessionId());
+ DBRefEntry proteinDbRef = new DBRefEntry(ref.getSource(),
+ ref.getVersion(), ref.getAccessionId());
if (source.equals(DBRefSource.UNIPROT))
{
String proteinSeqName = DBRefSource.UNIPROT + "|"
+ ref.getAccessionId();
- if (dnaToProteinMapping != null && dnaToProteinMapping.getTo() != null)
+ if (dnaToProteinMapping != null
+ && dnaToProteinMapping.getTo() != null)
{
if (mappingUsed)
{
peptides.add(proteinSeq);
}
dnaToProteinMapping.setTo(proteinSeq);
- proteinSeq.addDBRef(proteinToDnaRef);
+ dnaToProteinMapping.setMappedFromId(proteinId);
+ proteinSeq.addDBRef(proteinDbRef);
ref.setMap(dnaToProteinMapping);
}
hasUniprotDbref = true;
/*
* copy feature dbref to our protein product
*/
- DBRefEntry pref = proteinToDnaRef;
+ DBRefEntry pref = proteinDbRef;
pref.setMap(null); // reference is direct
product.addDBRef(pref);
// Add converse mapping reference
dna.addDBRef(ref);
}
}
+
/*
* if we have a product (translation) but no explicit Uniprot dbref
- * (example: EMBL AAFI02000057 protein_id EAL65544.1
- * construct mappings to an assumed EMBLCDSPROTEIN accession
+ * (example: EMBL AAFI02000057 protein_id EAL65544.1)
+ * then construct mappings to an assumed EMBLCDSPROTEIN accession
*/
if (!hasUniprotDbref && product != null)
{
if (proteinToEmblProteinRef == null)
{
- proteinToEmblProteinRef = new DBRefEntry();
- proteinToEmblProteinRef.setAccessionId(proteinId);
- proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
- proteinToEmblProteinRef.setVersion(getSequenceVersion());
- proteinToEmblProteinRef.setMap(new Mapping(product,
- dnaToProteinMapping.getMap().getInverse()));
+ // assuming CDSPROTEIN sequence version = dna version (?!)
+ proteinToEmblProteinRef = new DBRefEntry(
+ DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId);
}
product.addDBRef(proteinToEmblProteinRef);
&& dnaToProteinMapping.getTo() != null)
{
DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
- proteinToEmblProteinRef);
+ DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId);
dnaToEmblProteinRef.setMap(dnaToProteinMapping);
+ dnaToProteinMapping.setMappedFromId(proteinId);
dna.addDBRef(dnaToEmblProteinRef);
}
}