+ void parseCodingFeature(EmblFeature feature, String sourceDb,
+ SequenceI dna, List<SequenceI> peptides, SequenceIdMatcher matcher)
+ {
+ boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
+
+ int[] exons = getCdsRanges(feature);
+
+ String translation = null;
+ String proteinName = "";
+ String proteinId = null;
+ Map<String, String> vals = new Hashtable<String, String>();
+
+ /*
+ * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
+ * (phase is required for CDS features in GFF3 format)
+ */
+ int codonStart = 1;
+
+ /*
+ * parse qualifiers, saving protein translation, protein id,
+ * codon start position, product (name), and 'other values'
+ */
+ if (feature.getQualifiers() != null)
+ {
+ for (Qualifier q : feature.getQualifiers())
+ {
+ String qname = q.getName();
+ if (qname.equals("translation"))
+ {
+ // remove all spaces (precompiled String.replaceAll(" ", ""))
+ translation = SPACE_PATTERN.matcher(q.getValues()[0]).replaceAll("");
+ }
+ else if (qname.equals("protein_id"))
+ {
+ proteinId = q.getValues()[0].trim();
+ }
+ else if (qname.equals("codon_start"))
+ {
+ try
+ {
+ codonStart = Integer.parseInt(q.getValues()[0].trim());
+ } catch (NumberFormatException e)
+ {
+ System.err.println("Invalid codon_start in XML for "
+ + accession + ": " + e.getMessage());
+ }
+ }
+ else if (qname.equals("product"))
+ {
+ // sometimes name is returned e.g. for V00488
+ proteinName = q.getValues()[0].trim();
+ }
+ else
+ {
+ // throw anything else into the additional properties hash
+ String[] qvals = q.getValues();
+ if (qvals != null)
+ {
+ String commaSeparated = StringUtils.arrayToSeparatorList(qvals,
+ ",");
+ vals.put(qname, commaSeparated);
+ }
+ }
+ }
+ }
+
+ DBRefEntry proteinToEmblProteinRef = null;
+ exons = MappingUtils.removeStartPositions(codonStart - 1, exons);
+
+ SequenceI product = null;
+ Mapping dnaToProteinMapping = null;
+ if (translation != null && proteinName != null && proteinId != null)
+ {
+ int translationLength = translation.length();
+
+ /*
+ * look for product in peptides list, if not found, add it
+ */
+ product = matcher.findIdMatch(proteinId);
+ if (product == null)
+ {
+ product = new Sequence(proteinId, translation, 1, translationLength);
+ product.setDescription(((proteinName.length() == 0) ? "Protein Product from "
+ + sourceDb
+ : proteinName));
+ peptides.add(product);
+ matcher.add(product);
+ }
+
+ // we have everything - create the mapping and perhaps the protein
+ // sequence
+ if (exons == null || exons.length == 0)
+ {
+ /*
+ * workaround until we handle dna location for CDS sequence
+ * e.g. location="X53828.1:60..1058" correctly
+ */
+ System.err
+ .println("Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
+ + sourceDb + ":" + getAccession() + ")");
+ if (translationLength * 3 == (1 - codonStart + dna.getSequence().length))
+ {
+ System.err
+ .println("Not allowing for additional stop codon at end of cDNA fragment... !");
+ // this might occur for CDS sequences where no features are marked
+ exons = new int[] { dna.getStart() + (codonStart - 1),
+ dna.getEnd() };
+ dnaToProteinMapping = new Mapping(product, exons, new int[] { 1,
+ translationLength }, 3, 1);
+ }
+ if ((translationLength + 1) * 3 == (1 - codonStart + dna
+ .getSequence().length))
+ {
+ System.err
+ .println("Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
+ exons = new int[] { dna.getStart() + (codonStart - 1),
+ dna.getEnd() - 3 };
+ dnaToProteinMapping = new Mapping(product, exons, new int[] { 1,
+ translationLength }, 3, 1);
+ }
+ }
+ else
+ {
+ // Trim the exon mapping if necessary - the given product may only be a
+ // fragment of a larger protein. (EMBL:AY043181 is an example)
+
+ if (isEmblCdna)
+ {
+ // TODO: Add a DbRef back to the parent EMBL sequence with the exon
+ // map
+ // if given a dataset reference, search dataset for parent EMBL
+ // sequence if it exists and set its map
+ // make a new feature annotating the coding contig
+ }
+ else
+ {
+ // final product length truncation check
+ int[] cdsRanges = adjustForProteinLength(translationLength, exons);
+ dnaToProteinMapping = new Mapping(product, cdsRanges, new int[] {
+ 1, translationLength }, 3, 1);
+ if (product != null)
+ {
+ /*
+ * make xref with mapping from protein to EMBL dna
+ */
+ DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
+ getSequenceVersion(), proteinId, new Mapping(
+ dnaToProteinMapping.getMap().getInverse()));
+ product.addDBRef(proteinToEmblRef);
+
+ /*
+ * make xref from protein to EMBLCDS; we assume here that the
+ * CDS sequence version is same as dna sequence (?!)
+ */
+ MapList proteinToCdsMapList = new MapList(new int[] { 1,
+ translationLength }, new int[] { 1 + (codonStart - 1),
+ (codonStart - 1) + 3 * translationLength }, 1, 3);
+ DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
+ DBRefSource.EMBLCDS, getSequenceVersion(), proteinId,
+ new Mapping(proteinToCdsMapList));
+ product.addDBRef(proteinToEmblCdsRef);
+
+ /*
+ * make 'direct' xref from protein to EMBLCDSPROTEIN
+ */
+ proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
+ proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
+ proteinToEmblProteinRef.setMap(null);
+ product.addDBRef(proteinToEmblProteinRef);
+ }
+ }
+ }
+
+ /*
+ * add cds features to dna sequence
+ */
+ for (int xint = 0; exons != null && xint < exons.length; xint += 2)
+ {
+ SequenceFeature sf = makeCdsFeature(exons, xint, proteinName,
+ proteinId, vals, codonStart);
+ sf.setType(feature.getName()); // "CDS"
+ sf.setEnaLocation(feature.getLocation());
+ sf.setFeatureGroup(sourceDb);
+ dna.addSequenceFeature(sf);
+ }
+ }
+
+ /*
+ * add feature dbRefs to sequence, and mappings for Uniprot xrefs
+ */
+ boolean hasUniprotDbref = false;
+ if (feature.dbRefs != null)
+ {
+ boolean mappingUsed = false;
+ for (DBRefEntry ref : feature.dbRefs)
+ {
+ /*
+ * ensure UniProtKB/Swiss-Prot converted to UNIPROT
+ */
+ String source = DBRefUtils.getCanonicalName(ref.getSource());
+ ref.setSource(source);
+ DBRefEntry proteinDbRef = new DBRefEntry(ref.getSource(), ref.getVersion(), ref
+ .getAccessionId());
+ if (source.equals(DBRefSource.UNIPROT))
+ {
+ String proteinSeqName = DBRefSource.UNIPROT + "|"
+ + ref.getAccessionId();
+ if (dnaToProteinMapping != null && dnaToProteinMapping.getTo() != null)
+ {
+ if (mappingUsed)
+ {
+ /*
+ * two or more Uniprot xrefs for the same CDS -
+ * each needs a distinct Mapping (as to a different sequence)
+ */
+ dnaToProteinMapping = new Mapping(dnaToProteinMapping);
+ }
+ mappingUsed = true;
+
+ /*
+ * try to locate the protein mapped to (possibly by a
+ * previous CDS feature); if not found, construct it from
+ * the EMBL translation
+ */
+ SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
+ if (proteinSeq == null)
+ {
+ proteinSeq = new Sequence(proteinSeqName,
+ product.getSequenceAsString());
+ matcher.add(proteinSeq);
+ peptides.add(proteinSeq);
+ }
+ dnaToProteinMapping.setTo(proteinSeq);
+ dnaToProteinMapping.setMappedFromId(proteinId);
+ proteinSeq.addDBRef(proteinDbRef);
+ proteinSeq.setSourceDBRef(proteinDbRef);
+ ref.setMap(dnaToProteinMapping);
+ }
+ hasUniprotDbref = true;
+ }
+ if (product != null)
+ {
+ /*
+ * copy feature dbref to our protein product
+ */
+ DBRefEntry pref = proteinDbRef;
+ pref.setMap(null); // reference is direct
+ product.addDBRef(pref);
+ // Add converse mapping reference
+ if (dnaToProteinMapping != null)
+ {
+ Mapping pmap = new Mapping(dna, dnaToProteinMapping.getMap()
+ .getInverse());
+ pref = new DBRefEntry(sourceDb, getSequenceVersion(),
+ this.getAccession());
+ pref.setMap(pmap);
+ if (dnaToProteinMapping.getTo() != null)
+ {
+ dnaToProteinMapping.getTo().addDBRef(pref);
+ }
+ }
+ }
+ dna.addDBRef(ref);
+ }
+ }
+
+ /*
+ * if we have a product (translation) but no explicit Uniprot dbref
+ * (example: EMBL AAFI02000057 protein_id EAL65544.1)
+ * then construct mappings to an assumed EMBLCDSPROTEIN accession
+ */
+ if (!hasUniprotDbref && product != null)
+ {
+ if (proteinToEmblProteinRef == null)
+ {
+ // assuming CDSPROTEIN sequence version = dna version (?!)
+ proteinToEmblProteinRef = new DBRefEntry(
+ DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId);
+ }
+ product.addDBRef(proteinToEmblProteinRef);
+ product.setSourceDBRef(proteinToEmblProteinRef);
+
+ if (dnaToProteinMapping != null
+ && dnaToProteinMapping.getTo() != null)
+ {
+ DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
+ DBRefSource.EMBLCDSProduct, getSequenceVersion(), proteinId);
+ dnaToEmblProteinRef.setMap(dnaToProteinMapping);
+ dnaToProteinMapping.setMappedFromId(proteinId);
+ dna.addDBRef(dnaToEmblProteinRef);
+ }
+ }