mappedCds = makeCdsSequences(ds, acf,
+ newMapping);
+ if (!mappedCds.isEmpty())
+ {
+ cdsSequences.addAll(mappedCds);
+ newMappings.add(newMapping);
+ }
+ }
+ }
+ AlignmentI al = new Alignment(
+ cdsSequences.toArray(new SequenceI[cdsSequences.size()]));
+ al.setDataset(null);
+
+ /*
+ * Replace the old mappings with the new ones
+ */
+ mappings.clear();
+ mappings.addAll(newMappings);
+
+ return al;
+ }
+
+ /**
+ * Helper method to make cds-only sequences and populate their mappings to
+ * protein products
+ *
+ * For example, if ggCCaTTcGAg has mappings [3, 4, 6, 7, 9, 10] to protein
+ * then generate a sequence CCTTGA with mapping [1, 6] to the same protein
+ * residues
+ *
+ * Typically eukaryotic dna will include cds encoding for a single peptide
+ * sequence i.e. return a single result. Bacterial dna may have overlapping
+ * cds mappings coding for multiple peptides so return multiple results
+ * (example EMBL KF591215).
+ *
+ * @param dnaSeq
+ * a dna dataset sequence
+ * @param mapping
+ * containing one or more mappings of the sequence to protein
+ * @param newMappings
+ * the new mapping to populate, from the cds-only sequences to their
+ * mapped protein sequences
+ * @return
+ */
+ protected static List makeCdsSequences(SequenceI dnaSeq,
+ AlignedCodonFrame mapping, AlignedCodonFrame newMappings)
+ {
+ List cdsSequences = new ArrayList();
+ List seqMappings = mapping.getMappingsForSequence(dnaSeq);
+
+ for (Mapping seqMapping : seqMappings)
+ {
+ SequenceI cds = makeCdsSequence(dnaSeq, seqMapping, newMappings);
+ cdsSequences.add(cds);
+
+ /*
+ * add new mappings, from dna to cds, and from cds to peptide
+ */
+ MapList dnaToCds = addCdsMappings(dnaSeq, cds, seqMapping,
+ newMappings);
+
+ /*
+ * transfer any features on dna that overlap the CDS
+ */
+ transferFeatures(dnaSeq, cds, dnaToCds, "CDS" /* SequenceOntology.CDS */);
+ }
+ return cdsSequences;
+ }
+
+ /**
+ * Transfers any co-located features on 'fromSeq' to 'toSeq', adjusting the
+ * feature start/end ranges, optionally omitting specified feature types.
+ *
+ * @param fromSeq
+ * @param toSeq
+ * @param mapping
+ * the mapping from 'fromSeq' to 'toSeq'
+ * @param omitting
+ */
+ protected static void transferFeatures(SequenceI fromSeq,
+ SequenceI toSeq, MapList mapping, String... omitting)
+ {
+ SequenceFeature[] sfs = fromSeq.getSequenceFeatures();
+ if (sfs != null)
+ {
+ for (SequenceFeature sf : sfs)
+ {
+ String type = sf.getType();
+ boolean omit = false;
+ for (String toOmit : omitting)
+ {
+ if (type.equals(toOmit))
+ {
+ omit = true;
+ }
+ }
+ if (omit)
+ {
+ continue;
+ }
+
+ /*
+ * locate the mapped range - null if either start or end is
+ * not mapped (no partial overlaps are calculated)
+ */
+ int[] mappedTo = mapping.locateInTo(sf.getBegin(), sf.getEnd());
+ if (mappedTo != null)
+ {
+ SequenceFeature copy = new SequenceFeature(sf);
+ copy.setBegin(Math.min(mappedTo[0], mappedTo[1]));
+ copy.setEnd(Math.max(mappedTo[0], mappedTo[1]));
+ toSeq.addSequenceFeature(copy);
+ }
+ }
+ }
+ }
+
+ /**
+ * Creates and adds mappings
+ *
+ * - from cds to peptide
+ * - from dna to cds
+ *
+ * and returns the dna-to-cds mapping
+ *
+ * @param dnaSeq
+ * @param cdsSeq
+ * @param dnaMapping
+ * @param newMappings
+ * @return
+ */
+ protected static MapList addCdsMappings(SequenceI dnaSeq,
+ SequenceI cdsSeq,
+ Mapping dnaMapping, AlignedCodonFrame newMappings)
+ {
+ cdsSeq.createDatasetSequence();
+
+ /*
+ * CDS to peptide is just a contiguous 3:1 mapping, with
+ * the peptide ranges taken unchanged from the dna mapping
+ */
+ List cdsRanges = new ArrayList();
+ cdsRanges.add(new int[] { 1, cdsSeq.getLength() });
+ MapList cdsToPeptide = new MapList(cdsRanges, dnaMapping.getMap()
+ .getToRanges(), 3, 1);
+ newMappings.addMap(cdsSeq.getDatasetSequence(), dnaMapping.getTo(),
+ cdsToPeptide);
+
+ /*
+ * dna 'from' ranges map 1:1 to the contiguous extracted CDS
+ */
+ MapList dnaToCds = new MapList(
+ dnaMapping.getMap().getFromRanges(), cdsRanges, 1, 1);
+ newMappings.addMap(dnaSeq, cdsSeq.getDatasetSequence(), dnaToCds);
+ return dnaToCds;
+ }
+
+ /**
+ * Makes and returns a CDS-only sequence, where the CDS regions are identified
+ * as the 'from' ranges of the mapping on the dna. Any sequence features on
+ * the dna which overlap the CDS regions are copied to the new sequence.
+ *
+ * @param dnaSeq
+ * nucleotide sequence
+ * @param seqMapping
+ * mappings from CDS regions of nucleotide
+ * @param exonMappings
+ * CDS-to-peptide mapping (to add to)
+ * @return
+ */
+ protected static SequenceI makeCdsSequence(SequenceI dnaSeq,
+ Mapping seqMapping, AlignedCodonFrame exonMappings)
+ {
+ StringBuilder newSequence = new StringBuilder(dnaSeq.getLength());
+ final char[] dna = dnaSeq.getSequence();
+ int offset = dnaSeq.getStart() - 1;
+
+ /*
+ * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc }
+ */
+ final List dnaCdsRanges = seqMapping.getMap().getFromRanges();
+ for (int[] range : dnaCdsRanges)
+ {
+ // TODO handle reverse mapping as well (range[1] < range[0])
+ for (int pos = range[0]; pos <= range[1]; pos++)
+ {
+ newSequence.append(dna[pos - offset - 1]);
+ }
+ }
+
+ SequenceI cds = new Sequence(dnaSeq.getName(),
+ newSequence.toString());
+
+ transferDbRefs(seqMapping.getTo(), cds);
+ return cds;
+ }
+
+ /**
+ * Locate any xrefs to CDS databases on the protein product and attach to the
+ * CDS sequence. Also add as a sub-token of the sequence name.
+ *
+ * @param from
+ * @param to
+ */
+ protected static void transferDbRefs(SequenceI from, SequenceI to)
+ {
+ String cdsAccId = FeatureProperties.getCodingFeature(DBRefSource.EMBL);
+ DBRefEntry[] cdsRefs = DBRefUtils.selectRefs(from.getDBRef(),
+ DBRefSource.CODINGDBS);
+ if (cdsRefs != null)
+ {
+ for (DBRefEntry cdsRef : cdsRefs)
+ {
+ to.addDBRef(new DBRefEntry(cdsRef));
+ cdsAccId = cdsRef.getAccessionId();
+ }
+ }
+ if (!to.getName().contains(cdsAccId))
+ {
+ to.setName(to.getName() + "|" + cdsAccId);
+ }
+ }
}