import jalview.datamodel.Mapping;
import jalview.datamodel.SearchResults;
import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceGroup;
import jalview.datamodel.SequenceI;
import jalview.schemes.ResidueProperties;
}
/**
- * Constructs an alignment consisting of the mapped exon regions in the given
+ * Constructs an alignment consisting of the mapped cds regions in the given
* nucleotide sequences, and updates mappings to match.
*
* @param dna
* aligned dna sequences
* @param mappings
* from dna to protein; these are replaced with new mappings
- * @return an alignment whose sequences are the exon-only parts of the dna
- * sequences (or null if no exons are found)
+ * @return an alignment whose sequences are the cds-only parts of the dna
+ * sequences (or null if no cds are found)
*/
- public static AlignmentI makeExonAlignment(SequenceI[] dna,
+ public static AlignmentI makeCdsAlignment(SequenceI[] dna,
List<AlignedCodonFrame> mappings)
{
List<AlignedCodonFrame> newMappings = new ArrayList<AlignedCodonFrame>();
- List<SequenceI> exonSequences = new ArrayList<SequenceI>();
+ List<SequenceI> cdsSequences = new ArrayList<SequenceI>();
for (SequenceI dnaSeq : dna)
{
for (AlignedCodonFrame acf : seqMappings)
{
AlignedCodonFrame newMapping = new AlignedCodonFrame();
- final List<SequenceI> mappedExons = makeExonSequences(ds, acf,
+ final List<SequenceI> mappedCds = makeCdsSequences(ds, acf,
newMapping);
- if (!mappedExons.isEmpty())
+ if (!mappedCds.isEmpty())
{
- exonSequences.addAll(mappedExons);
+ cdsSequences.addAll(mappedCds);
newMappings.add(newMapping);
}
}
}
AlignmentI al = new Alignment(
- exonSequences.toArray(new SequenceI[exonSequences.size()]));
+ cdsSequences.toArray(new SequenceI[cdsSequences.size()]));
al.setDataset(null);
/*
}
/**
- * Helper method to make exon-only sequences and populate their mappings to
+ * Helper method to make cds-only sequences and populate their mappings to
* protein products
* <p>
* For example, if ggCCaTTcGAg has mappings [3, 4, 6, 7, 9, 10] to protein
* then generate a sequence CCTTGA with mapping [1, 6] to the same protein
* residues
* <p>
- * Typically eukaryotic dna will include exons encoding for a single peptide
+ * Typically eukaryotic dna will include cds encoding for a single peptide
* sequence i.e. return a single result. Bacterial dna may have overlapping
- * exon mappings coding for multiple peptides so return multiple results
+ * cds mappings coding for multiple peptides so return multiple results
* (example EMBL KF591215).
*
* @param dnaSeq
* a dna dataset sequence
* @param mapping
* containing one or more mappings of the sequence to protein
- * @param newMapping
- * the new mapping to populate, from the exon-only sequences to their
+ * @param newMappings
+ * the new mapping to populate, from the cds-only sequences to their
* mapped protein sequences
* @return
*/
- protected static List<SequenceI> makeExonSequences(SequenceI dnaSeq,
- AlignedCodonFrame mapping, AlignedCodonFrame newMapping)
+ protected static List<SequenceI> makeCdsSequences(SequenceI dnaSeq,
+ AlignedCodonFrame mapping, AlignedCodonFrame newMappings)
{
- List<SequenceI> exonSequences = new ArrayList<SequenceI>();
+ List<SequenceI> cdsSequences = new ArrayList<SequenceI>();
List<Mapping> seqMappings = mapping.getMappingsForSequence(dnaSeq);
- final char[] dna = dnaSeq.getSequence();
+
for (Mapping seqMapping : seqMappings)
{
- StringBuilder newSequence = new StringBuilder(dnaSeq.getLength());
+ SequenceI cds = makeCdsSequence(dnaSeq, seqMapping, newMappings);
+ cdsSequences.add(cds);
+
+ /*
+ * add new mappings, from dna to cds, and from cds to peptide
+ */
+ MapList dnaToCds = addCdsMappings(dnaSeq, cds, seqMapping,
+ newMappings);
/*
- * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc }
+ * transfer any features on dna that overlap the CDS
*/
- final List<int[]> dnaExonRanges = seqMapping.getMap().getFromRanges();
- for (int[] range : dnaExonRanges)
+ transferFeatures(dnaSeq, cds, dnaToCds, "CDS" /* SequenceOntology.CDS */);
+ }
+ return cdsSequences;
+ }
+
+ /**
+ * Transfers any co-located features on 'fromSeq' to 'toSeq', adjusting the
+ * feature start/end ranges, optionally omitting specified feature types.
+ *
+ * @param fromSeq
+ * @param toSeq
+ * @param mapping
+ * the mapping from 'fromSeq' to 'toSeq'
+ * @param omitting
+ */
+ protected static void transferFeatures(SequenceI fromSeq,
+ SequenceI toSeq, MapList mapping, String... omitting)
+ {
+ SequenceFeature[] sfs = fromSeq.getSequenceFeatures();
+ if (sfs != null)
+ {
+ for (SequenceFeature sf : sfs)
{
- for (int pos = range[0]; pos <= range[1]; pos++)
+ String type = sf.getType();
+ boolean omit = false;
+ for (String toOmit : omitting)
+ {
+ if (type.equals(toOmit))
+ {
+ omit = true;
+ }
+ }
+ if (omit)
{
- newSequence.append(dna[pos - 1]);
+ continue;
+ }
+
+ /*
+ * locate the mapped range - null if either start or end is
+ * not mapped (no partial overlaps are calculated)
+ */
+ int[] mappedTo = mapping.locateInTo(sf.getBegin(), sf.getEnd());
+ if (mappedTo != null)
+ {
+ SequenceFeature copy = new SequenceFeature(sf);
+ copy.setBegin(Math.min(mappedTo[0], mappedTo[1]));
+ copy.setEnd(Math.max(mappedTo[0], mappedTo[1]));
+ toSeq.addSequenceFeature(copy);
}
}
+ }
+ }
- SequenceI exon = new Sequence(dnaSeq.getName(),
- newSequence.toString());
+ /**
+ * Creates and adds mappings
+ * <ul>
+ * <li>from cds to peptide</li>
+ * <li>from dna to cds</li>
+ * </ul>
+ * and returns the dna-to-cds mapping
+ *
+ * @param dnaSeq
+ * @param cdsSeq
+ * @param dnaMapping
+ * @param newMappings
+ * @return
+ */
+ protected static MapList addCdsMappings(SequenceI dnaSeq,
+ SequenceI cdsSeq,
+ Mapping dnaMapping, AlignedCodonFrame newMappings)
+ {
+ cdsSeq.createDatasetSequence();
- /*
- * Locate any xrefs to CDS database on the protein product and attach to
- * the CDS sequence. Also add as a sub-token of the sequence name.
- */
- // default to "CDS" if we can't locate an actual gene id
- String cdsAccId = FeatureProperties
- .getCodingFeature(DBRefSource.EMBL);
- DBRefEntry[] cdsRefs = DBRefUtils.selectRefs(seqMapping.getTo()
- .getDBRef(), DBRefSource.CODINGDBS);
- if (cdsRefs != null)
+ /*
+ * CDS to peptide is just a contiguous 3:1 mapping, with
+ * the peptide ranges taken unchanged from the dna mapping
+ */
+ List<int[]> cdsRanges = new ArrayList<int[]>();
+ cdsRanges.add(new int[] { 1, cdsSeq.getLength() });
+ MapList cdsToPeptide = new MapList(cdsRanges, dnaMapping.getMap()
+ .getToRanges(), 3, 1);
+ newMappings.addMap(cdsSeq.getDatasetSequence(), dnaMapping.getTo(),
+ cdsToPeptide);
+
+ /*
+ * dna 'from' ranges map 1:1 to the contiguous extracted CDS
+ */
+ MapList dnaToCds = new MapList(
+ dnaMapping.getMap().getFromRanges(), cdsRanges, 1, 1);
+ newMappings.addMap(dnaSeq, cdsSeq.getDatasetSequence(), dnaToCds);
+ return dnaToCds;
+ }
+
+ /**
+ * Makes and returns a CDS-only sequence, where the CDS regions are identified
+ * as the 'from' ranges of the mapping on the dna. Any sequence features on
+ * the dna which overlap the CDS regions are copied to the new sequence.
+ *
+ * @param dnaSeq
+ * nucleotide sequence
+ * @param seqMapping
+ * mappings from CDS regions of nucleotide
+ * @param exonMappings
+ * CDS-to-peptide mapping (to add to)
+ * @return
+ */
+ protected static SequenceI makeCdsSequence(SequenceI dnaSeq,
+ Mapping seqMapping, AlignedCodonFrame exonMappings)
+ {
+ StringBuilder newSequence = new StringBuilder(dnaSeq.getLength());
+ final char[] dna = dnaSeq.getSequence();
+ int offset = dnaSeq.getStart() - 1;
+
+ /*
+ * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc }
+ */
+ final List<int[]> dnaCdsRanges = seqMapping.getMap().getFromRanges();
+ for (int[] range : dnaCdsRanges)
+ {
+ // TODO handle reverse mapping as well (range[1] < range[0])
+ for (int pos = range[0]; pos <= range[1]; pos++)
{
- for (DBRefEntry cdsRef : cdsRefs)
- {
- exon.addDBRef(new DBRefEntry(cdsRef));
- cdsAccId = cdsRef.getAccessionId();
- }
+ newSequence.append(dna[pos - offset - 1]);
}
- exon.setName(exon.getName() + "|" + cdsAccId);
- exon.createDatasetSequence();
+ }
- /*
- * Build new mappings - from the same protein regions, but now to
- * contiguous exons
- */
- List<int[]> exonRange = new ArrayList<int[]>();
- exonRange.add(new int[] { 1, newSequence.length() });
- MapList map = new MapList(exonRange, seqMapping.getMap()
- .getToRanges(), 3, 1);
- newMapping.addMap(exon.getDatasetSequence(), seqMapping.getTo(), map);
- MapList cdsToDnaMap = new MapList(dnaExonRanges, exonRange, 1, 1);
- newMapping.addMap(dnaSeq, exon.getDatasetSequence(), cdsToDnaMap);
-
- exonSequences.add(exon);
+ SequenceI cds = new Sequence(dnaSeq.getName(),
+ newSequence.toString());
+
+ transferDbRefs(seqMapping.getTo(), cds);
+ return cds;
+ }
+
+ /**
+ * Locate any xrefs to CDS databases on the protein product and attach to the
+ * CDS sequence. Also add as a sub-token of the sequence name.
+ *
+ * @param from
+ * @param to
+ */
+ protected static void transferDbRefs(SequenceI from, SequenceI to)
+ {
+ String cdsAccId = FeatureProperties.getCodingFeature(DBRefSource.EMBL);
+ DBRefEntry[] cdsRefs = DBRefUtils.selectRefs(from.getDBRef(),
+ DBRefSource.CODINGDBS);
+ if (cdsRefs != null)
+ {
+ for (DBRefEntry cdsRef : cdsRefs)
+ {
+ to.addDBRef(new DBRefEntry(cdsRef));
+ cdsAccId = cdsRef.getAccessionId();
+ }
+ }
+ if (!to.getName().contains(cdsAccId))
+ {
+ to.setName(to.getName() + "|" + cdsAccId);
}
- return exonSequences;
}
}