maps = mapping.getMappingsForSequence(seq);
for (Mapping map : maps)
{
/*
* Get the codon regions as { [2, 5], [7, 12], [14, 14] etc }
* Find and add the overall aligned column range for each
*/
for (int[] cdsRange : map.getMap().getFromRanges())
{
int startPos = cdsRange[0];
int endPos = cdsRange[1];
int startCol = seq.findIndex(startPos);
int endCol = seq.findIndex(endPos);
columns.add(new int[] { startCol, endCol });
}
}
}
result.add(columns);
}
return result;
}
/**
* Helper method to make cds-only sequences and populate their mappings to
* protein products
*
* For example, if ggCCaTTcGAg has mappings [3, 4, 6, 7, 9, 10] to protein
* then generate a sequence CCTTGA with mapping [1, 6] to the same protein
* residues
*
* Typically eukaryotic dna will include cds encoding for a single peptide
* sequence i.e. return a single result. Bacterial dna may have overlapping
* cds mappings coding for multiple peptides so return multiple results
* (example EMBL KF591215).
*
* @param dnaSeq
* a dna aligned sequence
* @param mapping
* containing one or more mappings of the sequence to protein
* @param ungappedCdsColumns
* @param newMappings
* the new mapping to populate, from the cds-only sequences to their
* mapped protein sequences
* @return
*/
protected static List makeCdsSequences(SequenceI dnaSeq,
AlignedCodonFrame mapping, List ungappedCdsColumns,
AlignedCodonFrame newMappings, char gapChar)
{
List cdsSequences = new ArrayList();
List seqMappings = mapping.getMappingsForSequence(dnaSeq);
for (Mapping seqMapping : seqMappings)
{
SequenceI cds = makeCdsSequence(dnaSeq, seqMapping,
ungappedCdsColumns, gapChar);
cds.createDatasetSequence();
cdsSequences.add(cds);
/*
* add new mappings, from dna to cds, and from cds to peptide
*/
MapList dnaToCds = addCdsMappings(dnaSeq.getDatasetSequence(), cds,
seqMapping, newMappings);
/*
* transfer any features on dna that overlap the CDS
*/
transferFeatures(dnaSeq, cds, dnaToCds, null, SequenceOntologyI.CDS);
}
return cdsSequences;
}
/**
* Transfers co-located features on 'fromSeq' to 'toSeq', adjusting the
* feature start/end ranges, optionally omitting specified feature types.
* Returns the number of features copied.
*
* @param fromSeq
* @param toSeq
* @param select
* if not null, only features of this type are copied (including
* subtypes in the Sequence Ontology)
* @param mapping
* the mapping from 'fromSeq' to 'toSeq'
* @param omitting
*/
public static int transferFeatures(SequenceI fromSeq, SequenceI toSeq,
MapList mapping, String select, String... omitting)
{
SequenceI copyTo = toSeq;
while (copyTo.getDatasetSequence() != null)
{
copyTo = copyTo.getDatasetSequence();
}
SequenceOntologyI so = SequenceOntologyFactory.getInstance();
int count = 0;
SequenceFeature[] sfs = fromSeq.getSequenceFeatures();
if (sfs != null)
{
for (SequenceFeature sf : sfs)
{
String type = sf.getType();
if (select != null && !so.isA(type, select))
{
continue;
}
boolean omit = false;
for (String toOmit : omitting)
{
if (type.equals(toOmit))
{
omit = true;
}
}
if (omit)
{
continue;
}
/*
* locate the mapped range - null if either start or end is
* not mapped (no partial overlaps are calculated)
*/
int start = sf.getBegin();
int end = sf.getEnd();
int[] mappedTo = mapping.locateInTo(start, end);
/*
* if whole exon range doesn't map, try interpreting it
* as 5' or 3' exon overlapping the CDS range
*/
if (mappedTo == null)
{
mappedTo = mapping.locateInTo(end, end);
if (mappedTo != null)
{
/*
* end of exon is in CDS range - 5' overlap
* to a range from the start of the peptide
*/
mappedTo[0] = 1;
}
}
if (mappedTo == null)
{
mappedTo = mapping.locateInTo(start, start);
if (mappedTo != null)
{
/*
* start of exon is in CDS range - 3' overlap
* to a range up to the end of the peptide
*/
mappedTo[1] = toSeq.getLength();
}
}
if (mappedTo != null)
{
SequenceFeature copy = new SequenceFeature(sf);
copy.setBegin(Math.min(mappedTo[0], mappedTo[1]));
copy.setEnd(Math.max(mappedTo[0], mappedTo[1]));
copyTo.addSequenceFeature(copy);
count++;
}
}
}
return count;
}
/**
* Creates and adds mappings
*
* - from cds to peptide
* - from dna to cds
*
* and returns the dna-to-cds mapping
*
* @param dnaSeq
* @param cdsSeq
* @param dnaMapping
* @param newMappings
* @return
*/
protected static MapList addCdsMappings(SequenceI dnaSeq,
SequenceI cdsSeq, Mapping dnaMapping,
AlignedCodonFrame newMappings)
{
cdsSeq.createDatasetSequence();
/*
* CDS to peptide is just a contiguous 3:1 mapping, with
* the peptide ranges taken unchanged from the dna mapping
*/
List cdsRanges = new ArrayList();
SequenceI cdsDataset = cdsSeq.getDatasetSequence();
cdsRanges.add(new int[] { 1, cdsDataset.getLength() });
MapList cdsToPeptide = new MapList(cdsRanges, dnaMapping.getMap()
.getToRanges(), 3, 1);
newMappings.addMap(cdsDataset, dnaMapping.getTo(), cdsToPeptide);
/*
* dna 'from' ranges map 1:1 to the contiguous extracted CDS
*/
MapList dnaToCds = new MapList(dnaMapping.getMap().getFromRanges(),
cdsRanges, 1, 1);
newMappings.addMap(dnaSeq, cdsDataset, dnaToCds);
return dnaToCds;
}
/**
* Makes and returns a CDS-only sequence, where the CDS regions are identified
* as the 'from' ranges of the mapping on the dna.
*
* @param dnaSeq
* nucleotide sequence
* @param seqMapping
* mappings from CDS regions of nucleotide
* @param ungappedCdsColumns
* @return
*/
protected static SequenceI makeCdsSequence(SequenceI dnaSeq,
Mapping seqMapping, List ungappedCdsColumns, char gapChar)
{
int cdsWidth = MappingUtils.getLength(ungappedCdsColumns);
/*
* populate CDS columns with the aligned
* column character if that column is mapped (which may be a gap
* if an intron interrupts a codon), else with a gap
*/
List fromRanges = seqMapping.getMap().getFromRanges();
char[] cdsChars = new char[cdsWidth];
int pos = 0;
for (int[] columns : ungappedCdsColumns)
{
for (int i = columns[0]; i <= columns[1]; i++)
{
char dnaChar = dnaSeq.getCharAt(i - 1);
if (Comparison.isGap(dnaChar))
{
cdsChars[pos] = gapChar;
}
else
{
int seqPos = dnaSeq.findPosition(i - 1);
if (MappingUtils.contains(fromRanges, seqPos))
{
cdsChars[pos] = dnaChar;
}
else
{
cdsChars[pos] = gapChar;
}
}
pos++;
}
}
SequenceI cdsSequence = new Sequence(dnaSeq.getName(),
String.valueOf(cdsChars));
transferDbRefs(seqMapping.getTo(), cdsSequence);
return cdsSequence;
}
/**
* Locate any xrefs to CDS databases on the protein product and attach to the
* CDS sequence. Also add as a sub-token of the sequence name.
*
* @param from
* @param to
*/
protected static void transferDbRefs(SequenceI from, SequenceI to)
{
String cdsAccId = FeatureProperties.getCodingFeature(DBRefSource.EMBL);
DBRefEntry[] cdsRefs = DBRefUtils.selectRefs(from.getDBRefs(),
DBRefSource.CODINGDBS);
if (cdsRefs != null)
{
for (DBRefEntry cdsRef : cdsRefs)
{
to.addDBRef(new DBRefEntry(cdsRef));
cdsAccId = cdsRef.getAccessionId();
}
}
if (!to.getName().contains(cdsAccId))
{
to.setName(to.getName() + "|" + cdsAccId);
}
}
/**
* Returns a mapping from dna to protein by inspecting sequence features of
* type "CDS" on the dna.
*
* @param dnaSeq
* @param proteinSeq
* @return
*/
public static MapList mapCdsToProtein(SequenceI dnaSeq,
SequenceI proteinSeq)
{
List ranges = findCdsPositions(dnaSeq);
int mappedDnaLength = MappingUtils.getLength(ranges);
int proteinLength = proteinSeq.getLength();
int proteinStart = proteinSeq.getStart();
int proteinEnd = proteinSeq.getEnd();
/*
* incomplete start codon may mean X at start of peptide
* we ignore both for mapping purposes
*/
if (proteinSeq.getCharAt(0) == 'X')
{
// todo JAL-2022 support startPhase > 0
proteinStart++;
proteinLength--;
}
List proteinRange = new ArrayList();
/*
* dna length should map to protein (or protein plus stop codon)
*/
int codesForResidues = mappedDnaLength / 3;
if (codesForResidues == (proteinLength + 1))
{
// assuming extra codon is for STOP and not in peptide
codesForResidues--;
}
if (codesForResidues == proteinLength)
{
proteinRange.add(new int[] { proteinStart, proteinEnd });
return new MapList(ranges, proteinRange, 3, 1);
}
return null;
}
/**
* Returns a list of CDS ranges found (as sequence positions base 1), i.e. of
* start/end positions of sequence features of type "CDS" (or a sub-type of
* CDS in the Sequence Ontology)
*
* @param dnaSeq
* @return
*/
public static List findCdsPositions(SequenceI dnaSeq)
{
List result = new ArrayList();
SequenceFeature[] sfs = dnaSeq.getSequenceFeatures();
if (sfs == null)
{
return result;
}
SequenceOntologyI so = SequenceOntologyFactory.getInstance();
for (SequenceFeature sf : sfs)
{
/*
* process a CDS feature (or a sub-type of CDS)
*/
if (so.isA(sf.getType(), SequenceOntologyI.CDS))
{
int phase = 0;
try {
phase = Integer.parseInt(sf.getPhase());
} catch (NumberFormatException e)
{
// ignore
}
/*
* phase > 0 on first codon means 5' incomplete - skip to the start
* of the next codon; example ENST00000496384
*/
int begin = sf.getBegin();
int end = sf.getEnd();
if (result.isEmpty())
{
// TODO JAL-2022 support start phase > 0
begin += phase;
if (begin > end)
{
continue; // shouldn't happen?
}
}
result.add(new int[] { begin, end });
}
}
return result;
}
/**
* Maps exon features from dna to protein, and computes variants in peptide
* product generated by variants in dna, and adds them as sequence_variant
* features on the protein sequence. Returns the number of variant features
* added.
*
* @param dnaSeq
* @param peptide
* @param dnaToProtein
*/
public static int computeProteinFeatures(SequenceI dnaSeq,
SequenceI peptide, MapList dnaToProtein)
{
while (dnaSeq.getDatasetSequence() != null)
{
dnaSeq = dnaSeq.getDatasetSequence();
}
while (peptide.getDatasetSequence() != null)
{
peptide = peptide.getDatasetSequence();
}
transferFeatures(dnaSeq, peptide, dnaToProtein,
SequenceOntologyI.EXON);
LinkedHashMap variants = buildDnaVariantsMap(
dnaSeq, dnaToProtein);
/*
* scan codon variations, compute peptide variants and add to peptide sequence
*/
int count = 0;
for (Entry variant : variants.entrySet())
{
int peptidePos = variant.getKey();
String[][] codonVariants = variant.getValue();
String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based
List peptideVariants = computePeptideVariants(codonVariants,
residue);
if (!peptideVariants.isEmpty())
{
String desc = StringUtils.listToDelimitedString(peptideVariants,
", ");
SequenceFeature sf = new SequenceFeature(
SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos,
peptidePos, 0f, null);
peptide.addSequenceFeature(sf);
count++;
}
}
/*
* ugly sort to get sequence features in start position order
* - would be better to store in Sequence as a TreeSet instead?
*/
Arrays.sort(peptide.getSequenceFeatures(),
new Comparator()
{
@Override
public int compare(SequenceFeature o1, SequenceFeature o2)
{
int c = Integer.compare(o1.getBegin(), o2.getBegin());
return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd())
: c;
}
});
return count;
}
/**
* Builds a map whose key is position in the protein sequence, and value is an
* array of all variants for the coding codon positions
*
* @param dnaSeq
* @param dnaToProtein
* @return
*/
static LinkedHashMap buildDnaVariantsMap(
SequenceI dnaSeq, MapList dnaToProtein)
{
/*
* map from peptide position to all variant features of the codon for it
* LinkedHashMap ensures we add the peptide features in sequence order
*/
LinkedHashMap variants = new LinkedHashMap();
SequenceOntologyI so = SequenceOntologyFactory.getInstance();
SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures();
if (dnaFeatures == null)
{
return variants;
}
int dnaStart = dnaSeq.getStart();
int[] lastCodon = null;
int lastPeptidePostion = 0;
/*
* build a map of codon variations for peptides
*/
for (SequenceFeature sf : dnaFeatures)
{
int dnaCol = sf.getBegin();
if (dnaCol != sf.getEnd())
{
// not handling multi-locus variant features
continue;
}
if (so.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT))
{
int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
if (mapsTo == null)
{
// feature doesn't lie within coding region
continue;
}
int peptidePosition = mapsTo[0];
String[][] codonVariants = variants.get(peptidePosition);
if (codonVariants == null)
{
codonVariants = new String[3][];
variants.put(peptidePosition, codonVariants);
}
/*
* extract dna variants to a string array
*/
String alls = (String) sf.getValue("alleles");
if (alls == null)
{
continue;
}
String[] alleles = alls.toUpperCase().split(",");
int i = 0;
for (String allele : alleles)
{
alleles[i++] = allele.trim(); // lose any space characters "A, G"
}
/*
* get this peptide's codon positions e.g. [3, 4, 5] or [4, 7, 10]
*/
int[] codon = peptidePosition == lastPeptidePostion ? lastCodon
: MappingUtils.flattenRanges(dnaToProtein.locateInFrom(
peptidePosition, peptidePosition));
lastPeptidePostion = peptidePosition;
lastCodon = codon;
/*
* save nucleotide (and this variant) for each codon position
*/
for (int codonPos = 0; codonPos < 3; codonPos++)
{
String nucleotide = String.valueOf(
dnaSeq.getCharAt(codon[codonPos] - dnaStart))
.toUpperCase();
if (codonVariants[codonPos] == null)
{
/*
* record current dna base
*/
codonVariants[codonPos] = new String[] { nucleotide };
}
if (codon[codonPos] == dnaCol)
{
/*
* add alleles to dna base (and any previously found alleles)
*/
String[] known = codonVariants[codonPos];
String[] dnaVariants = new String[alleles.length + known.length];
System.arraycopy(known, 0, dnaVariants, 0, known.length);
System.arraycopy(alleles, 0, dnaVariants, known.length,
alleles.length);
codonVariants[codonPos] = dnaVariants;
}
}
}
}
return variants;
}
/**
* Returns a sorted, non-redundant list of all peptide translations generated
* by the given dna variants, excluding the current residue value
*
* @param codonVariants
* an array of base values (acgtACGT) for codon positions 1, 2, 3
* @param residue
* the current residue translation
* @return
*/
static List computePeptideVariants(
String[][] codonVariants, String residue)
{
List result = new ArrayList();
for (String base1 : codonVariants[0])
{
for (String base2 : codonVariants[1])
{
for (String base3 : codonVariants[2])
{
String codon = base1 + base2 + base3;
/*
* get peptide translation of codon e.g. GAT -> D
* note that variants which are not single alleles,
* e.g. multibase variants or HGMD_MUTATION etc
* are ignored here
*/
String peptide = codon.contains("-") ? "-"
: (codon.length() > 3 ? null : ResidueProperties
.codonTranslate(codon));
if (peptide != null && !result.contains(peptide)
&& !peptide.equalsIgnoreCase(residue))
{
result.add(peptide);
}
}
}
}
/*
* sort alphabetically with STOP at the end
*/
Collections.sort(result, new Comparator()
{
@Override
public int compare(String o1, String o2)
{
if ("STOP".equals(o1))
{
return 1;
}
else if ("STOP".equals(o2))
{
return -1;
}
else
{
return o1.compareTo(o2);
}
}
});
return result;
}
}