[] codonVariants)
{
String residue = String.valueOf(peptide.getCharAt(peptidePos - 1));
int count = 0;
String base1 = codonVariants[0].get(0).base;
String base2 = codonVariants[1].get(0).base;
String base3 = codonVariants[2].get(0).base;
/*
* variants in first codon base
*/
for (DnaVariant var : codonVariants[0])
{
if (var.variant != null)
{
String alleles = (String) var.variant.getValue(Gff3Helper.ALLELES);
if (alleles != null)
{
for (String base : alleles.split(","))
{
if (!base1.equalsIgnoreCase(base))
{
String codon = base.toUpperCase() + base2.toLowerCase()
+ base3.toLowerCase();
String canonical = base1.toUpperCase() + base2.toLowerCase()
+ base3.toLowerCase();
if (addPeptideVariant(peptide, peptidePos, residue, var,
codon, canonical))
{
count++;
}
}
}
}
}
}
/*
* variants in second codon base
*/
for (DnaVariant var : codonVariants[1])
{
if (var.variant != null)
{
String alleles = (String) var.variant.getValue(Gff3Helper.ALLELES);
if (alleles != null)
{
for (String base : alleles.split(","))
{
if (!base2.equalsIgnoreCase(base))
{
String codon = base1.toLowerCase() + base.toUpperCase()
+ base3.toLowerCase();
String canonical = base1.toLowerCase() + base2.toUpperCase()
+ base3.toLowerCase();
if (addPeptideVariant(peptide, peptidePos, residue, var,
codon, canonical))
{
count++;
}
}
}
}
}
}
/*
* variants in third codon base
*/
for (DnaVariant var : codonVariants[2])
{
if (var.variant != null)
{
String alleles = (String) var.variant.getValue(Gff3Helper.ALLELES);
if (alleles != null)
{
for (String base : alleles.split(","))
{
if (!base3.equalsIgnoreCase(base))
{
String codon = base1.toLowerCase() + base2.toLowerCase()
+ base.toUpperCase();
String canonical = base1.toLowerCase() + base2.toLowerCase()
+ base3.toUpperCase();
if (addPeptideVariant(peptide, peptidePos, residue, var,
codon, canonical))
{
count++;
}
}
}
}
}
}
return count;
}
/**
* Helper method that adds a peptide variant feature. ID and
* clinical_significance attributes of the dna variant (if present) are copied
* to the new feature.
*
* @param peptide
* @param peptidePos
* @param residue
* @param var
* @param codon
* the variant codon e.g. aCg
* @param canonical
* the 'normal' codon e.g. aTg
* @return true if a feature was added, else false
*/
static boolean addPeptideVariant(SequenceI peptide, int peptidePos,
String residue, DnaVariant var, String codon, String canonical)
{
/*
* get peptide translation of codon e.g. GAT -> D
* note that variants which are not single alleles,
* e.g. multibase variants or HGMD_MUTATION etc
* are currently ignored here
*/
String trans = codon.contains("-") ? null
: (codon.length() > CODON_LENGTH ? null
: ResidueProperties.codonTranslate(codon));
if (trans == null)
{
return false;
}
String desc = canonical + "/" + codon;
String featureType = "";
if (trans.equals(residue))
{
featureType = SequenceOntologyI.SYNONYMOUS_VARIANT;
}
else if (ResidueProperties.STOP.equals(trans))
{
featureType = SequenceOntologyI.STOP_GAINED;
}
else
{
String residue3Char = StringUtils
.toSentenceCase(ResidueProperties.aa2Triplet.get(residue));
String trans3Char = StringUtils
.toSentenceCase(ResidueProperties.aa2Triplet.get(trans));
desc = "p." + residue3Char + peptidePos + trans3Char;
featureType = SequenceOntologyI.NONSYNONYMOUS_VARIANT;
}
SequenceFeature sf = new SequenceFeature(featureType, desc, peptidePos,
peptidePos, var.getSource());
StringBuilder attributes = new StringBuilder(32);
String id = (String) var.variant.getValue(VARIANT_ID);
if (id != null)
{
if (id.startsWith(SEQUENCE_VARIANT))
{
id = id.substring(SEQUENCE_VARIANT.length());
}
sf.setValue(VARIANT_ID, id);
attributes.append(VARIANT_ID).append("=").append(id);
// TODO handle other species variants JAL-2064
StringBuilder link = new StringBuilder(32);
try
{
link.append(desc).append(" ").append(id).append(
"|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=")
.append(URLEncoder.encode(id, "UTF-8"));
sf.addLink(link.toString());
} catch (UnsupportedEncodingException e)
{
// as if
}
}
String clinSig = (String) var.variant.getValue(CLINICAL_SIGNIFICANCE);
if (clinSig != null)
{
sf.setValue(CLINICAL_SIGNIFICANCE, clinSig);
attributes.append(";").append(CLINICAL_SIGNIFICANCE).append("=")
.append(clinSig);
}
peptide.addSequenceFeature(sf);
if (attributes.length() > 0)
{
sf.setAttributes(attributes.toString());
}
return true;
}
/**
* Builds a map whose key is position in the protein sequence, and value is a
* list of the base and all variants for each corresponding codon position.
*
* This depends on dna variants being held as a comma-separated list as
* property "alleles" on variant features.
*
* @param dnaSeq
* @param dnaToProtein
* @return
*/
@SuppressWarnings("unchecked")
static LinkedHashMap[]> buildDnaVariantsMap(
SequenceI dnaSeq, MapList dnaToProtein)
{
/*
* map from peptide position to all variants of the codon which codes for it
* LinkedHashMap ensures we keep the peptide features in sequence order
*/
LinkedHashMap[]> variants = new LinkedHashMap<>();
List dnaFeatures = dnaSeq.getFeatures()
.getFeaturesByOntology(SequenceOntologyI.SEQUENCE_VARIANT);
if (dnaFeatures.isEmpty())
{
return variants;
}
int dnaStart = dnaSeq.getStart();
int[] lastCodon = null;
int lastPeptidePostion = 0;
/*
* build a map of codon variations for peptides
*/
for (SequenceFeature sf : dnaFeatures)
{
int dnaCol = sf.getBegin();
if (dnaCol != sf.getEnd())
{
// not handling multi-locus variant features
continue;
}
/*
* ignore variant if not a SNP
*/
String alls = (String) sf.getValue(Gff3Helper.ALLELES);
if (alls == null)
{
continue; // non-SNP VCF variant perhaps - can't process this
}
String[] alleles = alls.toUpperCase().split(",");
boolean isSnp = true;
for (String allele : alleles)
{
if (allele.trim().length() > 1)
{
isSnp = false;
}
}
if (!isSnp)
{
continue;
}
int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol);
if (mapsTo == null)
{
// feature doesn't lie within coding region
continue;
}
int peptidePosition = mapsTo[0];
List[] codonVariants = variants.get(peptidePosition);
if (codonVariants == null)
{
codonVariants = new ArrayList[CODON_LENGTH];
codonVariants[0] = new ArrayList<>();
codonVariants[1] = new ArrayList<>();
codonVariants[2] = new ArrayList<>();
variants.put(peptidePosition, codonVariants);
}
/*
* get this peptide's codon positions e.g. [3, 4, 5] or [4, 7, 10]
*/
int[] codon = peptidePosition == lastPeptidePostion ? lastCodon
: MappingUtils.flattenRanges(dnaToProtein.locateInFrom(
peptidePosition, peptidePosition));
lastPeptidePostion = peptidePosition;
lastCodon = codon;
/*
* save nucleotide (and any variant) for each codon position
*/
for (int codonPos = 0; codonPos < CODON_LENGTH; codonPos++)
{
String nucleotide = String.valueOf(
dnaSeq.getCharAt(codon[codonPos] - dnaStart)).toUpperCase();
List codonVariant = codonVariants[codonPos];
if (codon[codonPos] == dnaCol)
{
if (!codonVariant.isEmpty()
&& codonVariant.get(0).variant == null)
{
/*
* already recorded base value, add this variant
*/
codonVariant.get(0).variant = sf;
}
else
{
/*
* add variant with base value
*/
codonVariant.add(new DnaVariant(nucleotide, sf));
}
}
else if (codonVariant.isEmpty())
{
/*
* record (possibly non-varying) base value
*/
codonVariant.add(new DnaVariant(nucleotide));
}
}
}
return variants;
}
/**
* Makes an alignment with a copy of the given sequences, adding in any
* non-redundant sequences which are mapped to by the cross-referenced
* sequences.
*
* @param seqs
* @param xrefs
* @param dataset
* the alignment dataset shared by the new copy
* @return
*/
public static AlignmentI makeCopyAlignment(SequenceI[] seqs,
SequenceI[] xrefs, AlignmentI dataset)
{
AlignmentI copy = new Alignment(new Alignment(seqs));
copy.setDataset(dataset);
boolean isProtein = !copy.isNucleotide();
SequenceIdMatcher matcher = new SequenceIdMatcher(seqs);
if (xrefs != null)
{
for (SequenceI xref : xrefs)
{
DBRefEntry[] dbrefs = xref.getDBRefs();
if (dbrefs != null)
{
for (DBRefEntry dbref : dbrefs)
{
if (dbref.getMap() == null || dbref.getMap().getTo() == null
|| dbref.getMap().getTo().isProtein() != isProtein)
{
continue;
}
SequenceI mappedTo = dbref.getMap().getTo();
SequenceI match = matcher.findIdMatch(mappedTo);
if (match == null)
{
matcher.add(mappedTo);
copy.addSequence(mappedTo);
}
}
}
}
}
return copy;
}
/**
* Try to align sequences in 'unaligned' to match the alignment of their
* mapped regions in 'aligned'. For example, could use this to align CDS
* sequences which are mapped to their parent cDNA sequences.
*
* This method handles 1:1 mappings (dna-to-dna or protein-to-protein). For
* dna-to-protein or protein-to-dna use alternative methods.
*
* @param unaligned
* sequences to be aligned
* @param aligned
* holds aligned sequences and their mappings
* @return
*/
public static int alignAs(AlignmentI unaligned, AlignmentI aligned)
{
/*
* easy case - aligning a copy of aligned sequences
*/
if (alignAsSameSequences(unaligned, aligned))
{
return unaligned.getHeight();
}
/*
* fancy case - aligning via mappings between sequences
*/
List unmapped = new ArrayList<>();
Map> columnMap = buildMappedColumnsMap(
unaligned, aligned, unmapped);
int width = columnMap.size();
char gap = unaligned.getGapCharacter();
int realignedCount = 0;
// TODO: verify this loop scales sensibly for very wide/high alignments
for (SequenceI seq : unaligned.getSequences())
{
if (!unmapped.contains(seq))
{
char[] newSeq = new char[width];
Arrays.fill(newSeq, gap); // JBPComment - doubt this is faster than the
// Integer iteration below
int newCol = 0;
int lastCol = 0;
/*
* traverse the map to find columns populated
* by our sequence
*/
for (Integer column : columnMap.keySet())
{
Character c = columnMap.get(column).get(seq);
if (c != null)
{
/*
* sequence has a character at this position
*
*/
newSeq[newCol] = c;
lastCol = newCol;
}
newCol++;
}
/*
* trim trailing gaps
*/
if (lastCol < width)
{
char[] tmp = new char[lastCol + 1];
System.arraycopy(newSeq, 0, tmp, 0, lastCol + 1);
newSeq = tmp;
}
// TODO: optimise SequenceI to avoid char[]->String->char[]
seq.setSequence(String.valueOf(newSeq));
realignedCount++;
}
}
return realignedCount;
}
/**
* If unaligned and aligned sequences share the same dataset sequences, then
* simply copies the aligned sequences to the unaligned sequences and returns
* true; else returns false
*
* @param unaligned
* - sequences to be aligned based on aligned
* @param aligned
* - 'guide' alignment containing sequences derived from same dataset
* as unaligned
* @return
*/
static boolean alignAsSameSequences(AlignmentI unaligned,
AlignmentI aligned)
{
if (aligned.getDataset() == null || unaligned.getDataset() == null)
{
return false; // should only pass alignments with datasets here
}
// map from dataset sequence to alignment sequence(s)
Map> alignedDatasets = new HashMap<>();
for (SequenceI seq : aligned.getSequences())
{
SequenceI ds = seq.getDatasetSequence();
if (alignedDatasets.get(ds) == null)
{
alignedDatasets.put(ds, new ArrayList());
}
alignedDatasets.get(ds).add(seq);
}
/*
* first pass - check whether all sequences to be aligned share a dataset
* sequence with an aligned sequence
*/
for (SequenceI seq : unaligned.getSequences())
{
if (!alignedDatasets.containsKey(seq.getDatasetSequence()))
{
return false;
}
}
/*
* second pass - copy aligned sequences;
* heuristic rule: pair off sequences in order for the case where
* more than one shares the same dataset sequence
*/
for (SequenceI seq : unaligned.getSequences())
{
List alignedSequences = alignedDatasets
.get(seq.getDatasetSequence());
// TODO: getSequenceAsString() will be deprecated in the future
// TODO: need to leave to SequenceI implementor to update gaps
seq.setSequence(alignedSequences.get(0).getSequenceAsString());
if (alignedSequences.size() > 0)
{
// pop off aligned sequences (except the last one)
alignedSequences.remove(0);
}
}
return true;
}
/**
* Returns a map whose key is alignment column number (base 1), and whose
* values are a map of sequence characters in that column.
*
* @param unaligned
* @param aligned
* @param unmapped
* @return
*/
static SortedMap> buildMappedColumnsMap(
AlignmentI unaligned, AlignmentI aligned,
List unmapped)
{
/*
* Map will hold, for each aligned column position, a map of
* {unalignedSequence, characterPerSequence} at that position.
* TreeMap keeps the entries in ascending column order.
*/
SortedMap> map = new TreeMap<>();
/*
* record any sequences that have no mapping so can't be realigned
*/
unmapped.addAll(unaligned.getSequences());
List mappings = aligned.getCodonFrames();
for (SequenceI seq : unaligned.getSequences())
{
for (AlignedCodonFrame mapping : mappings)
{
SequenceI fromSeq = mapping.findAlignedSequence(seq, aligned);
if (fromSeq != null)
{
Mapping seqMap = mapping.getMappingBetween(fromSeq, seq);
if (addMappedPositions(seq, fromSeq, seqMap, map))
{
unmapped.remove(seq);
}
}
}
}
return map;
}
/**
* Helper method that adds to a map the mapped column positions of a sequence.
*
* For example if aaTT-Tg-gAAA is mapped to TTTAAA then the map should record
* that columns 3,4,6,10,11,12 map to characters T,T,T,A,A,A of the mapped to
* sequence.
*
* @param seq
* the sequence whose column positions we are recording
* @param fromSeq
* a sequence that is mapped to the first sequence
* @param seqMap
* the mapping from 'fromSeq' to 'seq'
* @param map
* a map to add the column positions (in fromSeq) of the mapped
* positions of seq
* @return
*/
static boolean addMappedPositions(SequenceI seq, SequenceI fromSeq,
Mapping seqMap, Map> map)
{
if (seqMap == null)
{
return false;
}
/*
* invert mapping if it is from unaligned to aligned sequence
*/
if (seqMap.getTo() == fromSeq.getDatasetSequence())
{
seqMap = new Mapping(seq.getDatasetSequence(),
seqMap.getMap().getInverse());
}
int toStart = seq.getStart();
/*
* traverse [start, end, start, end...] ranges in fromSeq
*/
for (int[] fromRange : seqMap.getMap().getFromRanges())
{
for (int i = 0; i < fromRange.length - 1; i += 2)
{
boolean forward = fromRange[i + 1] >= fromRange[i];
/*
* find the range mapped to (sequence positions base 1)
*/
int[] range = seqMap.locateMappedRange(fromRange[i],
fromRange[i + 1]);
if (range == null)
{
System.err.println("Error in mapping " + seqMap + " from "
+ fromSeq.getName());
return false;
}
int fromCol = fromSeq.findIndex(fromRange[i]);
int mappedCharPos = range[0];
/*
* walk over the 'from' aligned sequence in forward or reverse
* direction; when a non-gap is found, record the column position
* of the next character of the mapped-to sequence; stop when all
* the characters of the range have been counted
*/
while (mappedCharPos <= range[1] && fromCol <= fromSeq.getLength()
&& fromCol >= 0)
{
if (!Comparison.isGap(fromSeq.getCharAt(fromCol - 1)))
{
/*
* mapped from sequence has a character in this column
* record the column position for the mapped to character
*/
Map seqsMap = map.get(fromCol);
if (seqsMap == null)
{
seqsMap = new HashMap<>();
map.put(fromCol, seqsMap);
}
seqsMap.put(seq, seq.getCharAt(mappedCharPos - toStart));
mappedCharPos++;
}
fromCol += (forward ? 1 : -1);
}
}
}
return true;
}
// strictly temporary hack until proper criteria for aligning protein to cds
// are in place; this is so Ensembl -> fetch xrefs Uniprot aligns the Uniprot
public static boolean looksLikeEnsembl(AlignmentI alignment)
{
for (SequenceI seq : alignment.getSequences())
{
String name = seq.getName();
if (!name.startsWith("ENSG") && !name.startsWith("ENST"))
{
return false;
}
}
return true;
}
}