/* * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) * Copyright (C) $$Year-Rel$$ The Jalview Authors * * This file is part of Jalview. * * Jalview is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * Jalview is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Jalview. If not, see . * The Jalview Authors are detailed in the 'AUTHORS' file. */ package jalview.analysis; import jalview.datamodel.AlignedCodon; import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.IncompleteCodonException; import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceGroup; import jalview.datamodel.SequenceI; import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyI; import jalview.schemes.ResidueProperties; import jalview.util.Comparison; import jalview.util.MapList; import jalview.util.MappingUtils; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.NoSuchElementException; import java.util.Set; 
import java.util.TreeMap; /** * grab bag of useful alignment manipulation operations Expect these to be * refactored elsewhere at some point. * * @author jimp * */ public class AlignmentUtils { private static final String SEQUENCE_VARIANT = "sequence_variant:"; private static final String ID = "ID"; private static final String CLINICAL_SIGNIFICANCE = "clinical_significance"; /** * A data model to hold the 'normal' base value at a position, and an optional * sequence variant feature */ static class DnaVariant { String base; SequenceFeature variant; DnaVariant(String nuc) { base = nuc; } DnaVariant(String nuc, SequenceFeature var) { base = nuc; variant = var; } } /** * given an existing alignment, create a new alignment including all, or up to * flankSize additional symbols from each sequence's dataset sequence * * @param core * @param flankSize * @return AlignmentI */ public static AlignmentI expandContext(AlignmentI core, int flankSize) { List sq = new ArrayList(); int maxoffset = 0; for (SequenceI s : core.getSequences()) { SequenceI newSeq = s.deriveSequence(); final int newSeqStart = newSeq.getStart() - 1; if (newSeqStart > maxoffset && newSeq.getDatasetSequence().getStart() < s.getStart()) { maxoffset = newSeqStart; } sq.add(newSeq); } if (flankSize > -1) { maxoffset = Math.min(maxoffset, flankSize); } /* * now add offset left and right to create an expanded alignment */ for (SequenceI s : sq) { SequenceI ds = s; while (ds.getDatasetSequence() != null) { ds = ds.getDatasetSequence(); } int s_end = s.findPosition(s.getStart() + s.getLength()); // find available flanking residues for sequence int ustream_ds = s.getStart() - ds.getStart(); int dstream_ds = ds.getEnd() - s_end; // build new flanked sequence // compute gap padding to start of flanking sequence int offset = maxoffset - ustream_ds; // padding is gapChar x ( maxoffset - min(ustream_ds, flank) if (flankSize >= 0) { if (flankSize < ustream_ds) { // take up to flankSize residues offset = maxoffset - 
flankSize; ustream_ds = flankSize; } if (flankSize <= dstream_ds) { dstream_ds = flankSize - 1; } } // TODO use Character.toLowerCase to avoid creating String objects? char[] upstream = new String(ds.getSequence(s.getStart() - 1 - ustream_ds, s.getStart() - 1)).toLowerCase().toCharArray(); char[] downstream = new String(ds.getSequence(s_end - 1, s_end + dstream_ds)).toLowerCase().toCharArray(); char[] coreseq = s.getSequence(); char[] nseq = new char[offset + upstream.length + downstream.length + coreseq.length]; char c = core.getGapCharacter(); int p = 0; for (; p < offset; p++) { nseq[p] = c; } System.arraycopy(upstream, 0, nseq, p, upstream.length); System.arraycopy(coreseq, 0, nseq, p + upstream.length, coreseq.length); System.arraycopy(downstream, 0, nseq, p + coreseq.length + upstream.length, downstream.length); s.setSequence(new String(nseq)); s.setStart(s.getStart() - ustream_ds); s.setEnd(s_end + downstream.length); } AlignmentI newAl = new jalview.datamodel.Alignment( sq.toArray(new SequenceI[0])); for (SequenceI s : sq) { if (s.getAnnotation() != null) { for (AlignmentAnnotation aa : s.getAnnotation()) { aa.adjustForAlignment(); // JAL-1712 fix newAl.addAnnotation(aa); } } } newAl.setDataset(core.getDataset()); return newAl; } /** * Returns the index (zero-based position) of a sequence in an alignment, or * -1 if not found. * * @param al * @param seq * @return */ public static int getSequenceIndex(AlignmentI al, SequenceI seq) { int result = -1; int pos = 0; for (SequenceI alSeq : al.getSequences()) { if (alSeq == seq) { result = pos; break; } pos++; } return result; } /** * Returns a map of lists of sequences in the alignment, keyed by sequence * name. For use in mapping between different alignment views of the same * sequences. 
* * @see jalview.datamodel.AlignmentI#getSequencesByName() */ public static Map> getSequencesByName( AlignmentI al) { Map> theMap = new LinkedHashMap>(); for (SequenceI seq : al.getSequences()) { String name = seq.getName(); if (name != null) { List seqs = theMap.get(name); if (seqs == null) { seqs = new ArrayList(); theMap.put(name, seqs); } seqs.add(seq); } } return theMap; } /** * Build mapping of protein to cDNA alignment. Mappings are made between * sequences where the cDNA translates to the protein sequence. Any new * mappings are added to the protein alignment. Returns true if any mappings * either already exist or were added, else false. * * @param proteinAlignment * @param cdnaAlignment * @return */ public static boolean mapProteinAlignmentToCdna( final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment) { if (proteinAlignment == null || cdnaAlignment == null) { return false; } Set mappedDna = new HashSet(); Set mappedProtein = new HashSet(); /* * First pass - map sequences where cross-references exist. This include * 1-to-many mappings to support, for example, variant cDNA. */ boolean mappingPerformed = mapProteinToCdna(proteinAlignment, cdnaAlignment, mappedDna, mappedProtein, true); /* * Second pass - map sequences where no cross-references exist. This only * does 1-to-1 mappings and assumes corresponding sequences are in the same * order in the alignments. */ mappingPerformed |= mapProteinToCdna(proteinAlignment, cdnaAlignment, mappedDna, mappedProtein, false); return mappingPerformed; } /** * Make mappings between compatible sequences (where the cDNA translation * matches the protein). 
* * @param proteinAlignment * @param cdnaAlignment * @param mappedDna * a set of mapped DNA sequences (to add to) * @param mappedProtein * a set of mapped Protein sequences (to add to) * @param xrefsOnly * if true, only map sequences where xrefs exist * @return */ protected static boolean mapProteinToCdna( final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment, Set mappedDna, Set mappedProtein, boolean xrefsOnly) { boolean mappingExistsOrAdded = false; List thisSeqs = proteinAlignment.getSequences(); for (SequenceI aaSeq : thisSeqs) { boolean proteinMapped = false; AlignedCodonFrame acf = new AlignedCodonFrame(); for (SequenceI cdnaSeq : cdnaAlignment.getSequences()) { /* * Always try to map if sequences have xref to each other; this supports * variant cDNA or alternative splicing for a protein sequence. * * If no xrefs, try to map progressively, assuming that alignments have * mappable sequences in corresponding order. These are not * many-to-many, as that would risk mixing species with similar cDNA * sequences. */ if (xrefsOnly && !AlignmentUtils.haveCrossRef(aaSeq, cdnaSeq)) { continue; } /* * Don't map non-xrefd sequences more than once each. This heuristic * allows us to pair up similar sequences in ordered alignments. */ if (!xrefsOnly && (mappedProtein.contains(aaSeq) || mappedDna .contains(cdnaSeq))) { continue; } if (mappingExists(proteinAlignment.getCodonFrames(), aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence())) { mappingExistsOrAdded = true; } else { MapList map = mapCdnaToProtein(aaSeq, cdnaSeq); if (map != null) { acf.addMap(cdnaSeq, aaSeq, map); mappingExistsOrAdded = true; proteinMapped = true; mappedDna.add(cdnaSeq); mappedProtein.add(aaSeq); } } } if (proteinMapped) { proteinAlignment.addCodonFrame(acf); } } return mappingExistsOrAdded; } /** * Answers true if the mappings include one between the given (dataset) * sequences. 
*/ public static boolean mappingExists(List mappings, SequenceI aaSeq, SequenceI cdnaSeq) { if (mappings != null) { for (AlignedCodonFrame acf : mappings) { if (cdnaSeq == acf.getDnaForAaSeq(aaSeq)) { return true; } } } return false; } /** * Builds a mapping (if possible) of a cDNA to a protein sequence. *

   * <ol>
   * <li>first checks if the cdna translates exactly to the protein sequence</li>
   * <li>else checks for translation after removing a STOP codon</li>
   * <li>else checks for translation after removing a START codon</li>
   * <li>if that fails, inspect CDS features on the cDNA sequence</li>
   * </ol>

* Returns null if no mapping is determined. * * @param proteinSeq * the aligned protein sequence * @param cdnaSeq * the aligned cdna sequence * @return */ public static MapList mapCdnaToProtein(SequenceI proteinSeq, SequenceI cdnaSeq) { /* * Here we handle either dataset sequence set (desktop) or absent (applet). * Use only the char[] form of the sequence to avoid creating possibly large * String objects. */ final SequenceI proteinDataset = proteinSeq.getDatasetSequence(); char[] aaSeqChars = proteinDataset != null ? proteinDataset .getSequence() : proteinSeq.getSequence(); final SequenceI cdnaDataset = cdnaSeq.getDatasetSequence(); char[] cdnaSeqChars = cdnaDataset != null ? cdnaDataset.getSequence() : cdnaSeq.getSequence(); if (aaSeqChars == null || cdnaSeqChars == null) { return null; } /* * cdnaStart/End, proteinStartEnd are base 1 (for dataset sequence mapping) */ final int mappedLength = 3 * aaSeqChars.length; int cdnaLength = cdnaSeqChars.length; int cdnaStart = cdnaSeq.getStart(); int cdnaEnd = cdnaSeq.getEnd(); final int proteinStart = proteinSeq.getStart(); final int proteinEnd = proteinSeq.getEnd(); /* * If lengths don't match, try ignoring stop codon (if present) */ if (cdnaLength != mappedLength && cdnaLength > 2) { String lastCodon = String.valueOf(cdnaSeqChars, cdnaLength - 3, 3) .toUpperCase(); for (String stop : ResidueProperties.STOP) { if (lastCodon.equals(stop)) { cdnaEnd -= 3; cdnaLength -= 3; break; } } } /* * If lengths still don't match, try ignoring start codon. 
*/ int startOffset = 0; if (cdnaLength != mappedLength && cdnaLength > 2 && String.valueOf(cdnaSeqChars, 0, 3).toUpperCase() .equals(ResidueProperties.START)) { startOffset += 3; cdnaStart += 3; cdnaLength -= 3; } if (translatesAs(cdnaSeqChars, startOffset, aaSeqChars)) { /* * protein is translation of dna (+/- start/stop codons) */ MapList map = new MapList(new int[] { cdnaStart, cdnaEnd }, new int[] { proteinStart, proteinEnd }, 3, 1); return map; } /* * translation failed - try mapping CDS annotated regions of dna */ return mapCdsToProtein(cdnaSeq, proteinSeq); } /** * Test whether the given cdna sequence, starting at the given offset, * translates to the given amino acid sequence, using the standard translation * table. Designed to fail fast i.e. as soon as a mismatch position is found. * * @param cdnaSeqChars * @param cdnaStart * @param aaSeqChars * @return */ protected static boolean translatesAs(char[] cdnaSeqChars, int cdnaStart, char[] aaSeqChars) { if (cdnaSeqChars == null || aaSeqChars == null) { return false; } int aaPos = 0; int dnaPos = cdnaStart; for (; dnaPos < cdnaSeqChars.length - 2 && aaPos < aaSeqChars.length; dnaPos += 3, aaPos++) { String codon = String.valueOf(cdnaSeqChars, dnaPos, 3); final String translated = ResidueProperties.codonTranslate(codon); /* * allow * in protein to match untranslatable in dna */ final char aaRes = aaSeqChars[aaPos]; if ((translated == null || "STOP".equals(translated)) && aaRes == '*') { continue; } if (translated == null || !(aaRes == translated.charAt(0))) { // debug // System.out.println(("Mismatch at " + i + "/" + aaResidue + ": " // + codon + "(" + translated + ") != " + aaRes)); return false; } } /* * check we matched all of the protein sequence */ if (aaPos != aaSeqChars.length) { return false; } /* * check we matched all of the dna except * for optional trailing STOP codon */ if (dnaPos == cdnaSeqChars.length) { return true; } if (dnaPos == cdnaSeqChars.length - 3) { String codon = 
String.valueOf(cdnaSeqChars, dnaPos, 3); if ("STOP".equals(ResidueProperties.codonTranslate(codon))) { return true; } } return false; } /** * Align sequence 'seq' to match the alignment of a mapped sequence. Note this * currently assumes that we are aligning cDNA to match protein. * * @param seq * the sequence to be realigned * @param al * the alignment whose sequence alignment is to be 'copied' * @param gap * character string represent a gap in the realigned sequence * @param preserveUnmappedGaps * @param preserveMappedGaps * @return true if the sequence was realigned, false if it could not be */ public static boolean alignSequenceAs(SequenceI seq, AlignmentI al, String gap, boolean preserveMappedGaps, boolean preserveUnmappedGaps) { /* * Get any mappings from the source alignment to the target (dataset) * sequence. */ // TODO there may be one AlignedCodonFrame per dataset sequence, or one with // all mappings. Would it help to constrain this? List mappings = al.getCodonFrame(seq); if (mappings == null || mappings.isEmpty()) { return false; } /* * Locate the aligned source sequence whose dataset sequence is mapped. We * just take the first match here (as we can't align like more than one * sequence). */ SequenceI alignFrom = null; AlignedCodonFrame mapping = null; for (AlignedCodonFrame mp : mappings) { alignFrom = mp.findAlignedSequence(seq, al); if (alignFrom != null) { mapping = mp; break; } } if (alignFrom == null) { return false; } alignSequenceAs(seq, alignFrom, mapping, gap, al.getGapCharacter(), preserveMappedGaps, preserveUnmappedGaps); return true; } /** * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to * match residues and codons. Flags control whether existing gaps in unmapped * (intron) and mapped (exon) regions are preserved or not. Gaps between * intron and exon are only retained if both flags are set. 
* * @param alignTo * @param alignFrom * @param mapping * @param myGap * @param sourceGap * @param preserveUnmappedGaps * @param preserveMappedGaps */ public static void alignSequenceAs(SequenceI alignTo, SequenceI alignFrom, AlignedCodonFrame mapping, String myGap, char sourceGap, boolean preserveMappedGaps, boolean preserveUnmappedGaps) { // TODO generalise to work for Protein-Protein, dna-dna, dna-protein // aligned and dataset sequence positions, all base zero int thisSeqPos = 0; int sourceDsPos = 0; int basesWritten = 0; char myGapChar = myGap.charAt(0); int ratio = myGap.length(); int fromOffset = alignFrom.getStart() - 1; int toOffset = alignTo.getStart() - 1; int sourceGapMappedLength = 0; boolean inExon = false; final char[] thisSeq = alignTo.getSequence(); final char[] thatAligned = alignFrom.getSequence(); StringBuilder thisAligned = new StringBuilder(2 * thisSeq.length); /* * Traverse the 'model' aligned sequence */ for (char sourceChar : thatAligned) { if (sourceChar == sourceGap) { sourceGapMappedLength += ratio; continue; } /* * Found a non-gap character. Locate its mapped region if any. */ sourceDsPos++; // Note mapping positions are base 1, our sequence positions base 0 int[] mappedPos = mapping.getMappedRegion(alignTo, alignFrom, sourceDsPos + fromOffset); if (mappedPos == null) { /* * unmapped position; treat like a gap */ sourceGapMappedLength += ratio; // System.err.println("Can't align: no codon mapping to residue " // + sourceDsPos + "(" + sourceChar + ")"); // return; continue; } int mappedCodonStart = mappedPos[0]; // position (1...) of codon start int mappedCodonEnd = mappedPos[mappedPos.length - 1]; // codon end pos StringBuilder trailingCopiedGap = new StringBuilder(); /* * Copy dna sequence up to and including this codon. Optionally, include * gaps before the codon starts (in introns) and/or after the codon starts * (in exons). * * Note this only works for 'linear' splicing, not reverse or interleaved. 
* But then 'align dna as protein' doesn't make much sense otherwise. */ int intronLength = 0; while (basesWritten + toOffset < mappedCodonEnd && thisSeqPos < thisSeq.length) { final char c = thisSeq[thisSeqPos++]; if (c != myGapChar) { basesWritten++; int sourcePosition = basesWritten + toOffset; if (sourcePosition < mappedCodonStart) { /* * Found an unmapped (intron) base. First add in any preceding gaps * (if wanted). */ if (preserveUnmappedGaps && trailingCopiedGap.length() > 0) { thisAligned.append(trailingCopiedGap.toString()); intronLength += trailingCopiedGap.length(); trailingCopiedGap = new StringBuilder(); } intronLength++; inExon = false; } else { final boolean startOfCodon = sourcePosition == mappedCodonStart; int gapsToAdd = calculateGapsToInsert(preserveMappedGaps, preserveUnmappedGaps, sourceGapMappedLength, inExon, trailingCopiedGap.length(), intronLength, startOfCodon); for (int i = 0; i < gapsToAdd; i++) { thisAligned.append(myGapChar); } sourceGapMappedLength = 0; inExon = true; } thisAligned.append(c); trailingCopiedGap = new StringBuilder(); } else { if (inExon && preserveMappedGaps) { trailingCopiedGap.append(myGapChar); } else if (!inExon && preserveUnmappedGaps) { trailingCopiedGap.append(myGapChar); } } } } /* * At end of model aligned sequence. Copy any remaining target sequence, optionally * including (intron) gaps. */ while (thisSeqPos < thisSeq.length) { final char c = thisSeq[thisSeqPos++]; if (c != myGapChar || preserveUnmappedGaps) { thisAligned.append(c); } sourceGapMappedLength--; } /* * finally add gaps to pad for any trailing source gaps or * unmapped characters */ if (preserveUnmappedGaps) { while (sourceGapMappedLength > 0) { thisAligned.append(myGapChar); sourceGapMappedLength--; } } /* * All done aligning, set the aligned sequence. */ alignTo.setSequence(new String(thisAligned)); } /** * Helper method to work out how many gaps to insert when realigning. 
* * @param preserveMappedGaps * @param preserveUnmappedGaps * @param sourceGapMappedLength * @param inExon * @param trailingCopiedGap * @param intronLength * @param startOfCodon * @return */ protected static int calculateGapsToInsert(boolean preserveMappedGaps, boolean preserveUnmappedGaps, int sourceGapMappedLength, boolean inExon, int trailingGapLength, int intronLength, final boolean startOfCodon) { int gapsToAdd = 0; if (startOfCodon) { /* * Reached start of codon. Ignore trailing gaps in intron unless we are * preserving gaps in both exon and intron. Ignore them anyway if the * protein alignment introduces a gap at least as large as the intronic * region. */ if (inExon && !preserveMappedGaps) { trailingGapLength = 0; } if (!inExon && !(preserveMappedGaps && preserveUnmappedGaps)) { trailingGapLength = 0; } if (inExon) { gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength); } else { if (intronLength + trailingGapLength <= sourceGapMappedLength) { gapsToAdd = sourceGapMappedLength - intronLength; } else { gapsToAdd = Math.min(intronLength + trailingGapLength - sourceGapMappedLength, trailingGapLength); } } } else { /* * second or third base of codon; check for any gaps in dna */ if (!preserveMappedGaps) { trailingGapLength = 0; } gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength); } return gapsToAdd; } /** * Realigns the given protein to match the alignment of the dna, using codon * mappings to translate aligned codon positions to protein residues. 
* * @param protein * the alignment whose sequences are realigned by this method * @param dna * the dna alignment whose alignment we are 'copying' * @return the number of sequences that were realigned */ public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna) { List unmappedProtein = new ArrayList(); Map> alignedCodons = buildCodonColumnsMap( protein, dna, unmappedProtein); return alignProteinAs(protein, alignedCodons, unmappedProtein); } /** * Builds a map whose key is an aligned codon position (3 alignment column * numbers base 0), and whose value is a map from protein sequence to each * protein's peptide residue for that codon. The map generates an ordering of * the codons, and allows us to read off the peptides at each position in * order to assemble 'aligned' protein sequences. * * @param protein * the protein alignment * @param dna * the coding dna alignment * @param unmappedProtein * any unmapped proteins are added to this list * @return */ protected static Map> buildCodonColumnsMap( AlignmentI protein, AlignmentI dna, List unmappedProtein) { /* * maintain a list of any proteins with no mappings - these will be * rendered 'as is' in the protein alignment as we can't align them */ unmappedProtein.addAll(protein.getSequences()); List mappings = protein.getCodonFrames(); /* * Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of * {dnaSequence, {proteinSequence, codonProduct}} at that position. The * comparator keeps the codon positions ordered. */ Map> alignedCodons = new TreeMap>( new CodonComparator()); for (SequenceI dnaSeq : dna.getSequences()) { for (AlignedCodonFrame mapping : mappings) { SequenceI prot = mapping.findAlignedSequence(dnaSeq, protein); if (prot != null) { Mapping seqMap = mapping.getMappingForSequence(dnaSeq); addCodonPositions(dnaSeq, prot, protein.getGapCharacter(), seqMap, alignedCodons); unmappedProtein.remove(prot); } } } /* * Finally add any unmapped peptide start residues (e.g. 
for incomplete * codons) as if at the codon position before the second residue */ // TODO resolve JAL-2022 so this fudge can be removed int mappedSequenceCount = protein.getHeight() - unmappedProtein.size(); addUnmappedPeptideStarts(alignedCodons, mappedSequenceCount); return alignedCodons; } /** * Scans for any protein mapped from position 2 (meaning unmapped start * position e.g. an incomplete codon), and synthesizes a 'codon' for it at the * preceding position in the alignment * * @param alignedCodons * the codon-to-peptide map * @param mappedSequenceCount * the number of distinct sequences in the map */ protected static void addUnmappedPeptideStarts( Map> alignedCodons, int mappedSequenceCount) { // TODO delete this ugly hack once JAL-2022 is resolved // i.e. we can model startPhase > 0 (incomplete start codon) List sequencesChecked = new ArrayList(); AlignedCodon lastCodon = null; Map toAdd = new HashMap(); for (Entry> entry : alignedCodons .entrySet()) { for (Entry sequenceCodon : entry.getValue() .entrySet()) { SequenceI seq = sequenceCodon.getKey(); if (sequencesChecked.contains(seq)) { continue; } sequencesChecked.add(seq); AlignedCodon codon = sequenceCodon.getValue(); if (codon.peptideCol > 1) { System.err .println("Problem mapping protein with >1 unmapped start positions: " + seq.getName()); } else if (codon.peptideCol == 1) { /* * first position (peptideCol == 0) was unmapped - add it */ if (lastCodon != null) { AlignedCodon firstPeptide = new AlignedCodon(lastCodon.pos1, lastCodon.pos2, lastCodon.pos3, String.valueOf(seq .getCharAt(0)), 0); toAdd.put(seq, firstPeptide); } else { /* * unmapped residue at start of alignment (no prior column) - * 'insert' at nominal codon [0, 0, 0] */ AlignedCodon firstPeptide = new AlignedCodon(0, 0, 0, String.valueOf(seq.getCharAt(0)), 0); toAdd.put(seq, firstPeptide); } } if (sequencesChecked.size() == mappedSequenceCount) { // no need to check past first mapped position in all sequences break; } } lastCodon = 
entry.getKey(); } /* * add any new codons safely after iterating over the map */ for (Entry startCodon : toAdd.entrySet()) { addCodonToMap(alignedCodons, startCodon.getValue(), startCodon.getKey()); } } /** * Update the aligned protein sequences to match the codon alignments given in * the map. * * @param protein * @param alignedCodons * an ordered map of codon positions (columns), with sequence/peptide * values present in each column * @param unmappedProtein * @return */ protected static int alignProteinAs(AlignmentI protein, Map> alignedCodons, List unmappedProtein) { /* * Prefill aligned sequences with gaps before inserting aligned protein * residues. */ int alignedWidth = alignedCodons.size(); char[] gaps = new char[alignedWidth]; Arrays.fill(gaps, protein.getGapCharacter()); String allGaps = String.valueOf(gaps); for (SequenceI seq : protein.getSequences()) { if (!unmappedProtein.contains(seq)) { seq.setSequence(allGaps); } } int column = 0; for (AlignedCodon codon : alignedCodons.keySet()) { final Map columnResidues = alignedCodons .get(codon); for (Entry entry : columnResidues.entrySet()) { // place translated codon at its column position in sequence entry.getKey().getSequence()[column] = entry.getValue().product .charAt(0); } column++; } return 0; } /** * Populate the map of aligned codons by traversing the given sequence * mapping, locating the aligned positions of mapped codons, and adding those * positions and their translation products to the map. 
* * @param dna * the aligned sequence we are mapping from * @param protein * the sequence to be aligned to the codons * @param gapChar * the gap character in the dna sequence * @param seqMap * a mapping to a sequence translation * @param alignedCodons * the map we are building up */ static void addCodonPositions(SequenceI dna, SequenceI protein, char gapChar, Mapping seqMap, Map> alignedCodons) { Iterator codons = seqMap.getCodonIterator(dna, gapChar); /* * add codon positions, and their peptide translations, to the alignment * map, while remembering the first codon mapped */ while (codons.hasNext()) { try { AlignedCodon codon = codons.next(); addCodonToMap(alignedCodons, codon, protein); } catch (IncompleteCodonException e) { // possible incomplete trailing codon - ignore } catch (NoSuchElementException e) { // possibly peptide lacking STOP } } } /** * Helper method to add a codon-to-peptide entry to the aligned codons map * * @param alignedCodons * @param codon * @param protein */ protected static void addCodonToMap( Map> alignedCodons, AlignedCodon codon, SequenceI protein) { Map seqProduct = alignedCodons.get(codon); if (seqProduct == null) { seqProduct = new HashMap(); alignedCodons.put(codon, seqProduct); } seqProduct.put(protein, codon); } /** * Returns true if a cDNA/Protein mapping either exists, or could be made, * between at least one pair of sequences in the two alignments. Currently, * the logic is: *

   * <ul>
   * <li>One alignment must be nucleotide, and the other protein</li>
   * <li>At least one pair of sequences must be already mapped, or mappable</li>
   * <li>Mappable means the nucleotide translation matches the protein sequence</li>
   * <li>The translation may ignore start and stop codons if present in the
   * nucleotide</li>
   * </ul>

* * @param al1 * @param al2 * @return */ public static boolean isMappable(AlignmentI al1, AlignmentI al2) { if (al1 == null || al2 == null) { return false; } /* * Require one nucleotide and one protein */ if (al1.isNucleotide() == al2.isNucleotide()) { return false; } AlignmentI dna = al1.isNucleotide() ? al1 : al2; AlignmentI protein = dna == al1 ? al2 : al1; List mappings = protein.getCodonFrames(); for (SequenceI dnaSeq : dna.getSequences()) { for (SequenceI proteinSeq : protein.getSequences()) { if (isMappable(dnaSeq, proteinSeq, mappings)) { return true; } } } return false; } /** * Returns true if the dna sequence is mapped, or could be mapped, to the * protein sequence. * * @param dnaSeq * @param proteinSeq * @param mappings * @return */ protected static boolean isMappable(SequenceI dnaSeq, SequenceI proteinSeq, List mappings) { if (dnaSeq == null || proteinSeq == null) { return false; } SequenceI dnaDs = dnaSeq.getDatasetSequence() == null ? dnaSeq : dnaSeq .getDatasetSequence(); SequenceI proteinDs = proteinSeq.getDatasetSequence() == null ? proteinSeq : proteinSeq.getDatasetSequence(); for (AlignedCodonFrame mapping : mappings) { if (proteinDs == mapping.getAaForDnaSeq(dnaDs)) { /* * already mapped */ return true; } } /* * Just try to make a mapping (it is not yet stored), test whether * successful. */ return mapCdnaToProtein(proteinDs, dnaDs) != null; } /** * Finds any reference annotations associated with the sequences in * sequenceScope, that are not already added to the alignment, and adds them * to the 'candidates' map. Also populates a lookup table of annotation * labels, keyed by calcId, for use in constructing tooltips or the like. 
* * @param sequenceScope * the sequences to scan for reference annotations * @param labelForCalcId * (optional) map to populate with label for calcId * @param candidates * map to populate with annotations for sequence * @param al * the alignment to check for presence of annotations */ public static void findAddableReferenceAnnotations( List sequenceScope, Map labelForCalcId, final Map> candidates, AlignmentI al) { if (sequenceScope == null) { return; } /* * For each sequence in scope, make a list of any annotations on the * underlying dataset sequence which are not already on the alignment. * * Add to a map of { alignmentSequence, } */ for (SequenceI seq : sequenceScope) { SequenceI dataset = seq.getDatasetSequence(); if (dataset == null) { continue; } AlignmentAnnotation[] datasetAnnotations = dataset.getAnnotation(); if (datasetAnnotations == null) { continue; } final List result = new ArrayList(); for (AlignmentAnnotation dsann : datasetAnnotations) { /* * Find matching annotations on the alignment. If none is found, then * add this annotation to the list of 'addable' annotations for this * sequence. */ final Iterable matchedAlignmentAnnotations = al .findAnnotations(seq, dsann.getCalcId(), dsann.label); if (!matchedAlignmentAnnotations.iterator().hasNext()) { result.add(dsann); if (labelForCalcId != null) { labelForCalcId.put(dsann.getCalcId(), dsann.label); } } } /* * Save any addable annotations for this sequence */ if (!result.isEmpty()) { candidates.put(seq, result); } } } /** * Adds annotations to the top of the alignment annotations, in the same order * as their related sequences. 
* * @param annotations * the annotations to add * @param alignment * the alignment to add them to * @param selectionGroup * current selection group (or null if none) */ public static void addReferenceAnnotations( Map> annotations, final AlignmentI alignment, final SequenceGroup selectionGroup) { for (SequenceI seq : annotations.keySet()) { for (AlignmentAnnotation ann : annotations.get(seq)) { AlignmentAnnotation copyAnn = new AlignmentAnnotation(ann); int startRes = 0; int endRes = ann.annotations.length; if (selectionGroup != null) { startRes = selectionGroup.getStartRes(); endRes = selectionGroup.getEndRes(); } copyAnn.restrict(startRes, endRes); /* * Add to the sequence (sets copyAnn.datasetSequence), unless the * original annotation is already on the sequence. */ if (!seq.hasAnnotation(ann)) { seq.addAlignmentAnnotation(copyAnn); } // adjust for gaps copyAnn.adjustForAlignment(); // add to the alignment and set visible alignment.addAnnotation(copyAnn); copyAnn.visible = true; } } } /** * Set visibility of alignment annotations of specified types (labels), for * specified sequences. This supports controls like * "Show all secondary structure", "Hide all Temp factor", etc. 
* * @al the alignment to scan for annotations * @param types * the types (labels) of annotations to be updated * @param forSequences * if not null, only annotations linked to one of these sequences are * in scope for update; if null, acts on all sequence annotations * @param anyType * if this flag is true, 'types' is ignored (label not checked) * @param doShow * if true, set visibility on, else set off */ public static void showOrHideSequenceAnnotations(AlignmentI al, Collection types, List forSequences, boolean anyType, boolean doShow) { for (AlignmentAnnotation aa : al.getAlignmentAnnotation()) { if (anyType || types.contains(aa.label)) { if ((aa.sequenceRef != null) && (forSequences == null || forSequences .contains(aa.sequenceRef))) { aa.visible = doShow; } } } } /** * Returns true if either sequence has a cross-reference to the other * * @param seq1 * @param seq2 * @return */ public static boolean haveCrossRef(SequenceI seq1, SequenceI seq2) { // Note: moved here from class CrossRef as the latter class has dependencies // not availability to the applet's classpath return hasCrossRef(seq1, seq2) || hasCrossRef(seq2, seq1); } /** * Returns true if seq1 has a cross-reference to seq2. Currently this assumes * that sequence name is structured as Source|AccessionId. * * @param seq1 * @param seq2 * @return */ public static boolean hasCrossRef(SequenceI seq1, SequenceI seq2) { if (seq1 == null || seq2 == null) { return false; } String name = seq2.getName(); final DBRefEntry[] xrefs = seq1.getDBRefs(); if (xrefs != null) { for (DBRefEntry xref : xrefs) { String xrefName = xref.getSource() + "|" + xref.getAccessionId(); // case-insensitive test, consistent with DBRefEntry.equalRef() if (xrefName.equalsIgnoreCase(name)) { return true; } } } return false; } /** * Constructs an alignment consisting of the mapped (CDS) regions in the given * nucleotide sequences, and updates mappings to match. 
The CDS sequences are * added to the original alignment's dataset, which is shared by the new * alignment. Mappings from nucleotide to CDS, and from CDS to protein, are * added to the alignment dataset. * * @param dna * aligned dna sequences * @param mappings * from dna to protein * @param al * @return an alignment whose sequences are the cds-only parts of the dna * sequences (or null if no mappings are found) */ public static AlignmentI makeCdsAlignment(SequenceI[] dna, List mappings, AlignmentI al) { List cdsSeqs = new ArrayList(); for (SequenceI seq : dna) { AlignedCodonFrame cdsMappings = new AlignedCodonFrame(); List seqMappings = MappingUtils .findMappingsForSequence(seq, mappings); List alignmentMappings = al.getCodonFrames(); for (AlignedCodonFrame mapping : seqMappings) { for (Mapping aMapping : mapping.getMappingsFromSequence(seq)) { SequenceI cdsSeq = makeCdsSequence(seq.getDatasetSequence(), aMapping); cdsSeqs.add(cdsSeq); /* * add a mapping from CDS to the (unchanged) mapped to range */ List cdsRange = Collections.singletonList(new int[] { 1, cdsSeq.getLength() }); MapList map = new MapList(cdsRange, aMapping.getMap() .getToRanges(), aMapping.getMap().getFromRatio(), aMapping.getMap().getToRatio()); cdsMappings.addMap(cdsSeq, aMapping.getTo(), map); /* * add another mapping from original 'from' range to CDS */ map = new MapList(aMapping.getMap().getFromRanges(), cdsRange, 1, 1); cdsMappings.addMap(seq.getDatasetSequence(), cdsSeq, map); alignmentMappings.add(cdsMappings); /* * transfer any features on dna that overlap the CDS */ transferFeatures(seq, cdsSeq, map, null, SequenceOntologyI.CDS); } } } /* * add CDS seqs to shared dataset */ Alignment dataset = al.getDataset(); for (SequenceI seq : cdsSeqs) { if (!dataset.getSequences().contains(seq.getDatasetSequence())) { dataset.addSequence(seq.getDatasetSequence()); } } AlignmentI cds = new Alignment(cdsSeqs.toArray(new SequenceI[cdsSeqs .size()])); cds.setDataset(dataset); return cds; } /** * Helper 
method that makes a CDS sequence as defined by the mappings from the * given sequence i.e. extracts the 'mapped from' ranges (which may be on * forward or reverse strand). * * @param seq * @param mapping * @return */ static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping) { char[] seqChars = seq.getSequence(); List fromRanges = mapping.getMap().getFromRanges(); int cdsWidth = MappingUtils.getLength(fromRanges); char[] newSeqChars = new char[cdsWidth]; int newPos = 0; for (int[] range : fromRanges) { if (range[0] <= range[1]) { // forward strand mapping - just copy the range int length = range[1] - range[0] + 1; System.arraycopy(seqChars, range[0] - 1, newSeqChars, newPos, length); newPos += length; } else { // reverse strand mapping - copy and complement one by one for (int i = range[0]; i >= range[1]; i--) { newSeqChars[newPos++] = Dna.getComplement(seqChars[i - 1]); } } } SequenceI newSeq = new Sequence(seq.getName() + "|" + mapping.getTo().getName(), newSeqChars, 1, newPos); newSeq.createDatasetSequence(); return newSeq; } /** * Transfers co-located features on 'fromSeq' to 'toSeq', adjusting the * feature start/end ranges, optionally omitting specified feature types. * Returns the number of features copied. * * @param fromSeq * @param toSeq * @param select * if not null, only features of this type are copied (including * subtypes in the Sequence Ontology) * @param mapping * the mapping from 'fromSeq' to 'toSeq' * @param omitting */ public static int transferFeatures(SequenceI fromSeq, SequenceI toSeq, MapList mapping, String select, String... 
omitting) { SequenceI copyTo = toSeq; while (copyTo.getDatasetSequence() != null) { copyTo = copyTo.getDatasetSequence(); } SequenceOntologyI so = SequenceOntologyFactory.getInstance(); int count = 0; SequenceFeature[] sfs = fromSeq.getSequenceFeatures(); if (sfs != null) { for (SequenceFeature sf : sfs) { String type = sf.getType(); if (select != null && !so.isA(type, select)) { continue; } boolean omit = false; for (String toOmit : omitting) { if (type.equals(toOmit)) { omit = true; } } if (omit) { continue; } /* * locate the mapped range - null if either start or end is * not mapped (no partial overlaps are calculated) */ int start = sf.getBegin(); int end = sf.getEnd(); int[] mappedTo = mapping.locateInTo(start, end); /* * if whole exon range doesn't map, try interpreting it * as 5' or 3' exon overlapping the CDS range */ if (mappedTo == null) { mappedTo = mapping.locateInTo(end, end); if (mappedTo != null) { /* * end of exon is in CDS range - 5' overlap * to a range from the start of the peptide */ mappedTo[0] = 1; } } if (mappedTo == null) { mappedTo = mapping.locateInTo(start, start); if (mappedTo != null) { /* * start of exon is in CDS range - 3' overlap * to a range up to the end of the peptide */ mappedTo[1] = toSeq.getLength(); } } if (mappedTo != null) { SequenceFeature copy = new SequenceFeature(sf); copy.setBegin(Math.min(mappedTo[0], mappedTo[1])); copy.setEnd(Math.max(mappedTo[0], mappedTo[1])); copyTo.addSequenceFeature(copy); count++; } } } return count; } /** * Returns a mapping from dna to protein by inspecting sequence features of * type "CDS" on the dna. 
* * @param dnaSeq * @param proteinSeq * @return */ public static MapList mapCdsToProtein(SequenceI dnaSeq, SequenceI proteinSeq) { List ranges = findCdsPositions(dnaSeq); int mappedDnaLength = MappingUtils.getLength(ranges); int proteinLength = proteinSeq.getLength(); int proteinStart = proteinSeq.getStart(); int proteinEnd = proteinSeq.getEnd(); /* * incomplete start codon may mean X at start of peptide * we ignore both for mapping purposes */ if (proteinSeq.getCharAt(0) == 'X') { // todo JAL-2022 support startPhase > 0 proteinStart++; proteinLength--; } List proteinRange = new ArrayList(); /* * dna length should map to protein (or protein plus stop codon) */ int codesForResidues = mappedDnaLength / 3; if (codesForResidues == (proteinLength + 1)) { // assuming extra codon is for STOP and not in peptide codesForResidues--; } if (codesForResidues == proteinLength) { proteinRange.add(new int[] { proteinStart, proteinEnd }); return new MapList(ranges, proteinRange, 3, 1); } return null; } /** * Returns a list of CDS ranges found (as sequence positions base 1), i.e. of * start/end positions of sequence features of type "CDS" (or a sub-type of * CDS in the Sequence Ontology). The ranges are sorted into ascending start * position order, so this method is only valid for linear CDS in the same * sense as the protein product. 
* * @param dnaSeq * @return */ public static List findCdsPositions(SequenceI dnaSeq) { List result = new ArrayList(); SequenceFeature[] sfs = dnaSeq.getSequenceFeatures(); if (sfs == null) { return result; } SequenceOntologyI so = SequenceOntologyFactory.getInstance(); int startPhase = 0; for (SequenceFeature sf : sfs) { /* * process a CDS feature (or a sub-type of CDS) */ if (so.isA(sf.getType(), SequenceOntologyI.CDS)) { int phase = 0; try { phase = Integer.parseInt(sf.getPhase()); } catch (NumberFormatException e) { // ignore } /* * phase > 0 on first codon means 5' incomplete - skip to the start * of the next codon; example ENST00000496384 */ int begin = sf.getBegin(); int end = sf.getEnd(); if (result.isEmpty()) { begin += phase; if (begin > end) { // shouldn't happen! System.err .println("Error: start phase extends beyond start CDS in " + dnaSeq.getName()); } } result.add(new int[] { begin, end }); } } /* * remove 'startPhase' positions (usually 0) from the first range * so we begin at the start of a complete codon */ if (!result.isEmpty()) { // TODO JAL-2022 correctly model start phase > 0 result.get(0)[0] += startPhase; } /* * Finally sort ranges by start position. This avoids a dependency on * keeping features in order on the sequence (if they are in order anyway, * the sort will have almost no work to do). The implicit assumption is CDS * ranges are assembled in order. Other cases should not use this method, * but instead construct an explicit mapping for CDS (e.g. EMBL parsing). */ Collections.sort(result, new Comparator() { @Override public int compare(int[] o1, int[] o2) { return Integer.compare(o1[0], o2[0]); } }); return result; } /** * Maps exon features from dna to protein, and computes variants in peptide * product generated by variants in dna, and adds them as sequence_variant * features on the protein sequence. Returns the number of variant features * added. 
* * @param dnaSeq * @param peptide * @param dnaToProtein */ public static int computeProteinFeatures(SequenceI dnaSeq, SequenceI peptide, MapList dnaToProtein) { while (dnaSeq.getDatasetSequence() != null) { dnaSeq = dnaSeq.getDatasetSequence(); } while (peptide.getDatasetSequence() != null) { peptide = peptide.getDatasetSequence(); } transferFeatures(dnaSeq, peptide, dnaToProtein, SequenceOntologyI.EXON); /* * compute protein variants from dna variants and codon mappings; * NB - alternatively we could retrieve this using the REST service e.g. * http://rest.ensembl.org/overlap/translation * /ENSP00000288602?feature=transcript_variation;content-type=text/xml * which would be a bit slower but possibly more reliable */ /* * build a map with codon variations for each potentially varying peptide */ LinkedHashMap[]> variants = buildDnaVariantsMap( dnaSeq, dnaToProtein); /* * scan codon variations, compute peptide variants and add to peptide sequence */ int count = 0; for (Entry[]> variant : variants.entrySet()) { int peptidePos = variant.getKey(); List[] codonVariants = variant.getValue(); count += computePeptideVariants(peptide, peptidePos, codonVariants); } /* * sort to get sequence features in start position order * - would be better to store in Sequence as a TreeSet or NCList? */ Arrays.sort(peptide.getSequenceFeatures(), new Comparator() { @Override public int compare(SequenceFeature o1, SequenceFeature o2) { int c = Integer.compare(o1.getBegin(), o2.getBegin()); return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd()) : c; } }); return count; } /** * Computes non-synonymous peptide variants from codon variants and adds them * as sequence_variant features on the protein sequence (one feature per * allele variant). Selected attributes (variant id, clinical significance) * are copied over to the new features. 
* * @param peptide * the protein sequence * @param peptidePos * the position to compute peptide variants for * @param codonVariants * a list of dna variants per codon position * @return the number of features added */ static int computePeptideVariants(SequenceI peptide, int peptidePos, List[] codonVariants) { String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); int count = 0; String base1 = codonVariants[0].get(0).base; String base2 = codonVariants[1].get(0).base; String base3 = codonVariants[2].get(0).base; /* * variants in first codon base */ for (DnaVariant var : codonVariants[0]) { if (var.variant != null) { String alleles = (String) var.variant.getValue("alleles"); if (alleles != null) { for (String base : alleles.split(",")) { String codon = base + base2 + base3; if (addPeptideVariant(peptide, peptidePos, residue, var, codon)) { count++; } } } } } /* * variants in second codon base */ for (DnaVariant var : codonVariants[1]) { if (var.variant != null) { String alleles = (String) var.variant.getValue("alleles"); if (alleles != null) { for (String base : alleles.split(",")) { String codon = base1 + base + base3; if (addPeptideVariant(peptide, peptidePos, residue, var, codon)) { count++; } } } } } /* * variants in third codon base */ for (DnaVariant var : codonVariants[2]) { if (var.variant != null) { String alleles = (String) var.variant.getValue("alleles"); if (alleles != null) { for (String base : alleles.split(",")) { String codon = base1 + base2 + base; if (addPeptideVariant(peptide, peptidePos, residue, var, codon)) { count++; } } } } } return count; } /** * Helper method that adds a peptide variant feature, provided the given codon * translates to a value different to the current residue (is a non-synonymous * variant). ID and clinical_significance attributes of the dna variant (if * present) are copied to the new feature. 
* * @param peptide * @param peptidePos * @param residue * @param var * @param codon * @return true if a feature was added, else false */ static boolean addPeptideVariant(SequenceI peptide, int peptidePos, String residue, DnaVariant var, String codon) { /* * get peptide translation of codon e.g. GAT -> D * note that variants which are not single alleles, * e.g. multibase variants or HGMD_MUTATION etc * are currently ignored here */ String trans = codon.contains("-") ? "-" : (codon.length() > 3 ? null : ResidueProperties .codonTranslate(codon)); if (trans != null && !trans.equals(residue)) { String desc = residue + "->" + trans; // set score to 0f so 'graduated colour' option is offered! SequenceFeature sf = new SequenceFeature( SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos, peptidePos, 0f, null); String id = (String) var.variant.getValue(ID); if (id != null) { if (id.startsWith(SEQUENCE_VARIANT)) { id = id.substring(SEQUENCE_VARIANT.length()); } sf.setValue(ID, id); // TODO handle other species variants StringBuilder link = new StringBuilder(32); try { link.append(desc).append(" ").append(id) .append("|http://www.ensembl.org/Homo_sapiens/Variation/Summary?v=") .append(URLEncoder.encode(id, "UTF-8")); sf.addLink(link.toString()); } catch (UnsupportedEncodingException e) { // as if } } String clinSig = (String) var.variant .getValue(CLINICAL_SIGNIFICANCE); if (clinSig != null) { sf.setValue(CLINICAL_SIGNIFICANCE, clinSig); } peptide.addSequenceFeature(sf); return true; } return false; } /** * Builds a map whose key is position in the protein sequence, and value is a * list of the base and all variants for each corresponding codon position * * @param dnaSeq * @param dnaToProtein * @return */ static LinkedHashMap[]> buildDnaVariantsMap( SequenceI dnaSeq, MapList dnaToProtein) { /* * map from peptide position to all variants of the codon which codes for it * LinkedHashMap ensures we keep the peptide features in sequence order */ LinkedHashMap[]> variants = new 
LinkedHashMap[]>(); SequenceOntologyI so = SequenceOntologyFactory.getInstance(); SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures(); if (dnaFeatures == null) { return variants; } int dnaStart = dnaSeq.getStart(); int[] lastCodon = null; int lastPeptidePostion = 0; /* * build a map of codon variations for peptides */ for (SequenceFeature sf : dnaFeatures) { int dnaCol = sf.getBegin(); if (dnaCol != sf.getEnd()) { // not handling multi-locus variant features continue; } if (so.isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT)) { int[] mapsTo = dnaToProtein.locateInTo(dnaCol, dnaCol); if (mapsTo == null) { // feature doesn't lie within coding region continue; } int peptidePosition = mapsTo[0]; List[] codonVariants = variants.get(peptidePosition); if (codonVariants == null) { codonVariants = new ArrayList[3]; codonVariants[0] = new ArrayList(); codonVariants[1] = new ArrayList(); codonVariants[2] = new ArrayList(); variants.put(peptidePosition, codonVariants); } /* * extract dna variants to a string array */ String alls = (String) sf.getValue("alleles"); if (alls == null) { continue; } String[] alleles = alls.toUpperCase().split(","); int i = 0; for (String allele : alleles) { alleles[i++] = allele.trim(); // lose any space characters "A, G" } /* * get this peptide's codon positions e.g. [3, 4, 5] or [4, 7, 10] */ int[] codon = peptidePosition == lastPeptidePostion ? 
lastCodon : MappingUtils.flattenRanges(dnaToProtein.locateInFrom( peptidePosition, peptidePosition)); lastPeptidePostion = peptidePosition; lastCodon = codon; /* * save nucleotide (and any variant) for each codon position */ for (int codonPos = 0; codonPos < 3; codonPos++) { String nucleotide = String.valueOf( dnaSeq.getCharAt(codon[codonPos] - dnaStart)) .toUpperCase(); List codonVariant = codonVariants[codonPos]; if (codon[codonPos] == dnaCol) { if (!codonVariant.isEmpty() && codonVariant.get(0).variant == null) { /* * already recorded base value, add this variant */ codonVariant.get(0).variant = sf; } else { /* * add variant with base value */ codonVariant.add(new DnaVariant(nucleotide, sf)); } } else if (codonVariant.isEmpty()) { /* * record (possibly non-varying) base value */ codonVariant.add(new DnaVariant(nucleotide)); } } } } return variants; } /** * Makes an alignment with a copy of the given sequences, adding in any * non-redundant sequences which are mapped to by the cross-referenced * sequences. * * @param seqs * @param xrefs * @return */ public static AlignmentI makeCopyAlignment(SequenceI[] seqs, SequenceI[] xrefs) { AlignmentI copy = new Alignment(new Alignment(seqs)); SequenceIdMatcher matcher = new SequenceIdMatcher(seqs); if (xrefs != null) { for (SequenceI xref : xrefs) { DBRefEntry[] dbrefs = xref.getDBRefs(); if (dbrefs != null) { for (DBRefEntry dbref : dbrefs) { if (dbref.getMap() == null || dbref.getMap().getTo() == null) { continue; } SequenceI mappedTo = dbref.getMap().getTo(); SequenceI match = matcher.findIdMatch(mappedTo); if (match == null) { matcher.add(mappedTo); copy.addSequence(mappedTo); } } } } } return copy; } /** * Try to align sequences in 'unaligned' to match the alignment of their * mapped regions in 'aligned'. For example, could use this to align CDS * sequences which are mapped to their parent cDNA sequences. * * This method handles 1:1 mappings (dna-to-dna or protein-to-protein). 
For * dna-to-protein or protein-to-dna use alternative methods. * * @param unaligned * sequences to be aligned * @param aligned * holds aligned sequences and their mappings * @return */ public static int alignAs(AlignmentI unaligned, AlignmentI aligned) { List unmapped = new ArrayList(); Map> columnMap = buildMappedColumnsMap( unaligned, aligned, unmapped); int width = columnMap.size(); char gap = unaligned.getGapCharacter(); int realignedCount = 0; for (SequenceI seq : unaligned.getSequences()) { if (!unmapped.contains(seq)) { char[] newSeq = new char[width]; Arrays.fill(newSeq, gap); int newCol = 0; int lastCol = 0; /* * traverse the map to find columns populated * by our sequence */ for (Integer column : columnMap.keySet()) { Character c = columnMap.get(column).get(seq); if (c != null) { /* * sequence has a character at this position * */ newSeq[newCol] = c; lastCol = newCol; } newCol++; } /* * trim trailing gaps */ if (lastCol < width) { char[] tmp = new char[lastCol + 1]; System.arraycopy(newSeq, 0, tmp, 0, lastCol + 1); newSeq = tmp; } seq.setSequence(String.valueOf(newSeq)); realignedCount++; } } return realignedCount; } /** * Returns a map whose key is alignment column number (base 1), and whose * values are a map of sequence characters in that column. * * @param unaligned * @param aligned * @param unmapped * @return */ static Map> buildMappedColumnsMap( AlignmentI unaligned, AlignmentI aligned, List unmapped) { /* * Map will hold, for each aligned column position, a map of * {unalignedSequence, sequenceCharacter} at that position. * TreeMap keeps the entries in ascending column order. 
*/ Map> map = new TreeMap>(); /* * r any sequences that have no mapping so can't be realigned */ unmapped.addAll(unaligned.getSequences()); List mappings = aligned.getCodonFrames(); for (SequenceI seq : unaligned.getSequences()) { for (AlignedCodonFrame mapping : mappings) { SequenceI fromSeq = mapping.findAlignedSequence(seq, aligned); if (fromSeq != null) { Mapping seqMap = mapping.getMappingBetween(fromSeq, seq); if (addMappedPositions(seq, fromSeq, seqMap, map)) { unmapped.remove(seq); } } } } return map; } /** * Helper method that adds to a map the mapped column positions of a sequence.
* For example if aaTT-Tg-gAAA is mapped to TTTAAA then the map should record * that columns 3,4,6,10,11,12 map to characters T,T,T,A,A,A of the mapped to * sequence. * * @param seq * the sequence whose column positions we are recording * @param fromSeq * a sequence that is mapped to the first sequence * @param seqMap * the mapping from 'fromSeq' to 'seq' * @param map * a map to add the column positions (in fromSeq) of the mapped * positions of seq * @return */ static boolean addMappedPositions(SequenceI seq, SequenceI fromSeq, Mapping seqMap, Map> map) { if (seqMap == null) { return false; } char[] fromChars = fromSeq.getSequence(); int toStart = seq.getStart(); char[] toChars = seq.getSequence(); /* * traverse [start, end, start, end...] ranges in fromSeq */ for (int[] fromRange : seqMap.getMap().getFromRanges()) { for (int i = 0; i < fromRange.length - 1; i += 2) { boolean forward = fromRange[i + 1] >= fromRange[i]; /* * find the range mapped to (sequence positions base 1) */ int[] range = seqMap.locateMappedRange(fromRange[i], fromRange[i + 1]); if (range == null) { System.err.println("Error in mapping " + seqMap + " from " + fromSeq.getName()); return false; } int fromCol = fromSeq.findIndex(fromRange[i]); int mappedCharPos = range[0]; /* * walk over the 'from' aligned sequence in forward or reverse * direction; when a non-gap is found, record the column position * of the next character of the mapped-to sequence; stop when all * the characters of the range have been counted */ while (mappedCharPos <= range[1]) { if (!Comparison.isGap(fromChars[fromCol - 1])) { /* * mapped from sequence has a character in this column * record the column position for the mapped to character */ Map seqsMap = map.get(fromCol); if (seqsMap == null) { seqsMap = new HashMap(); map.put(fromCol, seqsMap); } seqsMap.put(seq, toChars[mappedCharPos - toStart]); mappedCharPos++; } fromCol += (forward ? 
1 : -1); } } } return true; } // strictly temporary hack until proper criteria for aligning protein to cds // are in place; this is so Ensembl -> fetch xrefs Uniprot aligns the Uniprot public static boolean looksLikeEnsembl(AlignmentI alignment) { for (SequenceI seq : alignment.getSequences()) { String name = seq.getName(); if (!name.startsWith("ENSG") && !name.startsWith("ENST")) { return false; } } return true; } }